In [26]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, utils
from typing_extensions import Literal
from sklearn.feature_extraction.text import CountVectorizer
from dataclasses import dataclass
from abc import abstractmethod
import nltk
from nltk.tokenize import word_tokenize
import string
from torch.nn.utils.rnn import pad_sequence

In [6]:
df = pd.read_csv('data/train.tsv', delimiter='\t')
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [7]:
cv = CountVectorizer()
tmp = df.loc[:5]
tmp

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2


In [72]:
tmp.loc[:, 'Phrase'] = tmp.loc[:, 'Phrase'].str.lower()
tmp

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2


In [58]:
cv.fit_transform(tmp.loc[:, 'Phrase'])

<6x24 sparse matrix of type '<class 'numpy.int64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [83]:
nltk.word_tokenize('hello, it\'s me. I was')

['hello', ',', 'it', "'s", 'me', '.', 'I', 'was']

In [9]:
class TextPreprocesser():
    def __init__(
        self,
    ):
        pass
    
    def __call__(self, corpus: pd.Series):
        corpus = corpus.str.lower()
        res_corpus = []
        for i, row in corpus.items():
            words = nltk.word_tokenize(row)
            res_corpus.append([word for word in words if word not in string.punctuation])
        
        return res_corpus

In [10]:
preprocesser = TextPreprocesser()
print(tmp.loc[:, 'Phrase'])
prepared = preprocesser(tmp.loc[:, 'Phrase'])
print(prepared)

0    A series of escapades demonstrating the adage ...
1    A series of escapades demonstrating the adage ...
2                                             A series
3                                                    A
4                                               series
5    of escapades demonstrating the adage that what...
Name: Phrase, dtype: object
[['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story'], ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose'], ['a', 'series'], ['a'], ['series'], ['of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose']]


In [11]:
class Vectorizer():
    @abstractmethod
    def fit(self):
        pass

    @abstractmethod
    def transform(self):
        pass

class IndexVectorizer(Vectorizer):
    def __init__(self):
        self.vocab = dict()
        self.vocab['<PAD>'] = 0
        
    def fit(self, corpus):
        cnt = len(self.vocab)
        for row in corpus:
            for word in row:
                if word not in self.vocab:
                    self.vocab[word] = cnt
                    cnt += 1

    def transform(self, corpus):
        res = []
        for row in corpus:
            cur = []
            for word in row:
                if word in self.vocab:
                    cur.append(self.vocab[word])
                else:
                    cur.append(0)
            res.append(cur)
        return pd.Series(res)


class CountVectorizer(Vectorizer): 
    def __init__(self):
        self.vectorizer = CountVectorizer()

    def fit(self, df, column_w_features):
        self.vectorizer.fit(df.loc[:, column_w_features].str)
        
    def transform(self, df, column_w_features):
        return self.vectorizer.transform(df.loc[:, column_w_features])
        

In [22]:
vect = IndexVectorizer()
vect.fit(prepared)
vectorized = vect.transform(prepared)
vectorized

0    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 13,...
1       [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 13]
2                                               [1, 2]
3                                                  [1]
4                                                  [2]
5             [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 13]
dtype: object

In [28]:
MAX_SEQUENCE_LENGTH = vectorized.apply(len).max()
vectorized = vectorized.apply(lambda x: torch.tensor(x))
vectorized

0    [tensor(1), tensor(2), tensor(3), tensor(4), t...
1    [tensor(1), tensor(2), tensor(3), tensor(4), t...
2                               [tensor(1), tensor(2)]
3                                          [tensor(1)]
4                                          [tensor(2)]
5    [tensor(3), tensor(4), tensor(5), tensor(6), t...
dtype: object

In [29]:
padded_sequences = pad_sequence(vectorized, batch_first=True, padding_value=0)
padded_sequences

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  6, 13, 10, 14, 11, 12,
          6, 15, 16,  3, 17, 18, 19, 20, 21,  3, 17, 22, 23, 24,  3,  1, 25],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  6, 13,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  6, 13,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [20]:
import torch
import torch.nn as nn


# Sample data: list of sequences with different lengths
sequences = [
    torch.tensor([1, 2, 3]),
    torch.tensor([4, 5]),
    torch.tensor([6, 7, 8, 9])
]
# print(f'Input shape: {sequences.shape}')

# Pad the sequences
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
print(f'Output shape: {padded_sequences.shape}')
print("Padded sequences:")
print(padded_sequences)

Output shape: torch.Size([3, 4])
Padded sequences:
tensor([[1, 2, 3, 0],
        [4, 5, 0, 0],
        [6, 7, 8, 9]])


In [None]:
class SentimentDataset(Dataset):
    def __init__(self, path: str, vectorizer: Vectorizer, inference_mode: bool = False):
        self.inference_mode = inference_mode
        self.df = pd.read_csv(path, delimiter='\t')
        self.label_column = 'Sentiment'
        self.vectorizer = vectorizer
        print(self.df)

    def __len__(self):
        return len(self.df)

    def transform_features(self, features):
        features_transformed = self.vectorizer()
        return torch.tensor(features.values)

    def transform_label(self, label):
        return torch.tensor(label.values)

    def __getitem__(self, idx: int):
        if not self.inference_mode:
            return self.transform_features(self.df.iloc[idx].drop(self.label_column)), self.transform_label(self.df.iloc[idx, self.label_column])
        else:
            return self.transform_features(self.df.iloc[idx])

In [28]:
BATCH_SIZE = 32

full_train_dataset = SentimentDataset('data/train.tsv')
test_dataset =  SentimentDataset('data/test.tsv')
# Define the sizes of your splits
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(
    full_train_dataset, [train_size, val_size]
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
full_train_dataset[5]

        PhraseId  SentenceId  \
0              1           1   
1              2           1   
2              3           1   
3              4           1   
4              5           1   
...          ...         ...   
156055    156056        8544   
156056    156057        8544   
156057    156058        8544   
156058    156059        8544   
156059    156060        8544   

                                                   Phrase  Sentiment  
0       A series of escapades demonstrating the adage ...          1  
1       A series of escapades demonstrating the adage ...          2  
2                                                A series          2  
3                                                       A          2  
4                                                  series          2  
...                                                   ...        ...  
156055                                          Hearst 's          2  
156056                          forced avuncula

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class KaggleDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform

        # Preprocessing
        le = LabelEncoder()
        self.data['target'] = le.fit_transform(self.data['target'])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        features = torch.tensor(self.data.iloc[idx, :-1].values, dtype=torch.float32)
        label = torch.tensor(self.data.iloc[idx, -1], dtype=torch.long)
        
        if self.transform:
            features = self.transform(features)
        
        return features, label

# Create the full dataset
full_dataset = KaggleDataset('path_to_your_csv_file.csv')

# Define the sizes of your splits
train_size = int(0.7 * len(full_dataset))
val_size = int(0.15 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, [train_size, val_size, test_size]
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)