In [258]:
import os
import pandas as pd
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from PIL import Image
from torch.utils.data import DataLoader
le = LabelEncoder()
nlp = spacy.load("en_core_web_sm")

In [259]:
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
import re
import nltk
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
def clean_text(text):
  text = re.sub('www.\S+', ' ', text)
  text = re.sub('https?.\S+', ' ', text)
  text = re.sub('[^A-Za-z@]+', ' ', text)
  text = [word.lower() for word in text.split(' ') if word not in stop_words]
  return ' '.join(text)

In [260]:
class Vocabulary:
  def __init__(self, freq_threshold, sequence_len):
    self.itos = {0:'<PAD>', 1:'<SOS>', 2:'<EOS>', 3:'<UNK>'}
    self.stoi = {v:k for k, v in self.itos.items()}
    self.freq_threshold = freq_threshold
    self.sequence_len = sequence_len
  
  def __len__(self):
    return len(self.itos)
  
  @staticmethod
  def tokenizer_eng(text):
    return [token.text.lower() for token in nlp.tokenizer(text)]
  
  def build_vocabulary(self, sentence_list):
    frequencies = {}
    idx = 4
    for sentence in sentence_list:
      for word in self.tokenizer_eng(sentence):
        if word in frequencies:
          frequencies[word] += 1
        else: frequencies[word] = 1
        if frequencies[word] == self.freq_threshold:
          self.stoi[word] = idx
          self.itos[idx] = word
          idx += 1
  
  def numericalize(self, text, padding=False):
    pad_idx = self.stoi['<PAD>']
    tokenized_text = []
    for token in self.tokenizer_eng(text):
      if token in self.stoi:
        tokenized_text.append(self.stoi[token])
      else:
        tokenized_text.append(self.stoi['<UNK>'])
    if padding:
      tokenized_text = [pad_idx]*(self.sequence_len - len(tokenized_text)) + tokenized_text
    return tokenized_text[-self.sequence_len:]
  
    
class TweetDataset(Dataset):
  def __init__(self, root_dir, sequence_len, freq_threshold=5):
    self.df = pd.read_csv(root_dir).iloc[:, 2:].dropna()
    self.df.columns = ['sentiment', 'text']
    self.df['text'] = self.df['text'].apply(clean_text)
    self.le = LabelEncoder()
    self.df['sentiment'] = le.fit_transform(self.df['sentiment'])
    self.vocab = Vocabulary(freq_threshold, sequence_len)
    self.vocab.build_vocabulary(self.df['text'])

  def __len__(self):
    return len(self.df)
  
  def __getitem__(self, index):
    sentiment = self.df['sentiment'][index]
    text = self.df['text'][index]
    numericalized_text = [self.vocab.stoi["<SOS>"]]
    numericalized_text += self.vocab.numericalize(text, padding=True)
    numericalized_text.append(self.vocab.stoi["<EOS>"])
    return torch.tensor(numericalized_text), sentiment

dataset = TweetDataset('../data/twitter_training.csv', sequence_len=10)
dataset.df.head()

Unnamed: 0,sentiment,text
0,3,im getting borderlands murder
1,3,i coming borders i kill
2,3,im getting borderlands kill
3,3,im coming borderlands murder
4,3,im getting borderlands murder


In [261]:
len(dataset)

73996

In [264]:
total_size = len(dataset)
train_size = int(total_size*0.9)
train_data, test_data = random_split(dataset, [train_size, total_size-train_size])
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=True)
text, sentiments = next(iter(train_loader))

In [None]:
import torch.nn as nn
class Model(nn.Module):
  def __init__(self, input_size, output_size, hidden_size) -> None:
    super(Model).__init__()
    self.hidden_size = hidden_size
    self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)
  
  def forward(self, x):
    batch_size = x.size(0)
    hidden = self.init_hidden(batch_size)
    output, hidden = self.rnn(x, hidden)
    output = output.contiguous.view(-1, self.hidden_size)
    output = self.fc(output)
    return output, hidden
  
  def init_hidden(self, batch_size):
    hidden = torch.zeros(batch_size, self.hidden_size)
    return hidden

model = Model()


