# Imports

In [None]:
pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torchtext
from torchtext.data.utils import get_tokenizer


In [None]:
from collections import Counter

In [None]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# The resulting object is what the `w2v` helper further down queries for token vectors.
w2v_model = api.load("word2vec-google-news-300")



In [None]:
import numpy as np
import math
import pandas as pd
import torch
import torch.nn as nn
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Loading the dataset

In [None]:
# Read the labelled reviews (columns: text, label) and preview the first rows.
csv_path = 'restuarents.csv'
data = pd.read_csv(csv_path, encoding='ISO-8859-1')
print(data.head())

                                                text  label
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


# Defining the TextPreprocessor class

In [None]:
class TextPreprocessor:
    """Normalise raw text: lowercase, strip URLs/HTML tags/punctuation, drop stop words.

    NOTE(review): the English stop-word list contains negations such as "not",
    so "Crust is not good." becomes "crust good". For sentiment data this may
    discard the signal that matters most — confirm this is intended.
    """

    # Compiled once instead of on every call to process_text.
    _URL_PATTERN = re.compile(r'http\S+')
    _TAG_PATTERN = re.compile(r'<.*?>')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuations = set(string.punctuation)

    def process_text(self, text):
        """Return the cleaned, space-joined tokens of ``text``.

        Missing values (None/NaN) become an empty string and other non-string
        values are converted with ``str`` instead of raising AttributeError.
        """
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        text = text.lower()
        text = self._URL_PATTERN.sub('', text)
        text = self._TAG_PATTERN.sub('', text)
        text = ''.join(char for char in text if char not in self.punctuations)
        tokens = [token for token in word_tokenize(text) if token not in self.stop_words]

        return ' '.join(tokens)

    def process_column(self, df, column_name):
        """Return a copy of ``df`` whose ``column_name`` column has been cleaned.

        The input frame is left untouched, so re-running the calling cell always
        starts from the raw text rather than from already-processed text.
        """
        processed = df.copy()
        processed[column_name] = processed[column_name].apply(self.process_text)

        return processed

# Applying the preprocessor to the dataset

In [None]:
# Clean the review text column and preview the result.
preprocessor = TextPreprocessor()
df = preprocessor.process_column(df=data, column_name='text')
print(df.head())

                                                text  label
0                                    wow loved place      1
1                                         crust good      0
2                                tasty texture nasty      0
3  stopped late may bank holiday rick steve recom...      1
4                        selection menu great prices      1


In [None]:
def w2v(sentence, max_len=10, model=None):
    """Embed a whitespace-tokenised sentence as a fixed-size float32 tensor.

    Args:
        sentence: Text to embed; it is split on whitespace.
        max_len: Number of token rows in the result. Longer sentences are
            truncated, shorter ones are padded with the "<EOS>" token.
        model: Word-vector lookup supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``. Defaults to the
            module-level ``w2v_model``.

    Returns:
        A ``torch.float32`` tensor of shape ``(max_len, model.vector_size)``.
        Tokens missing from the model (including padding, when "<EOS>" is not
        in the model) are represented by all-zero rows.
    """
    if model is None:
        model = w2v_model

    tokens = sentence.split()[:max_len]
    tokens += ["<EOS>"] * (max_len - len(tokens))

    # Fill one preallocated float32 matrix rather than stacking per-token tensors.
    matrix = np.zeros((max_len, model.vector_size), dtype=np.float32)
    for row, token in enumerate(tokens):
        if token in model:
            matrix[row] = model[token]

    return torch.from_numpy(matrix)

In [None]:
# Sanity check: short and long sentences alike should embed to the same shape.
t = w2v("I am a good student")
g = w2v("Hey are you going to the zoo tomorrow morning with me and my friends")
y = w2v("you are a fool")
f = w2v("I quit")

for embedded in (t, g, y, f):
    print(embedded.shape)

torch.Size([10, 300])
torch.Size([10, 300])
torch.Size([10, 300])
torch.Size([10, 300])


# Defining the Datasethelper class (a Dataset subclass)

In [None]:
class Datasethelper(Dataset):
  """Dataset yielding (embedded sentence, label) pairs from a DataFrame.

  Sentences are read from the ``text`` column and targets from the ``label``
  column. Each sentence is embedded on access by the module-level ``w2v``.
  """

  def __init__(self, df):
    super().__init__()
    self.labels = df['label'].values
    self.data = df['text'].values

  def __len__(self):
    # One sample per stored sentence.
    return len(self.data)

  def __getitem__(self, index):
    # Embedding happens here, on access, rather than up front in __init__.
    features = w2v(self.data[index])
    target = torch.tensor(self.labels[index], dtype=torch.float32)
    return features, target

# Splitting the dataset and wrapping it in DataLoaders

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

train_helper, test_helper = Datasethelper(train_df), Datasethelper(test_df)

# Only the training batches are shuffled; evaluation order stays fixed.
train_dloader = DataLoader(train_helper, shuffle=True, batch_size=12)
test_dloader = DataLoader(test_helper, shuffle=False, batch_size=12)

# Checking the shapes of the DataLoader batches

In [None]:
# Pull a single batch and report its feature/label shapes and feature dtype.
x, y = next(iter(train_dloader))
for description in (x.shape, y.shape, x.dtype):
  print(description)

torch.Size([12, 10, 300])
torch.Size([12])
torch.float32


# Defining the Attention class

In [None]:
class Attention(nn.Module):
    """Soft attention pooling: score each step, softmax over dim 1, weighted sum.

    Assumes ``encoder_outputs`` is laid out as (batch, seq_len, hidden_dim),
    which is what the ``batch_first=True`` RNNs in this notebook produce.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # One scalar score per position.
        self.linear = nn.Linear(hidden_dim, 1)
        # Normalise the scores across dimension 1.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_outputs):
        """Return ``(context_vector, weights)`` for the given encoder outputs."""
        weights = self.softmax(self.linear(encoder_outputs))
        # The trailing size-1 axis of ``weights`` broadcasts over the hidden units.
        context_vector = (encoder_outputs * weights).sum(dim=1)
        return context_vector, weights




# Defining the RNN class that uses attention

In [None]:
class RNNWithAttention(nn.Module):
    """RNN encoder, attention pooling, linear layer and sigmoid, in that order.

    ``forward`` returns the sigmoid outputs together with the attention weights.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # The final hidden state is discarded; attention pools the per-step outputs.
        step_outputs, _ = self.rnn(x)
        pooled, attention_weights = self.attention(step_outputs)
        probabilities = self.sigmoid(self.fc(pooled))
        return probabilities, attention_weights

# Initializing the model

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
model = RNNWithAttention(
    input_dim=300,   # width of the word2vec vectors fed to the RNN
    hidden_dim=64,
    num_layers=1,
    output_dim=1,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training

In [None]:
# Train for ten epochs and report the sample-weighted training accuracy of each.
model.train()
for epoch in range(10):
    overall_accuracy = 0
    for x, y in train_dloader:
        batch_size = x.shape[0]
        predicted_y, _ = model(x)
        # Drop only the trailing output axis: a bare squeeze() would also collapse
        # the batch axis for a one-sample batch and break the loss shape check.
        probabilities = predicted_y.squeeze(-1)
        loss = criterion(probabilities, y.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        y_true = y.detach().numpy()
        # Threshold the flattened probabilities so predictions match the label shape.
        y_pred = (probabilities.detach().numpy() > 0.5).astype(int)
        accuracy = accuracy_score(y_true, y_pred)
        # Weight by batch size because the last batch may be smaller.
        overall_accuracy += accuracy * batch_size

    print(f'Epoch: {epoch} --> Accuracy {overall_accuracy/len(train_helper)}')

Epoch: 0 --> Accuracy 0.7068493150684931
Epoch: 1 --> Accuracy 0.8136986301369863
Epoch: 2 --> Accuracy 0.8438356164383561
Epoch: 3 --> Accuracy 0.8616438356164383
Epoch: 4 --> Accuracy 0.8753424657534247
Epoch: 5 --> Accuracy 0.8863013698630137
Epoch: 6 --> Accuracy 0.9095890410958904
Epoch: 7 --> Accuracy 0.9273972602739726
Epoch: 8 --> Accuracy 0.9356164383561644
Epoch: 9 --> Accuracy 0.9383561643835616


# Testing

In [None]:
# Evaluate on the held-out split: eval mode, no gradient tracking.
model.eval()
overall_accuracy = 0
with torch.no_grad():
    for x, y in test_dloader:
        predicted_y, _ = model(x)
        batch_size = x.shape[0]
        y_true = y.numpy()
        # Flatten the (batch, 1) outputs so predictions match the label shape.
        y_pred = (predicted_y.squeeze(-1).numpy() > 0.5).astype(int)
        accuracy = accuracy_score(y_true, y_pred)
        # Weight by batch size because the last batch may be smaller.
        overall_accuracy += accuracy * batch_size
test_accuracy = overall_accuracy / len(test_helper)
print(f'Test Accuracy: {test_accuracy}')

Test Accuracy: 0.8087431693989071


=====================================

=====================================

In [None]:
class RNNWithAttention2(nn.Module):
    """RNN classifier with the attention pooling written inline.

    The per-step RNN outputs are scored by ``linear``, normalised with a
    softmax over dimension 1, summed into one context vector, and passed
    through ``fc`` and a sigmoid.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers, batch_first=True)
        # Attention scorer: one scalar per position.
        self.linear = nn.Linear(hidden_dim, 1)
        self.softmax = nn.Softmax(dim=1)
        # Classification head.
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``(sigmoid outputs, attention weights)`` for input ``x``."""
        step_outputs, _ = self.rnn(x)
        weights = self.softmax(self.linear(step_outputs))
        # The size-1 last axis of ``weights`` broadcasts across the hidden units.
        context_vector = (step_outputs * weights).sum(dim=1)
        probabilities = self.sigmoid(self.fc(context_vector))
        return probabilities, weights


In [None]:
# Same hyperparameters as the first model, for the inline-attention variant.
criterion2 = nn.BCELoss()
model2 = RNNWithAttention2(
    input_dim=300,
    hidden_dim=64,
    num_layers=1,
    output_dim=1,
)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.01)

In [None]:
# Train model2 for ten epochs and report the sample-weighted training accuracy.
model2.train()
for epoch in range(10):
    overall_accuracy = 0
    for x, y in train_dloader:
        batch_size = x.shape[0]
        predicted_y, _ = model2(x)
        # Drop only the trailing output axis: a bare squeeze() would also collapse
        # the batch axis for a one-sample batch and break the loss shape check.
        probabilities = predicted_y.squeeze(-1)
        loss = criterion2(probabilities, y.float())
        optimizer2.zero_grad()
        loss.backward()
        optimizer2.step()
        y_true = y.detach().numpy()
        # Threshold the flattened probabilities so predictions match the label shape.
        y_pred = (probabilities.detach().numpy() > 0.5).astype(int)
        accuracy = accuracy_score(y_true, y_pred)
        # Weight by batch size because the last batch may be smaller.
        overall_accuracy += accuracy * batch_size

    print(f'Epoch: {epoch} --> Accuracy {overall_accuracy/len(train_helper)}')

Epoch: 0 --> Accuracy 0.7232876712328767
Epoch: 1 --> Accuracy 0.8054794520547945
Epoch: 2 --> Accuracy 0.852054794520548
Epoch: 3 --> Accuracy 0.8794520547945206
Epoch: 4 --> Accuracy 0.8808219178082192
Epoch: 5 --> Accuracy 0.9013698630136986
Epoch: 6 --> Accuracy 0.9205479452054794
Epoch: 7 --> Accuracy 0.9465753424657535
Epoch: 8 --> Accuracy 0.9465753424657535
Epoch: 9 --> Accuracy 0.9602739726027397


In [None]:
# Evaluate model2 on the held-out split: eval mode, no gradient tracking.
model2.eval()
overall_accuracy = 0
with torch.no_grad():
    for x, y in test_dloader:
        predicted_y, _ = model2(x)
        batch_size = x.shape[0]
        y_true = y.numpy()
        # Flatten the (batch, 1) outputs so predictions match the label shape.
        y_pred = (predicted_y.squeeze(-1).numpy() > 0.5).astype(int)
        accuracy = accuracy_score(y_true, y_pred)
        # Weight by batch size because the last batch may be smaller.
        overall_accuracy += accuracy * batch_size
test_accuracy = overall_accuracy / len(test_helper)
print(f'Test Accuracy: {test_accuracy}')

Test Accuracy: 0.8087431693989071


=================================

============================================

==========================================

================================================

In [None]:
class RNNWithAttention(nn.Module):
    """Embedding layer, RNN encoder, attention pooling and sigmoid classifier.

    This definition reuses the name of the earlier word2vec-input
    ``RNNWithAttention``; once this cell runs, that name refers to this class.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the RNN.
        num_layers: Number of stacked RNN layers.
        output_dim: Width of the final linear layer.
        embedding_dim: Size of each learned token embedding.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, embedding_dim):
        super(RNNWithAttention, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``(sigmoid outputs, attention weights)`` for index input ``x``.

        ``x`` may be (batch, seq) token indices, or (batch, seq, k) groups of
        indices whose embeddings are averaged over ``k``.
        """
        embedded = self.embedding(x.long())
        # Only average when there is an extra index axis. Averaging dim 2 of a
        # (batch, seq, embedding_dim) tensor would wipe out the embedding axis
        # and hand the RNN an input of the wrong shape.
        if embedded.dim() == 4:
            embedded = torch.mean(embedded, dim=2)
        embedded = embedded.float()
        output, _ = self.rnn(embedded)
        context_vector, attention_weights = self.attention(output)
        output = self.fc(context_vector)
        output = self.sigmoid(output)
        return output, attention_weights

In [None]:
from nltk.tokenize import word_tokenize


class Datasethelper(Dataset):
    """Dataset yielding (token-index tensor, label) pairs from a DataFrame.

    The class attribute ``vocab`` must be set to a token -> index mapping
    before items are accessed. Tokens missing from the mapping fall back to
    the ``'<unk>'`` entry when one exists; otherwise KeyError is raised.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        max_length: Every sample is truncated or ``'<pad>'``-padded to this length.
        tokenizer: Optional callable mapping a string to a list of tokens;
            defaults to NLTK's ``word_tokenize``.
    """

    # Shared token -> index mapping, assigned after the vocabulary is built.
    vocab = None

    def __init__(self, df, max_length, tokenizer=None):
        super().__init__()
        self.data = df['text'].values
        self.labels = df['label'].values
        self.max_length = max_length
        self.tokenizer = word_tokenize if tokenizer is None else tokenizer

    def __len__(self):
        return len(self.data)

    def _lookup(self, token):
        """Return the index for ``token``, using '<unk>' for unseen tokens."""
        if token in self.vocab:
            return self.vocab[token]
        if '<unk>' in self.vocab:
            return self.vocab['<unk>']
        raise KeyError(token)

    def __getitem__(self, index):
        tokens = list(self.tokenizer(self.data[index]))[:self.max_length]
        # A non-positive pad count multiplies to an empty list, so full-length
        # samples are left unchanged.
        tokens = tokens + ['<pad>'] * (self.max_length - len(tokens))
        indices = torch.tensor([self._lookup(token) for token in tokens], dtype=torch.long)
        label = torch.tensor(self.labels[index], dtype=torch.float32)
        return indices, label


# Count every token in the (already preprocessed) corpus.
tokenizer = nltk.word_tokenize
counter = Counter()
for text in df['text'].values:
    counter.update(tokenizer(text))

# Give each token its own id. Using the Counter directly would map tokens to
# their frequencies, so many different words would share one "index" and clash
# with the ids reserved for the special tokens.
vocab = {'<pad>': 0, '<unk>': 1}
for token in counter:
    vocab.setdefault(token, len(vocab))
vocab_size = len(vocab)

Datasethelper.vocab = vocab

max_length = 10
train_helper = Datasethelper(train_df, max_length)
test_helper = Datasethelper(test_df, max_length)


In [None]:
# Inspect one sample only; looping over the whole dataset prints five lines per
# sample and buries the notebook in repeated output.
x, y = train_helper[0]
print(x.dtype)
print(x.shape)
print("hehe")
print(y.dtype)
print(y.shape)

torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.Size([])
torch.int64
torch.Size([10])
hehe
torch.float32
torch.S

In [None]:
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter

class Datasethelper(Dataset):
    """Dataset yielding (token-index tensor, label) pairs via a torchtext vocabulary.

    The class attribute ``vocab`` must be set to an object that supports
    ``vocab[token]`` before items are accessed; the script below assigns one.
    """

    vocab = None  # Shared vocabulary, assigned once it has been built

    def __init__(self, df, max_length):
        super().__init__()
        self.data = df['text'].values
        self.labels = df['label'].values
        self.tokenizer = get_tokenizer('basic_english')
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        tokens = self.tokenizer(text)
        # Truncate or pad tokens to max_length
        tokens = tokens[:self.max_length] + ['<pad>'] * (self.max_length - len(tokens))
        # Index the vocabulary directly: the object built by
        # torchtext.vocab.vocab() has no `.stoi` attribute.
        indices = [self.vocab[token] for token in tokens]
        indices = torch.tensor(indices, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float32)
        return indices, label

# Tokenize and build vocabulary
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for text in df['text'].values:
    counter.update(tokenizer(text))
# Register the padding/unknown tokens explicitly: '<pad>' never occurs in the
# corpus, and unseen tokens need a default index to fall back on.
vocab = torchtext.vocab.vocab(counter, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

# Set the vocabulary in Datasethelper
Datasethelper.vocab = vocab

# Instantiate Datasethelper
max_length = 10  # Maximum sequence length
train_helper = Datasethelper(train_df, max_length)
test_helper = Datasethelper(test_df, max_length)


In [None]:
# Check the dtype of a single sample rather than printing it for every item.
x, y = train_helper[0]
print(x.dtype)

AttributeError: ignored

In [None]:
class Datasethelper2(Dataset):
    """Dataset yielding (token-index tensor, label) pairs from a DataFrame.

    The class attribute ``vocab`` may be either a token -> index mapping or an
    object exposing such a mapping as ``.stoi``. Tokens missing from the
    mapping fall back to its ``'<unk>'`` entry when one exists.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        max_length: Every sample is truncated or ``'<pad>'``-padded to this length.
        tokenizer: Optional callable mapping a string to a list of tokens;
            defaults to torchtext's ``basic_english`` tokenizer.
    """

    vocab = None  # Shared vocabulary, assigned once it has been built

    def __init__(self, df, max_length, tokenizer=None):
        super().__init__()
        self.data = df['text'].values
        self.labels = df['label'].values
        self.tokenizer = get_tokenizer('basic_english') if tokenizer is None else tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _token_index(self, token):
        """Return the index of ``token``, using '<unk>' for unseen tokens."""
        # Accept both a plain mapping and an object carrying the mapping as `.stoi`;
        # unconditionally reading `.stoi` fails when a mapping is assigned.
        table = getattr(self.vocab, 'stoi', self.vocab)
        if token in table:
            return table[token]
        if '<unk>' in table:
            return table['<unk>']
        return table[token]

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        tokens = list(self.tokenizer(text))
        # Truncate or pad tokens to max_length
        tokens = tokens[:self.max_length] + ['<pad>'] * (self.max_length - len(tokens))
        # Convert tokens to indices using vocabulary
        indices = [self._token_index(token) for token in tokens]
        indices = torch.tensor(indices, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float32)
        return indices, label

In [None]:
# Count the tokens of the (already preprocessed) corpus.
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for text in df['text'].values:
    counter.update(tokenizer(text))

# Build the vocabulary with the factory function; the recorded run of this cell,
# which constructed Vocab directly and read `.stoi`, ended in an AttributeError.
# The special tokens are registered explicitly because '<pad>' never occurs in the corpus.
vocab = torchtext.vocab.vocab(counter, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

# Set the token -> index mapping in Datasethelper2
Datasethelper2.vocab = vocab.get_stoi()


AttributeError: ignored

In [None]:
train_df2, test_df2 = train_test_split(df, test_size=0.2, random_state=42)

# Instantiate Datasethelper2 on the frames split above; previously the split
# result was ignored and the helpers were built from train_df/test_df.
train_helper2 = Datasethelper2(train_df2, 10)
test_helper2 = Datasethelper2(test_df2, 10)

# Only the training batches are shuffled; evaluation order stays fixed.
train_dloader2 = DataLoader(train_helper2, batch_size=12, shuffle=True)
test_dloader2 = DataLoader(test_helper2, batch_size=12, shuffle=False)

In [None]:
# Inspect one sample only instead of printing three lines for every item.
x, y = train_helper2[0]
print(x.dtype)
print(y.dtype)
print(x.shape)




AttributeError: ignored