In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the review dataset; the code below reads its 'text' and 'label' columns.
df = pd.read_csv('reviews_dataset2.csv')
print(df.head())

# Shuffle every row. The seed is fixed so that the 10,000-row subset taken
# below (and therefore the accuracy reported by the next cell) is the same on
# every run instead of changing with each execution.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(shuffled_df.head())

texts = shuffled_df['text'].values
labels = shuffled_df['label'].values

# Bag-of-words counts for the first 10,000 shuffled reviews. .toarray() turns
# the sparse count matrix into a dense array.
# NOTE(review): the vocabulary is fitted before the train/test split, so test
# reviews contribute columns to it.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts[:10000]).toarray()

# Encode the labels of the same 10,000 rows as integer class ids.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(labels[:10000])

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

print(y_train[:3])
print(X[:100])

In [None]:
# simple neural network
# near 85 % accuracy only

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# create
class SimpleNN(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so passing it values that were already
    softmaxed normalises twice and flattens the gradients. Arg-max predictions
    are identical for logits and probabilities; use ``predict_proba`` when
    normalised probabilities are needed.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer.
        output_dim: number of classes (size of the output vector).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        # Kept as an attribute for predict_proba; no longer used in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return unnormalised class scores for a batch of feature vectors."""
        x = self.fc1(x)
        x = self.relu(x)
        return self.fc2(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return self.softmax(self(x))

# Size the network from the data: one input per feature column of x_train and
# one output per distinct encoded label in Y.
model = SimpleNN(input_dim=x_train.shape[1], hidden_dim=1000, output_dim=len(set(Y)))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train
# The training arrays do not change between epochs, so they are converted to
# tensors once here instead of on every iteration of the loop.
inputs = torch.tensor(x_train, dtype=torch.float32)
targets = torch.tensor(y_train, dtype=torch.long)

model.train()
for epoch in range(100):
    # Full-batch training: each epoch is a single optimizer step computed on
    # the whole training set.
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# estimating
# Score the held-out split: build the test tensors, run one forward pass with
# gradient tracking disabled, and report the fraction of correct arg-max
# predictions.
model.eval()
inputs = torch.tensor(x_test, dtype=torch.float32)
targets = torch.tensor(y_test, dtype=torch.long)
with torch.no_grad():
    outputs = model(inputs)
    predicted = outputs.argmax(dim=1)
    accuracy = predicted.eq(targets).float().mean()
print(f'Test Accuracy: {accuracy.item()}')

In [None]:
# near 80 % accuracy only...
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter

df = pd.read_csv('reviews_dataset2.csv')
print(df.head())

# Shuffle every row with a fixed seed so the run is repeatable.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(shuffled_df.head())

texts = shuffled_df['text'].values
labels = shuffled_df['label'].values

# tokenize and build vocabulary
# Count whitespace-separated words text by text; this yields the same counts
# (in the same first-seen order) as joining the whole corpus into one string,
# without materialising that string.
word_counts = Counter(word for text in texts for word in text.split())

# words to indexes mapping
# Ids start at 1 in first-seen order; 0 is left free for padding.
vocab = {word: i for i, word in enumerate(word_counts, 1)}

# convert texts to sequences of numbers
def text_to_sequence(text, vocab):
    """Turn a text into a list of vocabulary ids.

    The text is split on whitespace; words that are not keys of ``vocab`` are
    skipped, so the result may be shorter than the number of words.

    Args:
        text: the string to encode.
        vocab: mapping from word to integer id.

    Returns:
        List of ids, in word order.
    """
    ids = []
    for word in text.split():
        if word in vocab:
            ids.append(vocab[word])
    return ids

sequences = [text_to_sequence(text, vocab) for text in texts]

# setting sequences to have the same length
# Right-pad every sequence with 0 up to the length of the longest one.
max_len = max(len(seq) for seq in sequences)
padded_sequences = [seq + [0] * (max_len - len(seq)) for seq in sequences]

inputs = torch.tensor(padded_sequences, dtype=torch.long)
# unsqueeze(1) gives the labels a trailing dimension of size 1, matching the
# model's single output unit.
labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

# split into train and test sets
# Seeded (same value as the first cell's split) so the reported accuracy is
# reproducible.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs, labels, test_size=0.3, random_state=42
)

# model
class SentimentAnalysisModel(nn.Module):
    """Embedding -> flatten -> Linear -> ReLU -> Linear -> Sigmoid.

    ``forward`` returns values in [0, 1] (a sigmoid is applied to the last
    layer), so the matching loss is one that expects probabilities, such as
    ``nn.BCELoss``.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output units.
        seq_len: length every input sequence is padded to. The first linear
            layer is sized ``embedding_dim * seq_len``. When omitted, the
            notebook-level ``max_len`` variable is used, which was the only
            behaviour before this parameter existed.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, seq_len=None):
        super(SentimentAnalysisModel, self).__init__()
        if seq_len is None:
            # Backward-compatible fallback to the global computed during padding.
            seq_len = max_len
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * seq_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs for a batch of padded id sequences."""
        embedded = self.embedding(x)
        # Concatenate the embeddings of all positions into one vector per sample.
        flattened = embedded.view(embedded.size(0), -1)
        out = torch.relu(self.fc1(flattened))
        out = self.sigmoid(self.fc2(out))
        return out

# hyperparameters
vocab_size = len(vocab) + 1  # plus one for padding (index 0)
embedding_dim = 27
hidden_dim = 17
output_dim = 1
learning_rate = 0.001
num_epochs = 100

model = SentimentAnalysisModel(vocab_size, embedding_dim, hidden_dim, output_dim)
# The model's forward() already ends in a sigmoid, so its outputs are
# probabilities. nn.BCELoss consumes probabilities directly;
# nn.BCEWithLogitsLoss would apply a second sigmoid on top of them and train
# against squashed values.
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss on probabilities
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# training
# Full-batch training: each epoch runs one forward/backward pass over all of
# train_inputs and takes a single optimizer step.
for epoch in range(num_epochs):
    model.train()
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()

    outputs = model(train_inputs)
    loss = criterion(outputs, train_labels)

    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Evaluate on the held-out split with gradient tracking disabled.
model.eval()
with torch.no_grad():
    test_outputs = model(test_inputs)

# Convert probabilities to binary predictions by thresholding at 0.5, then
# report the share of predictions that equal the labels.
predictions = test_outputs.ge(0.5).float()
accuracy = predictions.eq(test_labels).sum() / len(test_labels)
print(f'Accuracy: {accuracy.item() * 100:.2f}%')


In [143]:
# Neural Bag of Words (NBoW) approach, also known as continuous bag-of-words (CBoW)
# more than 92 % accuracy ! 
import pandas as pd
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer

import torch
from torch.utils.data import DataLoader, TensorDataset

import pickle
import dill

# pandas to tokens
df = pd.read_csv('reviews_dataset2.csv')
print(df.head())

dataset = Dataset.from_pandas(df)
# Hold out 20% of the rows as the test set.
# - The result is bound to `split_datasets`, not `train_test_split`, so it no
#   longer shadows the sklearn function of that name imported by earlier cells.
# - The seed is fixed so that re-running this cell reproduces the same split;
#   otherwise a model saved from a previous run could be evaluated on rows it
#   was trained on.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict(split_datasets)

train_data = dataset_dict['train']
test_data = dataset_dict['test']

# Load the tokenizer published under the name 'bert-base-uncased'. In the code
# shown in this notebook only its tokenize() method is used.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# key parameter !
# Upper bound on the number of tokens kept per review: tokenize_text passes it
# to tokenizer.tokenize as max_length together with truncation=True.
max_length=1024

def tokenize_text(example):
    """Add a "tokens" entry to one dataset row and return the row.

    The row's "text" value is tokenized with the notebook-level ``tokenizer``,
    truncated to the notebook-level ``max_length``. The row is modified in
    place and also returned.
    """
    example["tokens"] = tokenizer.tokenize(
        example["text"],
        truncation=True,
        max_length=max_length,
    )
    return example

# apply the tokenization function
train_data = train_data.map(tokenize_text)
test_data = test_data.map(tokenize_text)

# Fraction of the training rows set aside for validation.
test_size = 0.25

# splitting data
# Seeded so that the train/validation partition is the same on every run.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=42)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

# Minimum number of occurrences a token needs to enter the vocabulary
# (the builder keeps tokens with count >= min_freq, so 1 keeps every token).
min_freq = 1 # chosen for better accuracy
# Reserved entries passed to the vocabulary builder as `specials`.
special_tokens = ["<unk>", "<pad>"]

# building vocabulary
def build_vocab_from_iterator(tokens_data, min_freq, specials):
    """Build a token -> index mapping from tokenized texts.

    Indices are contiguous from 0: the special tokens come first, in the order
    given, followed by the remaining tokens in sorted order.

    Args:
        tokens_data: iterable of token lists (one list per text).
        min_freq: a token is kept only if it occurs at least this many times.
        specials: sequence of reserved tokens placed at the front.

    Returns:
        dict mapping each token to its integer index.
    """
    # Imported here so the function does not depend on another cell having
    # imported Counter.
    from collections import Counter

    specials = list(specials)
    reserved = set(specials)
    token_counts = Counter(token for tokens in tokens_data for token in tokens)
    # A corpus token that equals a special token must not be listed a second
    # time: the duplicate would overwrite the special's index and push the
    # largest index past len(vocab).
    kept = sorted(
        token
        for token, count in token_counts.items()
        if count >= min_freq and token not in reserved
    )
    return {token: idx for idx, token in enumerate(specials + kept)}

# Build the vocabulary from the tokens of the training split only; validation
# and test tokens are not passed in here.
vocab = build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

# Indices of the two reserved tokens, looked up from the vocabulary.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

# converting tokenized texts to numerical indices
def encode_texts_to_indices(texts, vocab):
    """Replace every token by its vocabulary index.

    Tokens that are not in ``vocab`` are mapped to the index of "<unk>".

    Args:
        texts: iterable of token lists.
        vocab: mapping from token to index; must contain "<unk>".

    Returns:
        List of index lists, one per input text.
    """
    encoded = []
    for tokens in texts:
        row = []
        for token in tokens:
            row.append(vocab.get(token, vocab["<unk>"]))
        encoded.append(row)
    return encoded

# padding sequences to the same length
def pad_sequences(sequences, pad_index):
    """Right-pad index sequences to a common length.

    Args:
        sequences: iterable of index sequences.
        pad_index: value appended to the shorter sequences.

    Returns:
        Tuple ``(padded, max_len)``: ``padded`` is a list of new lists, all of
        length ``max_len`` (the length of the longest input). For empty input
        the result is ``([], 0)`` instead of an error. The inputs are not
        modified.
    """
    sequences = list(sequences)  # iterated twice below
    max_len = max((len(seq) for seq in sequences), default=0)
    padded = [list(seq) + [pad_index] * (max_len - len(seq)) for seq in sequences]
    return (padded, max_len)

# converting tokenized texts to numerical indices (train, validation, test)
train_indices, valid_indices, test_indices = (
    encode_texts_to_indices(split["tokens"], vocab)
    for split in (train_data, valid_data, test_data)
)

# padding sequences
# Each split is padded to the length of its own longest sequence.
pad_index = vocab["<pad>"]
(
    (train_indices_padded, max_len_train),
    (valid_indices_padded, max_len_valid),
    (test_indices_padded, max_len_test),
) = [
    pad_sequences(ids, pad_index)
    for ids in (train_indices, valid_indices, test_indices)
]

print(f"Max lengths: {max_len_train} {max_len_valid} {max_len_test}")

# building tensors, datasets and data loaders, one split at a time

# training split (the only loader that shuffles)
X_train = torch.tensor(train_indices_padded, dtype=torch.long)
y_train = torch.tensor(train_data["label"], dtype=torch.long)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# validation split
X_valid = torch.tensor(valid_indices_padded, dtype=torch.long)
y_valid = torch.tensor(valid_data["label"], dtype=torch.long)
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32)

# test split
X_test = torch.tensor(test_indices_padded, dtype=torch.long)
y_test = torch.tensor(test_data["label"], dtype=torch.long)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)

# Neural Bag of Words model
class NBoW(nn.Module):
    """Neural bag-of-words classifier: embed, average over dim 1, project.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        output_dim: number of output scores per sample.
        pad_index: passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_index,
        )
        self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

    def forward(self, ids):
        """Return unnormalised class scores for a batch of id sequences.

        The mean is taken over every position along dim 1, padded positions
        included.
        """
        # assumes ids is (batch, seq_len) — TODO confirm against callers
        return self.fc(self.embedding(ids).mean(dim=1))

# defining model parameters
embedding_dim = 300
# One output score per distinct value in the dataframe's 'label' column.
output_dim = len(set(df['label']))
vocab_size = len(vocab)
pad_index = vocab["<pad>"]

# initializing model
model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)

# loss function and optimizer
# NOTE(review): `nn` and `optim` are not imported in this cell; they are only
# in scope if an earlier cell that imports them has been run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training the model
def train_model_iteration(model, train_loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    The model is put in training mode for the pass and switched to evaluation
    mode before returning.

    Args:
        model: callable module mapping a batch of inputs to scores.
        train_loader: sized iterable yielding ``(inputs, labels)`` batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss function called as ``criterion(outputs, labels)``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for texts, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(texts), labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    mean_loss = sum(batch_losses) / len(train_loader)
    # Leave the model in eval mode for the validation pass that follows.
    model.eval()
    return mean_loss

# estimating the model
def estimate_model_iteration(valid_loader, model, criterion):
    """Compute mean per-batch loss and accuracy over a loader, without gradients.

    The model's train/eval mode is not changed here; callers set it.

    Args:
        valid_loader: sized iterable yielding ``(inputs, labels)`` batches.
        model: callable mapping a batch of inputs to per-class scores.
        criterion: loss function called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(loss, accuracy)``: the summed batch losses divided by the
        number of batches, and the percentage (0-100) of samples whose
        highest-scoring class equals the label.
    """
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for texts, labels in valid_loader:
            outputs = model(texts)
            loss_sum += criterion(outputs, labels).item()
            hits += (outputs.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    return loss_sum / len(valid_loader), 100 * hits / seen

def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=6):
    """Train for ``num_epochs`` epochs, printing train/validation metrics after each.

    Each epoch runs one pass of train_model_iteration over ``train_loader``
    followed by one pass of estimate_model_iteration over ``valid_loader``.
    Nothing is returned; the model is updated in place.
    """
    for epoch in range(num_epochs):
        train_loss = train_model_iteration(model, train_loader, optimizer, criterion)
        valid_loss, accuracy = estimate_model_iteration(valid_loader, model, criterion)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Train with the default number of epochs.
train_model(model, train_loader, valid_loader, criterion, optimizer)

def save_model_and_vocab(model, vocab, filename):
    """Serialize the model object and its vocabulary into one file with dill.

    The file holds a dict with the keys 'model' and 'vocab'. A confirmation
    line is printed once the file has been written.
    """
    payload = {'model': model, 'vocab': vocab}
    with open(filename, 'wb') as f:
        dill.dump(payload, f)
    print(f'Model and vocabulary saved to {filename}')

# Destination file for the trained model and its vocabulary.
model_path = "PyTorchNBoWModel.pkl"
save_model_and_vocab(model, vocab, model_path)


                                                text  label
0  one best crichton novel sphere michael crichto...      1
1  medicine future z accomplished heart surgeon f...      1
2  beautiful gorgeous network comic book contains...      1
3  lover robicheaux book lover robicheaux demon s...      1
4  excellent broad survey development civilizatio...      1




Map:   0%|          | 0/56368 [00:00<?, ? examples/s]

Map:   0%|          | 0/14093 [00:00<?, ? examples/s]

Max lengths: 1024 1024 1024
Epoch 1/6, Train Loss: 0.4592, Valid Loss: 0.3512, Accuracy: 84.05%
Epoch 2/6, Train Loss: 0.2843, Valid Loss: 0.2676, Accuracy: 89.48%
Epoch 3/6, Train Loss: 0.2191, Valid Loss: 0.2484, Accuracy: 90.49%
Epoch 4/6, Train Loss: 0.1864, Valid Loss: 0.2435, Accuracy: 90.92%
Epoch 5/6, Train Loss: 0.1652, Valid Loss: 0.2452, Accuracy: 90.99%
Epoch 6/6, Train Loss: 0.1478, Valid Loss: 0.2518, Accuracy: 91.04%
Model and vocabulary saved to PyTorchNBoWModel.pkl


In [140]:
def load_model_and_vocab(filename):
    """Load a model and vocabulary saved as a dict with 'model' and 'vocab' keys.

    The file is written with ``dill.dump`` by save_model_and_vocab, so it is
    read back with the matching ``dill.load`` rather than ``pickle.load``.

    Args:
        filename: path of the saved file.

    Returns:
        Tuple ``(model, vocab)``; the model is switched to evaluation mode.
    """
    import dill  # local import: this cell does not import dill itself

    # SECURITY: unpickling runs code stored in the file. Only load files that
    # you created yourself or otherwise trust.
    with open(filename, 'rb') as f:
        data = dill.load(f)
    model = data['model']
    vocab = data['vocab']
    model.eval()
    return model, vocab

# Load the model and vocabulary
model, vocab = load_model_and_vocab(model_path)
vocab_size = len(vocab)

def evaluate_model(model, test_loader):
    """Print the loss and accuracy of ``model`` on ``test_loader``.

    NOTE(review): the loss function is read from the notebook-level
    ``criterion`` variable rather than passed in, so that variable must be
    defined before this is called.
    """
    model.eval()

    test_loss, accuracy = estimate_model_iteration(test_loader, model, criterion)
    print(f'Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Score the loaded model on the held-out test loader.
evaluate_model(model, test_loader)

  return torch.load(io.BytesIO(b))


Test Loss: 0.2544, Accuracy: 91.30%


In [142]:
def preprocess_text(text, vocab, max_length=1024, tokenize=None):
    """Encode one text as a ``(1, max_length)`` tensor of vocabulary indices.

    Tokens missing from ``vocab`` map to the "<unk>" index. The sequence is
    truncated to ``max_length`` tokens and right-padded with the "<pad>" index,
    so the result always has exactly ``max_length`` columns.

    Args:
        text: raw input string.
        vocab: mapping from token to index; must contain "<unk>" and "<pad>".
        max_length: fixed length of the returned sequence.
        tokenize: optional callable turning ``text`` into a list of tokens.
            Defaults to whitespace splitting. NOTE: the training data in this
            notebook is tokenized with ``tokenizer.tokenize``; pass that
            callable here to encode inputs the same way.

    Returns:
        ``torch.long`` tensor of shape ``(1, max_length)``.
    """
    tokens = text.split() if tokenize is None else tokenize(text)
    indices = [vocab.get(token, vocab['<unk>']) for token in tokens]

    # Truncate first so that long inputs cannot exceed max_length.
    indices = indices[:max_length]
    if len(indices) < max_length:
        indices += [vocab['<pad>']] * (max_length - len(indices))
    return torch.tensor(indices, dtype=torch.long).unsqueeze(0)

def predict(model, text, vocab):
    """Return the index of the highest-scoring class for a single text.

    The text is encoded with preprocess_text using its default length, and the
    model is run in evaluation mode with gradient tracking disabled.
    """
    model.eval()
    input_tensor = preprocess_text(text, vocab)
    with torch.no_grad():
        scores = model(input_tensor)
    return scores.argmax(dim=1).item()

# Two hand-written sample inputs used as a quick sanity check of the loaded
# model; each is classified and the predicted class index is printed.
text = "negative reaction terrible bad disgusting"
text2 ="good better maybe forced"
predicted_label = predict(model, text, vocab)
print(f"Predicted label 1: {predicted_label}")
predicted_label = predict(model, text2, vocab)
print(f"Predicted label 2: {predicted_label}")

Predicted label 1: 0
Predicted label 2: 1
