<a href="https://colab.research.google.com/github/Altemir1/Machine_learning/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the labelled reviews from a path relative to the working directory,
# then preview the first rows.
reviews_csv = "movie.csv"
movies = pd.read_csv(reviews_csv)
movies.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [None]:
# Class balance check: label 0 marks a negative review, label 1 a positive one.
label_counts = movies["label"].value_counts()
label_counts

0    20019
1    19981
Name: label, dtype: int64

# Creating the neural network

Importing necessary libraries

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
!pip install torch torchvision torchtext spacy
!python -m spacy download en_core_web_sm

[0mCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Tokenize the text

In [None]:
# spaCy's small English pipeline; only its tokenizer is called below.
spacy_en = spacy.load('en_core_web_sm')


def tokenize_en(text):
    """Split ``text`` into a list of token strings using the spaCy tokenizer."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens


# Store every review's token list in a new column alongside the raw text.
review_texts = movies['text']
movies['tokenized_text'] = review_texts.apply(tokenize_en)


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffled split repeatable across runs.
train_df, val_df = train_test_split(movies, test_size=0.2, random_state=42)

Creating a Vocabulary

In [None]:
def yield_tokens(data_iter):
    """Lazily re-yield every item of ``data_iter`` (one token list per review)."""
    yield from data_iter

# Build the vocabulary from the training split only, reserving ids for the
# unknown-word and padding tokens.
train_token_stream = yield_tokens(train_df['tokenized_text'])
vocab = build_vocab_from_iterator(train_token_stream, specials=['<unk>', '<pad>'])

# Tokens missing from the vocabulary are looked up as '<unk>'.
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)


Numericalize the tokenized text

In [None]:
def numericalize(text, vocab=vocab):
    """Look up each token of ``text`` in ``vocab`` and return the ids as a list."""
    token_ids = []
    for token in text:
        token_ids.append(vocab[token])
    return token_ids

# Add the integer-id version of every review to both splits.
for split_df in (train_df, val_df):
    split_df['numericalized_text'] = split_df['tokenized_text'].apply(numericalize)


Create Dataset and DataLoader

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch, pad_index=None):
    """Collate ``(token_ids, label)`` samples into one padded batch.

    Args:
        batch: Iterable of ``(token_ids, label)`` pairs. Each element may be a
            tensor or plain Python data (list of ints / number).
        pad_index: Id used to right-pad shorter sequences. Defaults to the id
            of ``'<pad>'`` in the notebook's ``vocab``.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a long tensor of shape
        ``(batch, longest_sequence)`` and ``labels`` a float tensor of shape
        ``(batch,)``.
    """
    if pad_index is None:
        pad_index = vocab['<pad>']

    text_list, label_list = [], []
    for _text, _label in batch:
        # torch.as_tensor reuses a tensor that already has the requested dtype
        # and converts anything else; torch.tensor() on an existing tensor
        # copies it and emits a UserWarning on every sample.
        text_list.append(torch.as_tensor(_text, dtype=torch.long))
        label_list.append(torch.as_tensor(_label, dtype=torch.float))

    padded_texts = pad_sequence(text_list, batch_first=True, padding_value=pad_index)
    labels = torch.stack(label_list)
    return padded_texts, labels


In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset over a DataFrame of numericalized reviews.

    Every item is a ``(token_ids, label)`` pair of tensors built from the
    ``numericalized_text`` and ``label`` columns of the wrapped frame.
    """

    def __init__(self, data, vocab):
        # The vocabulary is kept on the instance; the methods below do not read it.
        self.data, self.vocab = data, vocab

    def __len__(self):
        # One sample per DataFrame row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        row = self.data.iloc[idx]
        token_ids = torch.tensor(row['numericalized_text'], dtype=torch.long)
        target = torch.tensor(row['label'], dtype=torch.float)
        return token_ids, target

# Wrap each split in a dataset, then batch it with the padding collate function.
# Only the training batches are reshuffled each epoch.
train_dataset = SentimentDataset(data=train_df, vocab=vocab)
val_dataset = SentimentDataset(data=val_df, vocab=vocab)

shared_loader_kwargs = {'batch_size': 32, 'collate_fn': collate_batch}
train_dataloader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)


Define the model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SentimentClassifier(nn.Module):
    """Bag-of-embeddings sentiment classifier.

    Token ids are embedded, averaged over the sequence dimension and passed
    through a ReLU hidden layer to produce raw logits (no sigmoid applied).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Width of the hidden fully-connected layer.
        output_dim: Number of logits produced per example.
        pad_idx: Optional id of the padding token. When given, that embedding
            row stays at zero and padded positions are excluded from the
            average, so a review's representation does not depend on how much
            padding its batch added. ``None`` (the default) averages every
            position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx=None):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape (batch, output_dim) for ids of shape (batch, seq_len)."""
        embedded = self.embedding(text)  # (batch, seq_len, embedding_dim)

        # 1.0 for positions that count towards the average, 0.0 for padding.
        if self.pad_idx is None:
            keep = torch.ones_like(text, dtype=embedded.dtype)
        else:
            keep = (text != self.pad_idx).to(embedded.dtype)
        keep = keep.unsqueeze(-1)  # (batch, seq_len, 1)

        # Clamp so a zero-length or all-padding row divides by 1 and pools to
        # zeros instead of raising or producing NaN.
        token_counts = keep.sum(dim=1).clamp(min=1.0)
        pooled = (embedded * keep).sum(dim=1) / token_counts
        return self.out(F.relu(self.fc(pooled)))


# Pass the padding id so padded positions are ignored when pooling.
model = SentimentClassifier(len(vocab), 100, 256, 1, pad_idx=vocab['<pad>'])


Training model

In [None]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss, examples_seen = 0.0, 0
    for text, label in train_dataloader:
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)  # Ensure predictions and labels are the same shape
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean; weight it by the batch size so
        # a smaller final batch does not skew the epoch average.
        batch_size = label.size(0)
        running_loss += loss.item() * batch_size
        examples_seen += batch_size
    # Report the mean loss over the whole epoch. The last mini-batch's loss
    # alone is too noisy to track progress and is undefined for an empty loader.
    print(f'Epoch: {epoch+1}, Loss: {running_loss / max(examples_seen, 1)}')


  text_list.append(torch.tensor(_text, dtype=torch.long))
  label_list.append(torch.tensor(_label, dtype=torch.float))


Epoch: 1, Loss: 0.48953700065612793
Epoch: 2, Loss: 0.18165171146392822
Epoch: 3, Loss: 0.14091083407402039
Epoch: 4, Loss: 0.24141095578670502
Epoch: 5, Loss: 0.08210613578557968


Evaluate the model

In [None]:
# Switch to inference mode and count correct predictions over the validation set.
model.eval()
total_acc, total_count = 0, 0

with torch.no_grad():
    for text, label in val_dataloader:
        predictions = model(text).squeeze(1)
        probabilities = torch.sigmoid(predictions)
        # A probability of at least 0.5 is counted as the positive class.
        hard_labels = (probabilities >= 0.5).float()
        matches = hard_labels == label
        total_acc += matches.sum().item()
        total_count += label.size(0)

print(f'Validation Accuracy: {total_acc / total_count:.4f}')


  text_list.append(torch.tensor(_text, dtype=torch.long))
  label_list.append(torch.tensor(_label, dtype=torch.float))


Validation Accuracy: 0.8904


Predicting the sentiment of a single review

In [None]:
input_text = input("Enter review on the movie: ")
tokenized_text = tokenize_en(input_text)
# Tokens that are not in the vocabulary resolve to its default index.
numericalized_text = [vocab[token] for token in tokenized_text]

if not numericalized_text:
    # An empty review yields a zero-length sequence with nothing to average,
    # so skip the model call instead of failing inside it.
    print('No review text entered; nothing to classify.')
else:
    text_tensor = torch.tensor([numericalized_text], dtype=torch.long)  # Add batch dimension
    model.eval()
    with torch.no_grad():
        predictions = model(text_tensor)
        predicted_prob = torch.sigmoid(predictions.squeeze(0))

    # Probabilities at or above the threshold are reported as positive.
    threshold = 0.5
    predicted_class = 'Positive' if predicted_prob >= threshold else 'Negative'
    print(f'Predicted Sentiment: {predicted_class}, Probability: {predicted_prob.item():.4f}')


Enter review on the movie: Great movie, I'll watch it again
Predicted Sentiment: Positive, Probability: 1.0000
