## <div style="text-align:center;">Yelp Revies Sentiment Classification</div> 
### [SpaCy--> TF-iDF --> MultiLayer Perceptron in Pytorch]

In [2]:
import torch
import pandas as pd
import numpy as np
import spacy
import torch.nn as nn
import torch.optim as optim
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [3]:
df = pd.read_csv('sampled_reviews.csv')
df.sample(5)

Unnamed: 0,review,rating
0,What a horrible experience we had at this reso...,0
1,"So I'm not usually one to fuss, pretty laid ba...",0
2,Let me first be clear as to why I gave two sta...,0
3,"Well, it's safe to say that the majority of th...",0
4,"This wasn't my first time here, but it's been ...",0


**Preprocessing with SpaCy**

In [4]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def preprocessing(text):
    # Tokenize the text using spaCy
    doc = nlp(text)

    # Initialize a list to store filtered words
    filtered_words = []

    for token in doc:
        # Convert token text to lowercase
        token_text = token.text.lower()

        # Check if the token is a stop word or punctuation
        if token.is_stop or token_text in string.punctuation:
            continue

        # Lemmatize the token and add it to the filtered list
        lemma = token.lemma_
        filtered_words.append(lemma)

    # Join the filtered words to form preprocessed text
    preprocessed_text = ' '.join(filtered_words)

    # Additional text replacements using the provided replacer dictionary
    replacer = {
        '\n': '',
        "[\[].*?[\]]": "",
        '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’""′‘\\\]': ""
    }

    for pattern, replacement in replacer.items():
        preprocessed_text = re.sub(pattern, replacement, preprocessed_text)

    # Remove extra spaces
    preprocessed_text = ' '.join(preprocessed_text.split())

    return preprocessed_text


In [6]:
#apply the preprocessing on the dataset
df['review'] = df['review'].apply(preprocessing)

**Vectorizing**

In [24]:
# Split the dataset into training and validation sets.
reviews_train, reviews_val, ratings_train, ratings_val = train_test_split(df.review, df.rating, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train = tfidf_vectorizer.fit_transform(reviews_train)

# Transform the test data 
X_test = tfidf_vectorizer.transform(reviews_val)

**Dataset and DataLoader**

In [72]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# Convert TF-IDF matrices to tensors
X_train_tensor = torch.Tensor(X_train.toarray())
X_test_tensor = torch.Tensor(X_test.toarray())

# Convert rating labels to tensors
y_train_tensor = torch.Tensor(ratings_train.to_numpy())
y_test_tensor = torch.Tensor(ratings_val.to_numpy())
# Create DataLoader objects
batch_size = 32
train_dataset = MyDataset(X_train_tensor,y_train_tensor.view(-1,1))
test_dataset = MyDataset(X_test_tensor, y_test_tensor.view(-1,1))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

**Model Architecture**

In [82]:
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

# Initialize the model
input_size = X_train_tensor.shape[1]
hidden_size = 64  
output_size = 1 

model = MLPClassifier(input_size, hidden_size, output_size)

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


**Training loop**

In [83]:
num_epochs = 50  # Adjust as needed
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)  
        loss.backward()
        optimizer.step()
    if (epoch+1)%5 ==0:
            print(f'epochs: [{epoch+1}/{num_epochs}] Loss: {loss.item():.4f}')
print('Training Finished')
print('\n')
# Evaluation loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = outputs.round()
        accuracy = (predicted == labels.view(-1, 1)).float().mean()
    print(f"Accuracy on test data: {accuracy:.2f}%")

epochs: [5/50] Loss: 0.1258
epochs: [10/50] Loss: 0.0118
epochs: [15/50] Loss: 0.0068
epochs: [20/50] Loss: 0.0057
epochs: [25/50] Loss: 0.0017
epochs: [30/50] Loss: 0.0007
epochs: [35/50] Loss: 0.0007
epochs: [40/50] Loss: 0.0005
epochs: [45/50] Loss: 0.0005
epochs: [50/50] Loss: 0.0003
Training Finished


Accuracy on test data: 0.83%


**With a better computation power, more data can be used and the model can be re-trained. Random sample of 4000 rows of Yelp Sentiment dataset was used.** 

In [None]:
# Save the trained model
torch.save(model.state_dict(), 'sentiment_mlp_model.pth')