In [95]:
import random
import matplotlib.pyplot as plt
import os
from pathlib import Path

# pytorch

import torch
import torch.nn as nn
from torchvision import transforms
import torch.nn.functional as F 


In [96]:
#Import Dependencies
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD


### Importing the data

First, lets import the data for training and validation

In [97]:
#specify CSV path
csv_path = '../Preprocessing/data_tokenized.csv'

#creating a df from the file
data = pd.read_csv(csv_path)

data.head(5)

Unnamed: 0,text,rating_overall
0,"['wonderful', 'boutique', 'hotel', 'located', ...",5.0
1,"['stayed', '4', '5', 'star', 'hotel', 'manhatt...",5.0
2,"['service', 'location', 'cleanliness', 'outsta...",5.0
3,"['pulled', 'curb', 'front', 'hotel', 'greeted'...",5.0
4,"['stayed', 'setai', 'special', 'company', 'pro...",5.0


In [98]:
data['rating_overall'] = data['rating_overall'].replace(range(0, 3), 'aNegative')
data['rating_overall'] = data['rating_overall'].replace(3, 'bNeutral')
data['rating_overall'] = data['rating_overall'].replace(range(4, 6), 'cPositive')

result = data.groupby('rating_overall').size()

result

rating_overall
aNegative     8632
bNeutral     10754
cPositive    93212
dtype: int64

The data is siginficantly imbalanced, having way to many positive reviews compared to the negative and neutral, so we will therefore balance the data by downsampling the majoruty vakulue

In [99]:
from sklearn.utils import resample

# Count the number of samples in each class
class_counts = data['rating_overall'].value_counts()
print("Class counts before downsampling:")
print(class_counts)

# Separate the classes
neutral_data = data[data['rating_overall'] == 'bNeutral']
positive_data = data[data['rating_overall'] == 'cPositive']
negative_data = data[data['rating_overall'] == 'aNegative']

# Downsample the cPositive class to match the size of bNeutral
positive_data_downsampled = resample(
    positive_data,
    replace=False,  # Do not sample with replacement
    n_samples=len(neutral_data),  # Match the size of the bNeutral class
    random_state=42  # For reproducibility
)

# Combine the downsampled cPositive class with the other classes
data_balanced = pd.concat([neutral_data, positive_data_downsampled, negative_data])

# Shuffle the dataset
data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the class distribution after downsampling
print("Class counts after downsampling:")
print(data_balanced['rating_overall'].value_counts())

Class counts before downsampling:
rating_overall
cPositive    93212
bNeutral     10754
aNegative     8632
Name: count, dtype: int64
Class counts after downsampling:
rating_overall
bNeutral     10754
cPositive    10754
aNegative     8632
Name: count, dtype: int64


In [100]:
preprocessed_data = data_balanced['text'].tolist()
labels = data_balanced['rating_overall'].tolist()

In [101]:
# Split the data into 80% training and 20% validation
X_train, X_valid, y_train, y_valid = train_test_split(
    preprocessed_data, labels, train_size=0.8, stratify=labels, random_state=42
)

# Verify the sizes
print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_valid)} samples")

Training set: 24112 samples
Validation set: 6028 samples


Now we can import the Seattle data for testing

In [102]:
#specify CSV path
csv_path_SEA = '../Preprocessing/data_tokenized_SEA.csv'

#creating a df from the file
data_SEA = pd.read_csv(csv_path_SEA)

data_SEA.head(5)

Unnamed: 0,text,rating_overall
0,"['booked', 'hotel', 'good', 'trip', 'advisor',...",3.0
1,"['busy', 'area', 'wonderful', 'stay', 'room', ...",5.0
2,"['stayed', '2', 'night', 'contented', 'hotel',...",4.0
3,"['though', 'decor', 'could', 'use', 'update', ...",4.0
4,"['love', 'hotel', 'booked', 'thru', 'clipper',...",4.0


In [103]:
data_SEA['rating_overall'] = data_SEA['rating_overall'].replace(range(0, 3), 'aNegative')
data_SEA['rating_overall'] = data_SEA['rating_overall'].replace(3, 'bNeutral')
data_SEA['rating_overall'] = data_SEA['rating_overall'].replace(range(4, 6), 'cPositive')

result_SEA = data_SEA.groupby('rating_overall').size()

result_SEA

rating_overall
aNegative     308
bNeutral      470
cPositive    4358
dtype: int64

To make the test realistic, the original class distrubution will be used, meaning it will not be downsampled to adjust for the class imbalance

In [104]:
preprocessed_data_SEA = data_SEA['text'].tolist()
labels_SEA = data_SEA['rating_overall'].tolist()

In [105]:
X_test, y_test = preprocessed_data_SEA, labels_SEA

print(f"Test set: {len(X_test)} samples")

Test set: 5136 samples


In [106]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),  # Vectorization
    ('svd', TruncatedSVD(n_components=100)),  # Dimensionality reduction
    ('scaler', StandardScaler(with_mean=False)),  # Scaling
    ('mlp', MLPClassifier()),  # Classifier
])

In [107]:
# Define parameters for grid search
parameters = {
    'tfidf__max_features': [500, 1000, 5000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams and bigrams
    'mlp__hidden_layer_sizes': [(50,), (100,), (150,)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam', 'sgd'],
    'mlp__alpha': [0.0001, 0.001],  # Regularization strength
}

In [108]:

grid_search = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)



Now we can used the best found parameter settings and see how it performs on the validation set

In [109]:
print(grid_search.best_params_)

{'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50,), 'mlp__solver': 'sgd', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}


In [110]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)


print(classification_report(y_valid, y_pred_val, target_names=target_names))

              precision    recall  f1-score   support

    Negative       0.75      0.74      0.75      1726
     Netural       0.64      0.65      0.64      2151
    Positive       0.79      0.78      0.78      2151

    accuracy                           0.72      6028
   macro avg       0.73      0.72      0.73      6028
weighted avg       0.72      0.72      0.72      6028



And finally, we can test the model on the imbalanced Seattle test data

In [111]:
# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

# Test accuracy
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_accuracy)

print(classification_report(y_test, y_pred_test, target_names=target_names))

Test Accuracy: 0.7782320872274143
              precision    recall  f1-score   support

    Negative       0.55      0.69      0.61       308
     Netural       0.26      0.63      0.37       470
    Positive       0.97      0.80      0.88      4358

    accuracy                           0.78      5136
   macro avg       0.59      0.71      0.62      5136
weighted avg       0.88      0.78      0.81      5136

