In [29]:
#Import Dependencies
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


### Importing the data

First, lets import the data for training and validation

In [12]:
#specify CSV path
csv_path = '../Preprocessing/data_tokenized_BoW.csv'

#creating a df from the file
data = pd.read_csv(csv_path)

data.head(5)

Unnamed: 0,text,rating_overall
0,"['room', 'fine', 'service', 'try', 'hard', 'al...",3.0
1,"['best', 'place', 'stay', 'nyc', 'want', 'go',...",5.0
2,"['great', 'place', 'ill', 'always', 'check', '...",5.0
3,"['hotel', 'biggest', 'room', 'manhattan', 'lig...",5.0
4,"['want', 'stay', 'upper', 'west', 'side', 'nic...",4.0


In [13]:
data['rating_overall'] = data['rating_overall'].replace(range(0, 3), 'Negative')
data['rating_overall'] = data['rating_overall'].replace(3, 'Neutral')
data['rating_overall'] = data['rating_overall'].replace(range(4, 6), 'Positive')

result = data.groupby('rating_overall').size()

result

rating_overall
Negative     3263
Neutral      3982
Positive    38291
dtype: int64

In [14]:
(
    
    X_train,
    X_rem,
    y_train,
    y_rem

) = train_test_split(data["text"], data["rating_overall"], train_size=0.9, random_state=42)

(X_valid, X_test, y_valid, y_test) = train_test_split(X_rem,y_rem, test_size=0.5)

In [15]:
# balancing the training set

from sklearn.utils import resample

# Combine X_train and y_train into a single DataFrame for resampling
train_data = pd.DataFrame({'text': X_train, 'rating_overall': y_train})

# Separate the classes in the training set using the correct labels
positive_data = train_data[train_data['rating_overall'] == 'Positive']
neutral_data = train_data[train_data['rating_overall'] == 'Neutral']
negative_data = train_data[train_data['rating_overall'] == 'Negative']

# Downsample the majority class (Positive) to match the size of the minority class (Neutral or Negative, whichever is larger)
minority_class_size = max(len(neutral_data), len(negative_data))  # Use max to find the larger minority class

positive_data_downsampled = resample(
    positive_data,
    replace=False,  # Do not sample with replacement
    n_samples=minority_class_size,  # Match the size of the larger minority class
    random_state=42  # For reproducibility
)

# Combine the downsampled majority class with the other classes
train_data_balanced = pd.concat([positive_data_downsampled, neutral_data, negative_data])

# Shuffle the balanced training data
train_data_balanced = train_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the balanced training data back into X_train and y_train
X_train = train_data_balanced['text']
y_train = train_data_balanced['rating_overall']

# Verify the class distribution in the balanced training set
print("Class distribution in the balanced training set:")
print(y_train.value_counts())

Class distribution in the balanced training set:
rating_overall
Positive    3560
Neutral     3560
Negative    2933
Name: count, dtype: int64


In [16]:
print(f"Train set: {len(X_train)} samples")
print(f"Valid set: {len(X_valid)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 10053 samples
Valid set: 2277 samples
Test set: 2277 samples


In [17]:
# TFIDF vectorization
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train)

X_train_vectorized.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], shape=(10053, 10616))

### MLP model

In [18]:
# Define pipeline
MLP_pipeline = Pipeline([ 
    ('svd', TruncatedSVD(n_components=100)),  # Dimensionality reduction
    ('scaler', StandardScaler(with_mean=False)),  # Scaling
    ('mlp', MLPClassifier()),  # Classifier
])

In [20]:
# Define parameters for grid search
MLP_parameters = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (150,)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam', 'sgd'],
    'mlp__alpha': [0.0001, 0.001,0.01],  # Regularization strength
}

In [21]:

MLP_grid_search = GridSearchCV(MLP_pipeline, MLP_parameters, scoring='f1_weighted', cv=5, n_jobs=-1)

# Fit the grid search to the training data
MLP_grid_search.fit(X_train_vectorized, y_train)



Now we can used the best found parameter settings and see how it performs on the validation set

In [22]:
X_val_vectorized = tfidf.transform(X_valid)

best_MLP_model = MLP_grid_search.best_estimator_
MLP_y_pred = best_MLP_model.predict(X_val_vectorized)

# Print best parameters and score
print("Best parameters: ", MLP_grid_search.best_params_)
print("Best accuracy score: ", MLP_grid_search.best_score_)

Best parameters:  {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (150,), 'mlp__solver': 'sgd'}
Best accuracy score:  0.7147099911968239


In [23]:
print("Performance on the training set:")
print(classification_report(y_train, best_MLP_model.predict(X_train_vectorized)))

print("Performance on the validation set:")
print(classification_report(y_valid, MLP_y_pred))

Performance on the training set:
              precision    recall  f1-score   support

    Negative       0.79      0.79      0.79      2933
     Neutral       0.73      0.71      0.72      3560
    Positive       0.83      0.85      0.84      3560

    accuracy                           0.79     10053
   macro avg       0.78      0.79      0.79     10053
weighted avg       0.78      0.79      0.78     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.57      0.71      0.63       171
     Neutral       0.25      0.58      0.35       216
    Positive       0.96      0.79      0.87      1890

    accuracy                           0.77      2277
   macro avg       0.59      0.69      0.62      2277
weighted avg       0.86      0.77      0.80      2277



And finally, we can test the model on the imbalanced Seattle test data

In [25]:
test_df_vectorized = tfidf.transform(X_test)

print("Performance on the test set:")
print(classification_report(y_test, best_MLP_model.predict(test_df_vectorized)))

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.54      0.74      0.62       159
     Neutral       0.25      0.66      0.36       206
    Positive       0.97      0.77      0.86      1912

    accuracy                           0.76      2277
   macro avg       0.59      0.72      0.62      2277
weighted avg       0.88      0.76      0.80      2277



### Logistical Regression

In [30]:
# Define pipeline
LR_pipeline = Pipeline([
    ('logreg', LogisticRegression()),
])


# Define parameters for grid search
LR_parameters = {
    'logreg__C': [0.1, 1.0, 10.0],
    'logreg__max_iter': [10000],
    'logreg__solver': ['saga'],
}


# Create grid search object
LR_grid_search = GridSearchCV(LR_pipeline, LR_parameters, scoring='f1_weighted', cv=5, n_jobs=-1)


# Fit the grid search to the training data
#grid_search.fit(X_train, y_train)
LR_grid_search.fit(X_train_vectorized, y_train)

In [32]:
best_LR_model = LR_grid_search.best_estimator_
LR_y_pred = best_LR_model.predict(X_val_vectorized)

# Print best parameters and score
print("Best parameters: ", LR_grid_search.best_params_)
print("Best accuracy score: ", LR_grid_search.best_score_)

Best parameters:  {'logreg__C': 1.0, 'logreg__max_iter': 10000, 'logreg__solver': 'saga'}
Best accuracy score:  0.7269221958543884


In [33]:
print("Performance on the training set:")
print(classification_report(y_train, best_LR_model.predict(X_train_vectorized)))

print("Performance on the validation set:")
print(classification_report(y_valid, LR_y_pred))

Performance on the training set:
              precision    recall  f1-score   support

    Negative       0.88      0.87      0.87      2933
     Neutral       0.81      0.81      0.81      3560
    Positive       0.87      0.89      0.88      3560

    accuracy                           0.85     10053
   macro avg       0.85      0.85      0.85     10053
weighted avg       0.85      0.85      0.85     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.59      0.70      0.64       171
     Neutral       0.27      0.62      0.38       216
    Positive       0.96      0.81      0.88      1890

    accuracy                           0.78      2277
   macro avg       0.61      0.71      0.63      2277
weighted avg       0.87      0.78      0.81      2277



In [34]:
print("Performance on the test set:")
print(classification_report(y_test, best_LR_model.predict(test_df_vectorized)))

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.57      0.74      0.64       159
     Neutral       0.26      0.64      0.37       206
    Positive       0.97      0.80      0.88      1912

    accuracy                           0.78      2277
   macro avg       0.60      0.73      0.63      2277
weighted avg       0.88      0.78      0.81      2277



## Random Forest

In [35]:
# Define pipeline
RF_pipeline = Pipeline([
    ('rf', RandomForestClassifier()),
])


# Define parameters for grid search
RF_parameters = {
   'rf__criterion': ['gini', 'entropy', 'log_loss'],
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [60,70], #[10, 20, 30],
    'rf__min_samples_leaf':[5],
    'rf__class_weight':['balanced'],
}


# Create grid search object
RF_grid_search = GridSearchCV(RF_pipeline, RF_parameters, scoring='f1_weighted', cv=5, n_jobs=-1)


# Fit the grid search to the training data
#grid_search.fit(X_train, y_train)
RF_grid_search.fit(X_train_vectorized, y_train)

In [36]:
best_RF_model = RF_grid_search.best_estimator_
RF_y_pred = best_RF_model.predict(X_val_vectorized)

# Print best parameters and score
print("Best parameters: ", RF_grid_search.best_params_)
print("Best accuracy score: ", RF_grid_search.best_score_)

Best parameters:  {'rf__class_weight': 'balanced', 'rf__criterion': 'gini', 'rf__max_depth': 70, 'rf__min_samples_leaf': 5, 'rf__n_estimators': 300}
Best accuracy score:  0.6880953800643155


In [37]:
print("Performance on the training set:")
print(classification_report(y_train, best_RF_model.predict(X_train_vectorized)))

print("Performance on the validation set:")
print(classification_report(y_valid, RF_y_pred))

Performance on the training set:
              precision    recall  f1-score   support

    Negative       0.74      0.84      0.79      2933
     Neutral       0.76      0.59      0.66      3560
    Positive       0.77      0.86      0.81      3560

    accuracy                           0.76     10053
   macro avg       0.76      0.76      0.75     10053
weighted avg       0.76      0.76      0.75     10053

Performance on the validation set:
              precision    recall  f1-score   support

    Negative       0.50      0.74      0.60       171
     Neutral       0.28      0.47      0.35       216
    Positive       0.95      0.84      0.89      1890

    accuracy                           0.79      2277
   macro avg       0.58      0.68      0.61      2277
weighted avg       0.85      0.79      0.82      2277



In [38]:
print("Performance on the test set:")
print(classification_report(y_test, best_RF_model.predict(test_df_vectorized)))

Performance on the test set:
              precision    recall  f1-score   support

    Negative       0.51      0.78      0.62       159
     Neutral       0.28      0.53      0.37       206
    Positive       0.96      0.83      0.89      1912

    accuracy                           0.80      2277
   macro avg       0.58      0.71      0.62      2277
weighted avg       0.87      0.80      0.82      2277

