In [22]:
import pandas as pd

file_path = "Dataset-SA.csv"


df_sa = pd.read_csv(file_path)

print("Columns in Dataset-SA.csv:")
print(list(df_sa.columns))


print("\nFirst 5 rows of the dataset:")
print(df_sa.head())

Columns in Dataset-SA.csv:
['product_name', 'product_price', 'Rate', 'Review', 'Summary', 'Sentiment']

First 5 rows of the dataset:
                                        product_name product_price Rate  \
0  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    5   
1  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    5   
2  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    3   
3  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    1   
4  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    3   

            Review                                            Summary  \
0           super!  great cooler excellent air flow and for this p...   
1          awesome              best budget 2 fit cooler nice cooling   
2             fair  the quality is good but the power of air is de...   
3  useless product                  very bad product its a only a fan   
4             fair                                 

In [23]:


file_path = "Reviews.csv"


df_re = pd.read_csv(file_path)

print("Columns in review.csv:")
print(list(df_re.columns))


print("\nFirst 5 rows of the dataset:")
print(df_re.head())

Columns in review.csv:
['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']

First 5 rows of the dataset:
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                      

In [24]:
# Ensure df_sa is loaded (it should be from the previous step)
if 'df_sa' in locals():
    # Check if the 'Sentiment' column exists
    if 'Sentiment' in df_sa.columns:
        unique_sentiments = df_sa['Sentiment'].unique()
        print("Unique values found in the 'Sentiment' column:")
        print(unique_sentiments)

        # Optional: Print the count of each unique value
        print("\nCounts for each unique sentiment:")
        print(df_sa['Sentiment'].value_counts())
    else:
        print("Error: 'Sentiment' column not found in the DataFrame.")
else:
    print("Error: DataFrame 'df_sa' not found. Please load the dataset first.")


Unique values found in the 'Sentiment' column:
['positive' 'negative' 'neutral']

Counts for each unique sentiment:
Sentiment
positive    166581
negative     28232
neutral      10239
Name: count, dtype: int64


# **First we balance the data, split it then run VADER as baseline, use the same dataset for consistency**


In [25]:
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

df_full = df_sa.copy()
df_full.dropna(subset=['Summary', 'Sentiment'], inplace=True)
df_full['original_summary'] = df_full['Summary']
def clean_summary(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df_full['clean_summary'] = df_full['Summary'].apply(clean_summary)
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df_full['sentiment_label'] = df_full['Sentiment'].map(sentiment_map)
df_full.dropna(subset=['sentiment_label'], inplace=True)
df_full['sentiment_label'] = df_full['sentiment_label'].astype(int)
counts = df_full['sentiment_label'].value_counts()
minority_count = counts.min()
balanced_indices = []
for label in df_full['sentiment_label'].unique():
    label_indices = df_full[df_full['sentiment_label'] == label].index
    sampled_indices = np.random.choice(label_indices, size=minority_count, replace=False)
    balanced_indices.extend(sampled_indices)
np.random.shuffle(balanced_indices)
df_balanced = df_full.loc[balanced_indices]

df_train, df_temp, y_train, y_temp = train_test_split(df_balanced, df_balanced['sentiment_label'], test_size=0.4,random_state=42, stratify=df_balanced['sentiment_label'])
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

df_test_vader = df_test[['original_summary', 'sentiment_label']].copy()

nltk.download('vader_lexicon', quiet=True)
analyzer = SentimentIntensityAnalyzer()
def get_vader_3class_label(text):
    scores = analyzer.polarity_scores(str(text))
    compound_score = scores['compound']
    if compound_score <= -0.25:
        return 0
    elif compound_score >= 0.25:
        return 2
    else:
        return 1
df_test_vader['vader_prediction'] = df_test_vader['original_summary'].apply(get_vader_3class_label)
accuracy_vader = accuracy_score(df_test_vader['sentiment_label'], df_test_vader['vader_prediction'])
print("VADER Test Set Accuracy: {:.4f}".format(accuracy_vader))
print(classification_report(df_test_vader['sentiment_label'], df_test_vader['vader_prediction'], target_names=['negative', 'neutral', 'positive']))


VADER Test Set Accuracy: 0.7541
              precision    recall  f1-score   support

    negative       0.95      0.73      0.83      2047
     neutral       0.66      0.61      0.63      2047
    positive       0.71      0.92      0.80      2047

    accuracy                           0.75      6141
   macro avg       0.77      0.75      0.75      6141
weighted avg       0.77      0.75      0.75      6141



# **Next we run a basic lineaer regression, see how it works, and check which featurre is most important(because im worreid that as nic said we can just rating so this project is useless**


In [26]:
# Data manipulation
import pandas as pd
import numpy as np

# Scikit-learn components
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df_train_lr = df_train[['Rate', 'Review', 'Summary']].reset_index(drop=True)
df_test_lr = df_test[['Rate', 'Review', 'Summary']].reset_index(drop=True)
df_train_lr['Review'] = df_train_lr['Review'].fillna('')
df_train_lr['Summary'] = df_train_lr['Summary'].fillna('')
df_test_lr['Review'] = df_test_lr['Review'].fillna('')
df_test_lr['Summary'] = df_test_lr['Summary'].fillna('')

# Convert the "Rate" column to numeric, coercing errors to NaN then filling with 0
# df_train_lr['Rate'] = pd.to_numeric(df_train_lr['Rate'], errors='coerce').fillna(0)
# df_test_lr['Rate'] = pd.to_numeric(df_test_lr['Rate'], errors='coerce').fillna(0)


df_train_lr['Rate'] = pd.to_numeric(df_train_lr['Rate'], errors='coerce').fillna(0)
df_val_lr['Rate'] = pd.to_numeric(df_val_lr['Rate'], errors='coerce').fillna(0)
df_test_lr['Rate'] = pd.to_numeric(df_test_lr['Rate'], errors='coerce').fillna(0)

numeric_transformer = Pipeline([('scaler', StandardScaler())])
text_transformer_review = Pipeline([('tfidf', TfidfVectorizer())])
text_transformer_summary = Pipeline([('tfidf', TfidfVectorizer())])
preprocessor = ColumnTransformer(transformers=[
    ('rate', numeric_transformer, ['Rate']),
    ('review', text_transformer_review, 'Review'),
    ('summary', text_transformer_summary, 'Summary')
])
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
clf_pipeline.fit(df_train_lr, y_train)
y_pred_lr = clf_pipeline.predict(df_test_lr)
print("Logistic Regression Test Set Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_lr)))
print(classification_report(y_test, y_pred_lr, target_names=['negative', 'neutral', 'positive']))

coef = clf_pipeline.named_steps['classifier'].coef_
n_rate = 1
review_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['tfidf']
n_review = len(review_vectorizer.get_feature_names_out())
summary_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[2][1].named_steps['tfidf']
n_summary = len(summary_vectorizer.get_feature_names_out())
rate_coef = coef[:, :n_rate]
review_coef = coef[:, n_rate:n_rate+n_review]
summary_coef = coef[:, n_rate+n_review:n_rate+n_review+n_summary]
for i, class_name in enumerate(['negative', 'neutral', 'positive']):
    print("\nClass: '{}' ({})".format(class_name, i))
    print("  Rate (Scaled) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(rate_coef[i]))))
    print("  Review (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(review_coef[i]))))
    print("  Summary (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(summary_coef[i]))))

Logistic Regression Test Set Accuracy: 0.8346
              precision    recall  f1-score   support

    negative       0.84      0.87      0.85      2047
     neutral       0.79      0.74      0.76      2047
    positive       0.87      0.90      0.88      2047

    accuracy                           0.83      6141
   macro avg       0.83      0.83      0.83      6141
weighted avg       0.83      0.83      0.83      6141


Class: 'negative' (0)
  Rate (Scaled) Avg Abs Coef: 1.2418
  Review (TF-IDF) Avg Abs Coef: 0.1481
  Summary (TF-IDF) Avg Abs Coef: 0.1230

Class: 'neutral' (1)
  Rate (Scaled) Avg Abs Coef: 0.0980
  Review (TF-IDF) Avg Abs Coef: 0.1495
  Summary (TF-IDF) Avg Abs Coef: 0.1513

Class: 'positive' (2)
  Rate (Scaled) Avg Abs Coef: 1.1438
  Review (TF-IDF) Avg Abs Coef: 0.1443
  Summary (TF-IDF) Avg Abs Coef: 0.1057


In [28]:
# df_train_lr = df_train[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# df_test_lr = df_test[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# df_train_lr['Review'] = df_train_lr['Review'].fillna('')
# df_train_lr['Summary'] = df_train_lr['Summary'].fillna('')
# df_test_lr['Review'] = df_test_lr['Review'].fillna('')
# df_test_lr['Summary'] = df_test_lr['Summary'].fillna('')

# numeric_transformer = Pipeline([('scaler', StandardScaler())])
# text_transformer_review = Pipeline([('tfidf', TfidfVectorizer())])
# text_transformer_summary = Pipeline([('tfidf', TfidfVectorizer())])
# preprocessor = ColumnTransformer(transformers=[
#     ('rate', numeric_transformer, ['Rate']),
#     ('review', text_transformer_review, 'Review'),
#     ('summary', text_transformer_summary, 'Summary')
# ])
# clf_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])
# clf_pipeline.fit(df_train_lr, y_train)
# y_pred_lr = clf_pipeline.predict(df_test_lr)
# print("Logistic Regression Test Set Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_lr)))
# print(classification_report(y_test, y_pred_lr, target_names=['negative', 'neutral', 'positive']))

# coef = clf_pipeline.named_steps['classifier'].coef_
# n_rate = 1
# review_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['tfidf']
# n_review = len(review_vectorizer.get_feature_names_out())
# summary_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[2][1].named_steps['tfidf']
# n_summary = len(summary_vectorizer.get_feature_names_out())
# rate_coef = coef[:, :n_rate]
# review_coef = coef[:, n_rate:n_rate+n_review]
# summary_coef = coef[:, n_rate+n_review:n_rate+n_review+n_summary]
# for i, class_name in enumerate(['negative', 'neutral', 'positive']):
#     print("\nClass: '{}' ({})".format(class_name, i))
#     print("  Rate (Scaled) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(rate_coef[i]))))
#     print("  Review (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(review_coef[i]))))
#     print("  Summary (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(summary_coef[i]))))


In [None]:
# #whoops forgor to define val previously

# df_train_lr = df_train[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# df_val_lr = df_val[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# df_test_lr = df_test[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# df_train_lr['Review'] = df_train_lr['Review'].fillna('')
# df_train_lr['Summary'] = df_train_lr['Summary'].fillna('')
# df_val_lr['Review'] = df_val_lr['Review'].fillna('')
# df_val_lr['Summary'] = df_val_lr['Summary'].fillna('')
# df_test_lr['Review'] = df_test_lr['Review'].fillna('')
# df_test_lr['Summary'] = df_test_lr['Summary'].fillna('')

# best_acc = 0
# best_C = None
# for c in [0.1, 1, 10]:
#     clf_pipeline.set_params(classifier__C=c)
#     clf_pipeline.fit(df_train_lr, y_train)
#     val_pred = clf_pipeline.predict(df_val_lr)
#     val_acc = accuracy_score(y_val, val_pred)
#     print("C: {}, Validation Accuracy: {:.4f}".format(c, val_acc))
#     if val_acc > best_acc:
#         best_acc = val_acc
#         best_C = c

# print("Best C based on validation: {}".format(best_C))

# clf_pipeline.set_params(classifier__C=best_C)
# clf_pipeline.fit(df_train_lr, y_train)
# train_pred = clf_pipeline.predict(df_train_lr)
# train_acc = accuracy_score(y_train, train_pred)
# print("Training Accuracy with best C: {:.4f}".format(train_acc))
# test_pred = clf_pipeline.predict(df_test_lr)
# test_acc = accuracy_score(y_test, test_pred)
# print("Test Accuracy with best C: {:.4f}".format(test_acc))
# print("Test Classification Report:")
# print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


In [30]:
from sklearn.metrics import f1_score

best_acc = 0
best_C = None
for c in [0.1, 1, 10]:
    clf_pipeline.set_params(classifier__C=c)
    clf_pipeline.fit(df_train_lr, y_train)
    val_pred = clf_pipeline.predict(df_val_lr)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred, average='weighted')
    print("C: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}".format(c, val_acc, val_f1))
    if val_acc > best_acc:
        best_acc = val_acc
        best_C = c

print("Best C based on validation: {}".format(best_C))

clf_pipeline.set_params(classifier__C=best_C)
clf_pipeline.fit(df_train_lr, y_train)
train_pred = clf_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy with best C: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = clf_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy with best C: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


C: 0.1, Validation Accuracy: 0.8178, Validation F1: 0.8148
C: 1, Validation Accuracy: 0.8529, Validation F1: 0.8523
C: 10, Validation Accuracy: 0.8647, Validation F1: 0.8649
Best C based on validation: 10
Training Accuracy with best C: 0.9304, Training F1: 0.9301
Test Accuracy with best C: 0.8271, Test F1: 0.8266
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85      2047
     neutral       0.77      0.75      0.76      2047
    positive       0.87      0.88      0.87      2047

    accuracy                           0.83      6141
   macro avg       0.83      0.83      0.83      6141
weighted avg       0.83      0.83      0.83      6141



I think the differences is too low to justify an action, lets skip ahead to the second model

# **testing for SVM**

In [31]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

best_val_acc = 0
best_params = {}
for kernel in ['linear', 'rbf']:
    for C in [0.1, 1, 10, 100]:
        svm_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', SVC(C=C, kernel=kernel))
        ])
        svm_pipeline.fit(df_train_lr, y_train)
        val_pred = svm_pipeline.predict(df_val_lr)
        val_acc = accuracy_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred, average='weighted')
        print("Kernel: {}, C: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}"
              .format(kernel, C, val_acc, val_f1))
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_params = {'kernel': kernel, 'C': C}

print("Best parameters based on validation: {}".format(best_params))

svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(C=best_params['C'], kernel=best_params['kernel']))
])
svm_pipeline.fit(df_train_lr, y_train)

train_pred = svm_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = svm_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


Kernel: linear, C: 0.1, Validation Accuracy: 0.8129, Validation F1: 0.8099
Kernel: linear, C: 1, Validation Accuracy: 0.8596, Validation F1: 0.8596
Kernel: linear, C: 10, Validation Accuracy: 0.8630, Validation F1: 0.8634
Kernel: linear, C: 100, Validation Accuracy: 0.8533, Validation F1: 0.8542
Kernel: rbf, C: 0.1, Validation Accuracy: 0.7839, Validation F1: 0.7768
Kernel: rbf, C: 1, Validation Accuracy: 0.8586, Validation F1: 0.8585
Kernel: rbf, C: 10, Validation Accuracy: 0.8914, Validation F1: 0.8920
Kernel: rbf, C: 100, Validation Accuracy: 0.8852, Validation F1: 0.8860
Best parameters based on validation: {'kernel': 'rbf', 'C': 10}
Training Accuracy: 0.9845, Training F1: 0.9845
Test Accuracy: 0.8476, Test F1: 0.8474
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.87      0.87      2047
     neutral       0.79      0.78      0.79      2047
    positive       0.89      0.89      0.89      2047

    accuracy          

Usually SVM works best for this kind of stuff, might overfit a bit though

# **testing for random forest**

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

best_val_acc = 0
best_params = {}
for n_estimators in [100, 200]:
    for max_depth in [None, 10, 20]:
        rf_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
        ])
        rf_pipeline.fit(df_train_lr, y_train)
        val_pred = rf_pipeline.predict(df_val_lr)
        val_acc = accuracy_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred, average='weighted')
        print("n_estimators: {}, max_depth: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}".format(n_estimators, max_depth, val_acc, val_f1))
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_params = {"n_estimators": n_estimators, "max_depth": max_depth}

print("Best parameters: ", best_params)

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], random_state=42))
])
rf_pipeline.fit(df_train_lr, y_train)

train_pred = rf_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = rf_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


n_estimators: 100, max_depth: None, Validation Accuracy: 0.8995, Validation F1: 0.8998
n_estimators: 100, max_depth: 10, Validation Accuracy: 0.7912, Validation F1: 0.7835
n_estimators: 100, max_depth: 20, Validation Accuracy: 0.8171, Validation F1: 0.8118
n_estimators: 200, max_depth: None, Validation Accuracy: 0.9002, Validation F1: 0.9005
n_estimators: 200, max_depth: 10, Validation Accuracy: 0.7958, Validation F1: 0.7877
n_estimators: 200, max_depth: 20, Validation Accuracy: 0.8143, Validation F1: 0.8090
Best parameters:  {'n_estimators': 200, 'max_depth': None}
Training Accuracy: 0.9884, Training F1: 0.9884
Test Accuracy: 0.8378, Test F1: 0.8367
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.87      0.86      2047
     neutral       0.79      0.74      0.77      2047
    positive       0.86      0.90      0.88      2047

    accuracy                           0.84      6141
   macro avg       0.84      0.84      0.

Too unstable, we aint using this

# **testing for Xgboost**

In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

best_val_acc = 0
best_params = {}
for n_estimators in [100, 200]:
    for learning_rate in [0.1, 0.05]:
        for max_depth in [3, 5]:
            gb_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', XGBClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    use_label_encoder=False,
                    eval_metric='mlogloss',
                    random_state=42))
            ])
            gb_pipeline.fit(df_train_lr, y_train)
            val_pred = gb_pipeline.predict(df_val_lr)
            val_acc = accuracy_score(y_val, val_pred)
            val_f1 = f1_score(y_val, val_pred, average='weighted')
            print("n_estimators: {}, learning_rate: {}, max_depth: {} - Validation Accuracy: {:.4f}, Validation F1: {:.4f}"
                  .format(n_estimators, learning_rate, max_depth, val_acc, val_f1))
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_params = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}

print("Best parameters based on validation:", best_params)

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42))
])
gb_pipeline.fit(df_train_lr, y_train)

train_pred = gb_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = gb_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 100, learning_rate: 0.1, max_depth: 3 - Validation Accuracy: 0.8192, Validation F1: 0.8153


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 100, learning_rate: 0.1, max_depth: 5 - Validation Accuracy: 0.8363, Validation F1: 0.8342


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 100, learning_rate: 0.05, max_depth: 3 - Validation Accuracy: 0.7976, Validation F1: 0.7906


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 100, learning_rate: 0.05, max_depth: 5 - Validation Accuracy: 0.8231, Validation F1: 0.8194


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 200, learning_rate: 0.1, max_depth: 3 - Validation Accuracy: 0.8347, Validation F1: 0.8321


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 200, learning_rate: 0.1, max_depth: 5 - Validation Accuracy: 0.8503, Validation F1: 0.8491


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 200, learning_rate: 0.05, max_depth: 3 - Validation Accuracy: 0.8187, Validation F1: 0.8148


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


n_estimators: 200, learning_rate: 0.05, max_depth: 5 - Validation Accuracy: 0.8336, Validation F1: 0.8313
Best parameters based on validation: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 5}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Accuracy: 0.8729, Training F1: 0.8716
Test Accuracy: 0.8404, Test F1: 0.8390
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      2047
     neutral       0.79      0.73      0.76      2047
    positive       0.87      0.91      0.89      2047

    accuracy                           0.84      6141
   macro avg       0.84      0.84      0.84      6141
weighted avg       0.84      0.84      0.84      6141



Huh this works really well and no sign of overfitting, probably best model

for these smaller and simpler problems, there is no need for deep learning hence why we dont try any of those

In [None]:
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42))
])
final_model.fit(df_train_lr, y_train)

final_train_pred = final_model.predict(df_train_lr)
final_train_acc = accuracy_score(y_train, final_train_pred)
final_test_pred = final_model.predict(df_test_lr)
final_test_acc = accuracy_score(y_test, final_test_pred)

print("Final Model - Training Accuracy: {:.4f}".format(final_train_acc))
print("Final Model - Test Accuracy: {:.4f}".format(final_test_acc))
print("Final Model - Test Classification Report:")
print(classification_report(y_test, final_test_pred, target_names=['negative', 'neutral', 'positive']))


In [None]:
#how to call

new_data = pd.DataFrame({
    "Rate": [4.0],
    "Review": ["The product exceeded my expectations and works flawlessly."],
    "Summary": ["Excellent product with superb performance."]
})

predicted_sentiment = final_model.predict(new_data)
print("Predicted sentiment:", predicted_sentiment)