In [50]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.svm import SVC
import joblib

In [None]:
# Load the labelled review dataset (it carries a 'Sentiment' column) that the
# classifiers below are trained and evaluated on.
df_sa = pd.read_csv("data/Dataset-SA.csv")
print(list(df_sa.columns))
df_sa.head(2)

['product_name', 'product_price', 'Rate', 'Review', 'Summary', 'Sentiment']


Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive


In [None]:
# Load the second review dataset. It has a numeric 'Score' plus 'Summary'/'Text'
# but no sentiment label; the trained model is applied to it later on.
df_re = pd.read_csv("data/Reviews.csv")
print(list(df_re.columns))
df_re.head(2)

['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [19]:
# Sanity-check the label column of df_sa before modelling.
if 'df_sa' not in locals():
    print("Error: DataFrame 'df_sa' not found. Please load the dataset first.")
elif 'Sentiment' not in df_sa.columns:
    print("Error: 'Sentiment' column not found in the DataFrame.")
else:
    # Distinct label values present in the data
    unique_sentiments = df_sa['Sentiment'].unique()
    print("Unique values found in the 'Sentiment' column:")
    print(unique_sentiments)

    # Number of rows carrying each label
    print("\nCounts for each unique sentiment:")
    print(df_sa['Sentiment'].value_counts())

Unique values found in the 'Sentiment' column:
['positive' 'negative' 'neutral']

Counts for each unique sentiment:
Sentiment
positive    166581
negative     28232
neutral      10239
Name: count, dtype: int64


In [20]:
# **Balance the data first, then split it and run VADER as a baseline, so every model uses the same dataset**
# Independent copy of df_sa restricted to rows that have both a summary and a label.
df_full = df_sa.dropna(subset=['Summary', 'Sentiment']).copy()
# Keep the untouched summary text alongside the cleaned version created below.
df_full['original_summary'] = df_full['Summary']
def clean_summary(text):
    """Normalise a summary to lowercase ASCII letters separated by single spaces.

    The input is stringified, lowercased, stripped of every character that is
    not a-z or whitespace, and then has whitespace runs collapsed and trimmed.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
# Cleaned version of the summary text.
df_full['clean_summary'] = df_full['Summary'].apply(clean_summary)

# Encode the three sentiment strings as integer class ids. Values missing from
# the map become NaN and are dropped before the cast to int.
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df_full['sentiment_label'] = df_full['Sentiment'].map(sentiment_map)
df_full.dropna(subset=['sentiment_label'], inplace=True)
df_full['sentiment_label'] = df_full['sentiment_label'].astype(int)

# Undersample every class down to the size of the rarest one.
counts = df_full['sentiment_label'].value_counts()
minority_count = counts.min()

# Seeded generator: without it the balanced sample (and therefore every
# train/val/test split and reported metric below) changes on each run.
balance_rng = np.random.default_rng(42)
balanced_indices = []
for label in df_full['sentiment_label'].unique():
    label_indices = df_full.index[df_full['sentiment_label'] == label].to_numpy()
    sampled_indices = balance_rng.choice(label_indices, size=minority_count, replace=False)
    balanced_indices.extend(sampled_indices)
# Shuffle so the rows are not grouped by class.
balance_rng.shuffle(balanced_indices)
df_balanced = df_full.loc[balanced_indices]
df_balanced.describe()

Unnamed: 0,sentiment_label
count,30702.0
mean,1.0
std,0.81651
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0


In [21]:
# Stratified 60/20/20 split of the balanced data: first hold out 40%, then cut
# that hold-out in half to get the validation and test sets.
df_train, df_temp, y_train, y_temp = train_test_split(df_balanced, df_balanced['sentiment_label'], test_size=0.4,random_state=42, stratify=df_balanced['sentiment_label'])
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# The VADER baseline is scored on the uncleaned summary text of the test split.
df_test_vader = df_test[['original_summary', 'sentiment_label']].copy()

# Download the VADER lexicon and create the analyzer used by get_vader_3class_label.
nltk.download('vader_lexicon', quiet=True)
analyzer = SentimentIntensityAnalyzer()
def get_vader_3class_label(text):
    """Turn VADER's compound score for `text` into a 3-class sentiment id.

    Returns 2 when the compound score is >= 0.25, 0 when it is <= -0.25,
    and 1 for everything in between (ids follow `sentiment_map`:
    0 = negative, 1 = neutral, 2 = positive).
    """
    compound = analyzer.polarity_scores(str(text))['compound']
    if compound >= 0.25:
        return 2
    if compound <= -0.25:
        return 0
    return 1
# Score every test summary with VADER and compare against the true labels.
df_test_vader['vader_prediction'] = df_test_vader['original_summary'].apply(get_vader_3class_label)
accuracy_vader = accuracy_score(df_test_vader['sentiment_label'], df_test_vader['vader_prediction'])
print("VADER Test Set Accuracy: {:.4f}".format(accuracy_vader))
print(classification_report(df_test_vader['sentiment_label'], df_test_vader['vader_prediction'], target_names=['negative', 'neutral', 'positive']))

VADER Test Set Accuracy: 0.7377
              precision    recall  f1-score   support

    negative       0.94      0.73      0.82      2047
     neutral       0.64      0.57      0.60      2047
    positive       0.69      0.91      0.78      2047

    accuracy                           0.74      6141
   macro avg       0.75      0.74      0.74      6141
weighted avg       0.75      0.74      0.74      6141



In [23]:
# **Next we fit a basic logistic regression, see how it works, and check which feature is most important (the worry being that, as Nic said, the rating alone may be enough, which would make this project pointless)**
# Keep only the three model inputs for each split.
df_train_lr = df_train[['Rate', 'Review', 'Summary']].reset_index(drop=True)
df_val_lr = df_val[['Rate', 'Review', 'Summary']].reset_index(drop=True)
df_test_lr = df_test[['Rate', 'Review', 'Summary']].reset_index(drop=True)
# TfidfVectorizer cannot handle NaN, so missing text becomes an empty string.
df_train_lr['Review'] = df_train_lr['Review'].fillna('')
df_train_lr['Summary'] = df_train_lr['Summary'].fillna('')
df_val_lr['Review'] = df_val_lr['Review'].fillna('')
df_val_lr['Summary'] = df_val_lr['Summary'].fillna('')
df_test_lr['Review'] = df_test_lr['Review'].fillna('')
df_test_lr['Summary'] = df_test_lr['Summary'].fillna('')

# Non-numeric ratings are coerced to NaN and then replaced by 0.
df_train_lr['Rate'] = pd.to_numeric(df_train_lr['Rate'], errors='coerce').fillna(0)
df_val_lr['Rate'] = pd.to_numeric(df_val_lr['Rate'], errors='coerce').fillna(0)
df_test_lr['Rate'] = pd.to_numeric(df_test_lr['Rate'], errors='coerce').fillna(0)

# Shared feature pipeline: scaled rating plus one independent TF-IDF vocabulary
# per text column. The same `preprocessor` object is reused by the later models.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
text_transformer_review = Pipeline([('tfidf', TfidfVectorizer())])
text_transformer_summary = Pipeline([('tfidf', TfidfVectorizer())])
preprocessor = ColumnTransformer(transformers=[
    ('rate', numeric_transformer, ['Rate']),
    ('review', text_transformer_review, 'Review'),
    ('summary', text_transformer_summary, 'Summary')
])
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
clf_pipeline.fit(df_train_lr, y_train)
y_pred_lr = clf_pipeline.predict(df_test_lr)
print("Logistic Regression Test Set Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_lr)))
print(classification_report(y_test, y_pred_lr, target_names=['negative', 'neutral', 'positive']))

# Split the coefficient matrix (one row per class) back into its feature groups.
# Columns are sliced in the transformer order declared above:
# [Rate | Review TF-IDF | Summary TF-IDF].
coef = clf_pipeline.named_steps['classifier'].coef_
n_rate = 1
review_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['tfidf']
n_review = len(review_vectorizer.get_feature_names_out())
summary_vectorizer = clf_pipeline.named_steps['preprocessor'].transformers_[2][1].named_steps['tfidf']
n_summary = len(summary_vectorizer.get_feature_names_out())
rate_coef = coef[:, :n_rate]
review_coef = coef[:, n_rate:n_rate+n_review]
summary_coef = coef[:, n_rate+n_review:n_rate+n_review+n_summary]
# Report the mean absolute coefficient of each feature group, per class.
for i, class_name in enumerate(['negative', 'neutral', 'positive']):
    print("\nClass: '{}' ({})".format(class_name, i))
    print("  Rate (Scaled) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(rate_coef[i]))))
    print("  Review (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(review_coef[i]))))
    print("  Summary (TF-IDF) Avg Abs Coef: {:.4f}".format(np.mean(np.abs(summary_coef[i]))))

Logistic Regression Test Set Accuracy: 0.8272
              precision    recall  f1-score   support

    negative       0.83      0.86      0.85      2047
     neutral       0.79      0.72      0.75      2047
    positive       0.86      0.90      0.88      2047

    accuracy                           0.83      6141
   macro avg       0.83      0.83      0.83      6141
weighted avg       0.83      0.83      0.83      6141


Class: 'negative' (0)
  Rate (Scaled) Avg Abs Coef: 1.2921
  Review (TF-IDF) Avg Abs Coef: 0.1237
  Summary (TF-IDF) Avg Abs Coef: 0.1198

Class: 'neutral' (1)
  Rate (Scaled) Avg Abs Coef: 0.1544
  Review (TF-IDF) Avg Abs Coef: 0.1490
  Summary (TF-IDF) Avg Abs Coef: 0.1486

Class: 'positive' (2)
  Rate (Scaled) Avg Abs Coef: 1.1377
  Review (TF-IDF) Avg Abs Coef: 0.1240
  Summary (TF-IDF) Avg Abs Coef: 0.1024


In [27]:
# Tune the logistic-regression regularisation strength C on the validation set.
best_acc = 0
best_C = None
for c in [0.1, 1, 10]:
    # set_params mutates clf_pipeline in place; each iteration refits it from scratch.
    clf_pipeline.set_params(classifier__C=c)
    clf_pipeline.fit(df_train_lr, y_train)
    val_pred = clf_pipeline.predict(df_val_lr)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred, average='weighted')
    print("C: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}".format(c, val_acc, val_f1))
    # Strict '>' keeps the first C on ties.
    if val_acc > best_acc:
        best_acc = val_acc
        best_C = c

print("Best C based on validation: {}".format(best_C))

# Refit with the selected C and report train vs. test performance.
clf_pipeline.set_params(classifier__C=best_C)
clf_pipeline.fit(df_train_lr, y_train)
train_pred = clf_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy with best C: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = clf_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy with best C: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))

# The differences are too small to justify any action; let's move on to the second model.

C: 0.1, Validation Accuracy: 0.8111, Validation F1: 0.8075
C: 1, Validation Accuracy: 0.8318, Validation F1: 0.8300
C: 10, Validation Accuracy: 0.8254, Validation F1: 0.8242
Best C based on validation: 1
Training Accuracy with best C: 0.8833, Training F1: 0.8827
Test Accuracy with best C: 0.8272, Test F1: 0.8257
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.86      0.85      2047
     neutral       0.79      0.72      0.75      2047
    positive       0.86      0.90      0.88      2047

    accuracy                           0.83      6141
   macro avg       0.83      0.83      0.83      6141
weighted avg       0.83      0.83      0.83      6141



In [28]:
# Grid-search an SVM (kernel x C) on the validation set, reusing the shared preprocessor.
best_val_acc = 0
best_params = {}
for kernel in ['linear', 'rbf']:
    for C in [0.1, 1, 10, 100]:
        svm_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', SVC(C=C, kernel=kernel))
        ])
        svm_pipeline.fit(df_train_lr, y_train)
        val_pred = svm_pipeline.predict(df_val_lr)
        val_acc = accuracy_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred, average='weighted')
        print("Kernel: {}, C: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}"
              .format(kernel, C, val_acc, val_f1))
        # Strict '>' keeps the first configuration on ties.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_params = {'kernel': kernel, 'C': C}

print("Best parameters based on validation: {}".format(best_params))

# Refit the winning configuration and compare train vs. test performance.
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(C=best_params['C'], kernel=best_params['kernel']))
])
svm_pipeline.fit(df_train_lr, y_train)

train_pred = svm_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = svm_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))


# SVMs usually work well for this kind of task, though this one may overfit a bit.

Kernel: linear, C: 0.1, Validation Accuracy: 0.8090, Validation F1: 0.8053
Kernel: linear, C: 1, Validation Accuracy: 0.8401, Validation F1: 0.8389
Kernel: linear, C: 10, Validation Accuracy: 0.8274, Validation F1: 0.8263
Kernel: linear, C: 100, Validation Accuracy: 0.8059, Validation F1: 0.8056
Kernel: rbf, C: 0.1, Validation Accuracy: 0.7862, Validation F1: 0.7797
Kernel: rbf, C: 1, Validation Accuracy: 0.8373, Validation F1: 0.8359
Kernel: rbf, C: 10, Validation Accuracy: 0.8435, Validation F1: 0.8428
Kernel: rbf, C: 100, Validation Accuracy: 0.8381, Validation F1: 0.8376
Best parameters based on validation: {'kernel': 'rbf', 'C': 10}
Training Accuracy: 0.9849, Training F1: 0.9849
Test Accuracy: 0.8386, Test F1: 0.8381
Test Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.86      0.86      2047
     neutral       0.79      0.76      0.77      2047
    positive       0.88      0.89      0.89      2047

    accuracy          

In [None]:
# **Testing a random forest**
# Grid-search n_estimators x max_depth on the validation set, reusing the shared preprocessor.
best_val_acc = 0
best_params = {}
for n_estimators in [100, 200]:
    for max_depth in [None, 10, 20]:
        rf_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
        ])
        rf_pipeline.fit(df_train_lr, y_train)
        val_pred = rf_pipeline.predict(df_val_lr)
        val_acc = accuracy_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred, average='weighted')
        print("n_estimators: {}, max_depth: {}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}".format(n_estimators, max_depth, val_acc, val_f1))
        # Strict '>' keeps the first configuration on ties.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_params = {"n_estimators": n_estimators, "max_depth": max_depth}

print("Best parameters: ", best_params)

# Refit the winning configuration and compare train vs. test performance.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], random_state=42))
])
rf_pipeline.fit(df_train_lr, y_train)

train_pred = rf_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = rf_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))

# Too unstable, so we are not using this model.

In [30]:
# **Testing XGBoost**
# Grid-search n_estimators x learning_rate x max_depth on the validation set,
# reusing the shared preprocessor.
best_val_acc = 0
best_params = {}
for n_estimators in [100, 200]:
    for learning_rate in [0.1, 0.05]:
        for max_depth in [3, 5]:
            gb_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', XGBClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    use_label_encoder=False,
                    eval_metric='mlogloss',
                    random_state=42))
            ])
            gb_pipeline.fit(df_train_lr, y_train)
            val_pred = gb_pipeline.predict(df_val_lr)
            val_acc = accuracy_score(y_val, val_pred)
            val_f1 = f1_score(y_val, val_pred, average='weighted')
            print("n_estimators: {}, learning_rate: {}, max_depth: {} - Validation Accuracy: {:.4f}, Validation F1: {:.4f}"
                  .format(n_estimators, learning_rate, max_depth, val_acc, val_f1))
            # Strict '>' keeps the first configuration on ties.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_params = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}

print("Best parameters based on validation:", best_params)

# Refit the winning configuration and compare train vs. test performance.
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42))
])
gb_pipeline.fit(df_train_lr, y_train)

train_pred = gb_pipeline.predict(df_train_lr)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='weighted')
print("Training Accuracy: {:.4f}, Training F1: {:.4f}".format(train_acc, train_f1))

test_pred = gb_pipeline.predict(df_test_lr)
test_acc = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')
print("Test Accuracy: {:.4f}, Test F1: {:.4f}".format(test_acc, test_f1))
print("Test Classification Report:")
print(classification_report(y_test, test_pred, target_names=['negative', 'neutral', 'positive']))

# This works really well with no sign of overfitting; probably the best model.

# For smaller, simpler problems like this one there is no need for deep learning, which is why none is tried here.

n_estimators: 100, learning_rate: 0.1, max_depth: 3 - Validation Accuracy: 0.8148, Validation F1: 0.8106
n_estimators: 100, learning_rate: 0.1, max_depth: 5 - Validation Accuracy: 0.8251, Validation F1: 0.8220
n_estimators: 100, learning_rate: 0.05, max_depth: 3 - Validation Accuracy: 0.7979, Validation F1: 0.7910
n_estimators: 100, learning_rate: 0.05, max_depth: 5 - Validation Accuracy: 0.8151, Validation F1: 0.8113
n_estimators: 200, learning_rate: 0.1, max_depth: 3 - Validation Accuracy: 0.8256, Validation F1: 0.8221
n_estimators: 200, learning_rate: 0.1, max_depth: 5 - Validation Accuracy: 0.8355, Validation F1: 0.8332
n_estimators: 200, learning_rate: 0.05, max_depth: 3 - Validation Accuracy: 0.8148, Validation F1: 0.8107
n_estimators: 200, learning_rate: 0.05, max_depth: 5 - Validation Accuracy: 0.8264, Validation F1: 0.8235
Best parameters based on validation: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 5}
Training Accuracy: 0.8774, Training F1: 0.8761
Test Accurac

In [39]:
# Final model: XGBoost with the hyper-parameters selected by the grid search above
# (n_estimators=200, learning_rate=0.1, max_depth=5), written out explicitly.
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42))
])
final_model.fit(df_train_lr, y_train)

final_train_pred = final_model.predict(df_train_lr)
final_train_acc = accuracy_score(y_train, final_train_pred)
final_test_pred = final_model.predict(df_test_lr)
final_test_acc = accuracy_score(y_test, final_test_pred)

# Persist the whole fitted pipeline (preprocessing + classifier) for reuse below.
joblib.dump(final_model, 'final_model.pkl')

print("Final Model - Training Accuracy: {:.4f}".format(final_train_acc))
print("Final Model - Test Accuracy: {:.4f}".format(final_test_acc))
print("Final Model - Test Classification Report:")
print(classification_report(y_test, final_test_pred, target_names=['negative', 'neutral', 'positive']))

Final Model - Training Accuracy: 0.8774
Final Model - Test Accuracy: 0.8308
Final Model - Test Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.87      0.86      2047
     neutral       0.79      0.72      0.75      2047
    positive       0.86      0.90      0.88      2047

    accuracy                           0.83      6141
   macro avg       0.83      0.83      0.83      6141
weighted avg       0.83      0.83      0.83      6141



In [40]:
# Smoke test: one hand-written example with the same three columns the pipeline was trained on.
new_data = pd.DataFrame({
    "Rate": [4.0],
    "Review": ["The product exceeded my expectations and works flawlessly."],
    "Summary": ["Excellent product with superb performance."]
})

# The model returns the integer class id (0 = negative, 1 = neutral, 2 = positive).
predicted_sentiment = final_model.predict(new_data)
print("Predicted sentiment:", predicted_sentiment)

Predicted sentiment: [2]


<h1>Clustering</h1>

In [41]:

# Cleaning dataset: reshape df_re into the three input columns the saved pipeline expects.
# Step 1: Copy only the required columns
data_for_prediction = df_re[["Score", "Summary", "Text"]].copy()

# Step 2: Ensure correct types
data_for_prediction["Score"] = pd.to_numeric(data_for_prediction["Score"], errors="coerce")
# Fill missing text BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to fill.
data_for_prediction["Summary"] = data_for_prediction["Summary"].fillna("").astype(str)
data_for_prediction["Text"] = data_for_prediction["Text"].fillna("").astype(str)

# Step 3: Drop rows with missing or invalid rate values
data_for_prediction = data_for_prediction.dropna(subset=["Score"])

# Rename columns to the names used at training time. Note that both text
# columns change name: this dataset's 'Summary' is fed to the model as 'Review',
# and its 'Text' is fed as 'Summary'.
data_for_prediction = data_for_prediction.rename(columns={
    "Score": "Rate",
    "Summary": "Review",
    "Text": "Summary"
})

data_for_prediction.head(2)

Unnamed: 0,Rate,Review,Summary
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [42]:
# Reload the pipeline that the training cell saved as 'final_model.pkl'.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you produced yourself.
loaded_model = joblib.load('final_model.pkl')

In [44]:
# Applying the model to the whole dataset
# Step 4: Predict a class id for every row of data_for_prediction
predicted_sentiments = loaded_model.predict(data_for_prediction)

# Step 5: Write the predictions back onto df_re, aligned on the row index
# (rows dropped from data_for_prediction receive no value).
df_re.loc[data_for_prediction.index, "Predicted_Sentiment"] = predicted_sentiments

In [46]:
# Reviews the model classified as negative (class id 0).
# .copy() makes this an independent frame, so the 'topic' column assigned to it
# further down does not raise SettingWithCopyWarning or touch df_re.
negative_reviews = df_re[df_re['Predicted_Sentiment'] == 0].copy()
print(len(negative_reviews))
negative_reviews.head(2)

113906


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Predicted_Sentiment
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0.0
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0.0


<h1>LDA</h1>

In [33]:
!pip install "numpy<2.0.0" --force-reinstall

Collecting numpy<2.0.0
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.26.4


  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel, CoherenceModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns


In [35]:
# Tokenize the raw 'Text' of the predicted-negative reviews
# (simple_preprocess is the only preprocessing applied here).
tokenized_docs = [simple_preprocess(text) for text in negative_reviews['Text']]

# Build dictionary and BoW corpus
dictionary = Dictionary(tokenized_docs)
# NOTE(review): the cells shown here only model a 10,000-document sample;
# confirm the full `corpus` is still needed before paying for it.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

In [36]:
# Work on the first `sample_size` documents (a head slice, not a random sample).
sample_size = 10000
tokenized_sample = tokenized_docs[:sample_size]
# Bag-of-words for the sample, encoded with the dictionary built on all documents.
corpus_sample = [dictionary.doc2bow(doc) for doc in tokenized_sample]

In [37]:
# Sweep the number of topics and record two model-selection metrics per k.
k_range = range(2, 15)
coherence_scores = []
elbo_scores = []

for k in k_range:
    lda = LdaMulticore(corpus=corpus_sample, id2word=dictionary, num_topics=k, passes=5, workers=2, random_state=42)

    # c_v topic coherence on the sampled documents.
    coherence = CoherenceModel(model=lda, texts=tokenized_sample, dictionary=dictionary, coherence='c_v').get_coherence()
    # Per-word variational bound. This is the value printed as "ELBO" below.
    elbo = lda.log_perplexity(corpus_sample)

    coherence_scores.append(coherence)
    # Store the same number that is printed. Previously a second, separate
    # lda.bound() call stored the total (un-normalised) bound instead, so the
    # plotted series was a different quantity from the printed one and cost an
    # extra inference pass over the corpus.
    elbo_scores.append(elbo)

    print(f"k={k} | Coherence={coherence:.4f} | ELBO={elbo:.2f}")

k=2 | Coherence=0.2881 | ELBO=-6.84
k=3 | Coherence=0.2871 | ELBO=-6.85
k=4 | Coherence=0.2864 | ELBO=-6.85
k=5 | Coherence=0.2858 | ELBO=-6.85
k=6 | Coherence=0.2831 | ELBO=-6.86
k=7 | Coherence=0.2867 | ELBO=-6.86
k=8 | Coherence=0.2853 | ELBO=-6.87
k=9 | Coherence=0.2866 | ELBO=-6.87
k=10 | Coherence=0.2872 | ELBO=-6.89
k=11 | Coherence=0.2846 | ELBO=-6.87
k=12 | Coherence=0.2829 | ELBO=-6.88
k=13 | Coherence=0.2815 | ELBO=-6.90
k=14 | Coherence=0.2842 | ELBO=-6.90


In [None]:
# Take the x-axis from the sweep itself so it cannot drift from the recorded scores.
k_vals = list(k_range)
fig, (ax_coherence, ax_elbo) = plt.subplots(1, 2, figsize=(12, 5))

# Coherence plot
ax_coherence.plot(k_vals, coherence_scores, marker='o', color='green')
ax_coherence.set_title("Coherence Score vs Number of Topics")
ax_coherence.set_xlabel("k (Number of Topics)")
ax_coherence.set_ylabel("Coherence Score (c_v)")
ax_coherence.grid(True)

# ELBO plot
ax_elbo.plot(k_vals, elbo_scores, marker='o', color='orange')
ax_elbo.set_title("ELBO (Log-Likelihood) vs Number of Topics")
ax_elbo.set_xlabel("k (Number of Topics)")
ax_elbo.set_ylabel("ELBO")
ax_elbo.grid(True)

fig.tight_layout()
plt.show()

✅ **Coherence Score Insights**

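Coherence was reported to peak at k = 9 and k = 13, both above 0.31. Note that the output currently displayed above does not show this: every score there lies between 0.28 and 0.29, with the maximum (0.2881) at k = 2, so this conclusion should be re-verified against a fresh run.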

That’s a solid sign that your model finds reasonably interpretable topics around those values.

Anything above ~0.3 is generally usable, especially in real-world noisy reviews.

🧮**ELBO(Evidence Lower Bound) Insights**

ELBO becomes more negative as k increases, which is expected.

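ELBO was reported to be least negative (best) at k = 6 and to decline gradually afterwards; in the output displayed above, however, the least negative value is at k = 2 (-6.84), so this point should also be re-checked.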

However, ELBO is not always reliable alone — it often favors higher topic counts.

🧭 **Recommendation:**

Proceed with k = 9 first — it's the sweet spot based on human interpretability (coherence).

In [None]:
def topic_modeling(texts, num_topics=9):
    """Fit a bag-of-words LDA topic model on `texts`.

    Args:
        texts: iterable of document strings.
        num_topics: number of topics (passed as `n_components`).

    Returns:
        A `(lda_model, vectorizer)` tuple: the fitted LatentDirichletAllocation
        and the fitted CountVectorizer that produced its input matrix.
    """
    # Vocabulary: English stop words removed, plus terms occurring in more than
    # 95% of the documents or in fewer than 2 documents.
    vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
    term_counts = vectorizer.fit_transform(texts)

    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        learning_method='online',
        max_iter=10,
        random_state=42,
    ).fit(term_counts)

    return lda_model, vectorizer

# Fit the 9-topic sklearn LDA on the negative reviews. From here on `lda` refers
# to this sklearn model, replacing the gensim model of the same name from the k sweep.
negative_texts = negative_reviews['Text'].tolist()
lda, vectorizer = topic_modeling(negative_texts, num_topics=9)
# Label each review with its highest-probability topic.
# NOTE(review): negative_reviews comes from filtering df_re; if it is a slice
# rather than a copy, this assignment can trigger SettingWithCopyWarning.
negative_reviews['topic'] = lda.transform(vectorizer.transform(negative_reviews['Text'])).argmax(axis=1)

In [None]:
# Number of negative reviews assigned to each topic, ordered by topic id.
topic_summary = negative_reviews['topic'].value_counts().sort_index().reset_index()
topic_summary.columns = ['Topic', 'Number of Reviews']
print(topic_summary)

In [None]:
# Top 12 words per topic. The vocabulary array is fetched once instead of once
# per word, and the topic count comes from the fitted model rather than a literal.
feature_names = vectorizer.get_feature_names_out()
for topic_id, topic_weights in enumerate(lda.components_):
    top_words = topic_weights.argsort()[::-1][:12]
    keywords = [feature_names[i] for i in top_words]
    print(f"Topic {topic_id}: {keywords}")

In [None]:
# Show the first three reviews of every topic. The topic count is read from the
# fitted model so it cannot disagree with the num_topics actually used.
for topic_id in range(lda.components_.shape[0]):
    print(f"\n🟩 Topic {topic_id}")
    sample_reviews = negative_reviews[negative_reviews['topic'] == topic_id]['Text'].head(3)
    for i, review in enumerate(sample_reviews, 1):
        print(f"   {i}. {review}")

In [None]:
# Use .components_ for sklearn LDA: one row of word weights per topic
topic_word_matrix = lda.components_

# Cosine similarity between topic-word distributions (topics x topics matrix)
similarity_matrix = cosine_similarity(topic_word_matrix)

# Plot heatmap with the similarity value annotated in every cell
plt.figure(figsize=(8, 6))
sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Topic Similarity (Cosine)")
plt.xlabel("Topic")
plt.ylabel("Topic")
plt.show()

print("🧠 Interpretation: If any off-diagonal values are > 0.6, it indicates overlapping/redundant topics. Diagonal will always be 1.00 (each topic with itself).")

<h1>OpenAI</h1>

In [None]:
# `os` is needed for os.getenv below and is not imported anywhere else in the notebook.
import os

import openai
from dotenv import load_dotenv

# Read the API key from the environment (load_dotenv also picks up a local .env file)
# instead of hard-coding it in the notebook.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)

def label_topic_with_gpt(reviews, keywords, model="gpt-4-1106-preview", temperature=0.5):
    """Ask the chat model for a topic label and product category for one topic.

    Args:
        reviews: sample review texts belonging to the topic.
        keywords: top keywords of the topic (strings).
        model: chat-completion model name; defaults to the one used originally.
        temperature: sampling temperature; defaults to the original 0.5.

    Returns:
        The model's reply with surrounding whitespace removed (expected to be in
        the "Topic: ... / Product: ..." format requested by the prompt), or an
        empty string if the reply has no text content.

    Uses the module-level `client` created in the previous cell.
    """
    reviews = list(reviews)
    prompt = (
        "You are an AI assistant analyzing customer complaints.\n"
        # State the real number of reviews; it used to be hard-coded to 25
        # even when a topic had fewer samples.
        f"You will be shown {len(reviews)} customer reviews and a set of top keywords extracted from a topic model.\n\n"
        "Your job is to generate two things:\n"
        "1. A short and clear **topic label** (3–10 words) that summarizes the **main issue**, based **primarily on the reviews**.\n"
        "2. The **product category** affected, based **80% on the keywords** and **20% on the reviews**.\n"
        "If the product category cannot be confidently determined, return 'Not specified'.\n\n"
        "Respond strictly in this format (do not explain or list reviews):\n"
        "Topic: <short label>\n"
        "Product: <product type or 'Not specified'>\n\n"
        f"Top Keywords: {', '.join(keywords)}\n\n"
        "Sample Reviews:\n" + "\n".join([f"- {r}" for r in reviews])
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    # `content` can be None; guard so .strip() does not raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()


In [None]:
# Top 10 keywords for every topic. The vocabulary array is fetched once, and the
# number of topics comes from the fitted model instead of a hard-coded 9.
feature_names = vectorizer.get_feature_names_out()
topic_keywords = [
    [feature_names[i] for i in topic_weights.argsort()[::-1][:10]]
    for topic_weights in lda.components_
]


# Ask the chat model to label each topic from up to 25 of its reviews plus its keywords.
for topic_id, keywords in enumerate(topic_keywords):
    sample_reviews = negative_reviews[negative_reviews['topic'] == topic_id]['Text'].head(25).tolist()

    label = label_topic_with_gpt(sample_reviews, keywords)
    print(f"🟢 Topic {topic_id} Label: {label}")

In [None]:
# Print up to 20 sample reviews per topic for manual inspection.
# (A preceding loop that built a 15-review sample and immediately discarded it
# was dead code and has been removed.)
for topic_id in range(lda.components_.shape[0]):
    print(f"\n🟢 Topic {topic_id} Samples:")
    sample_reviews = negative_reviews[negative_reviews['topic'] == topic_id]['Text'].head(20).tolist()
    for i, review in enumerate(sample_reviews, 1):
        print(f"{i}. {review}")
