In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Download resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
reviews_df = pd.read_csv('/content/clothing_reviews.csv')
description_df = pd.read_csv('/content/clothing_description.csv')

# Merge the data on 'product_id'
df = pd.merge(
    reviews_df,
    description_df[['product_id', 'sub_category']],  # Select only the columns needed
    on='product_id',
    how='left'  # Keep all rows from reviews_df even if no match in description_df
)

df.head()

  reviews_df = pd.read_csv('/content/clothing_reviews.csv')


Unnamed: 0,Customer ID,Customer Name,Customer Age,Gender,Purchase Date,product_id,Product Category,product_name,Quantity,Payment Method,Review Text,Rating,sub_category
0,46251.0,Christine Hernandez,37.0,Male,12-11-2020 13:13,778036.0,Clothing,Slim Fit Tank Top with Denim Fabric,1.0,PayPal,Absolutely wonderful - silky and sexy and comf...,4.0,Tank Tops
1,13593.0,James Grant,49.0,Female,05-05-2020 20:14,905147.0,Clothing,Fitted Jacket with Breathable Fabric,2.0,PayPal,Love this dress! it's sooo pretty. i happene...,5.0,Jackets
2,13593.0,James Grant,49.0,Female,05-05-2020 20:14,938121.0,Clothing,Fitted Jacket with Breathable Fabric,2.0,PayPal,Love this dress! it's sooo pretty. i happene...,5.0,Jackets
3,28805.0,Jose Collier,19.0,Male,31-03-2021 09:50,763149.0,Clothing,Textured T-Shirt with Cotton Fabric,1.0,PayPal,I had such high hopes for this dress and reall...,3.0,T-Shirts
4,28805.0,Jose Collier,19.0,Male,02-07-2020 02:54,708904.0,Clothing,Acid Wash Cargo Pants with Cotton Fabric,1.0,Credit Card,"I love, love, love this jumpsuit. it's fun, fl...",5.0,Pants


In [4]:
# Drop missing reviews
df.dropna(subset=['Review Text'], inplace=True)

In [5]:
# Filter to exclude only ratings 1-5 (in case of outliers)
df = df[df['Rating'].isin([1, 2, 3, 4, 5])]

# Re-map sentiment based on new condition:
# Rating 1 or 2 → 0 (negative), 3 to 5 → 1 (positive)
df['sentiment'] = df['Rating'].apply(lambda x: 1 if x >= 3 else 0)

In [6]:
# Clean review text
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation/numbers
    text = text.strip()
    return text

df['cleaned_review'] = df['Review Text'].apply(clean_text)

In [7]:
# Lemmatize and remove stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.split()
    filtered = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(filtered)

df['final_review'] = df['cleaned_review'].apply(preprocess)

In [8]:
# Split into training and test sets (before vectorization)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['final_review'], df['sentiment'], test_size=0.2, random_state=42, stratify=df['sentiment']
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=500)

# Fit only on training data
xv_train = tfidf.fit_transform(X_train_text)

# Transform test data
xv_test = tfidf.transform(X_test_text)

In [9]:
from collections import Counter

y_train_original = y_train.copy()

# Before SMOTE
print("Class distribution before SMOTE:")
print(Counter(y_train_original))

Class distribution before SMOTE:
Counter({1: 19942, 0: 2330})


In [10]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
xv_train, y_train = smote.fit_resample(xv_train, y_train)

# After SMOTE
print("\nClass distribution after SMOTE:")
print(Counter(y_train))


Class distribution after SMOTE:
Counter({1: 19942, 0: 19942})


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [12]:
svc = SVC()
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier()
lrc = LogisticRegression()
rfc = RandomForestClassifier()
abc = AdaBoostClassifier()
bc = BaggingClassifier()
etc = ExtraTreesClassifier()
gbdt = GradientBoostingClassifier()

In [13]:
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT':gbdt
}

In [14]:
from sklearn.metrics import accuracy_score, precision_score

results = []

# Train and evaluate classifiers
for name, clf in clfs.items():
    print(f"\nTraining {name} ...")
    try:
        clf.fit(xv_train, y_train)
        y_pred = clf.predict(xv_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, pos_label=1)

        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}")
        results.append((name, acc, prec))
    except Exception as e:
        print(f"Error training {name}: {e}")


Training SVC ...
Accuracy: 0.9350, Precision: 0.9439

Training KN ...
Accuracy: 0.5416, Precision: 0.9818

Training NB ...
Accuracy: 0.8177, Precision: 0.9731

Training DT ...
Accuracy: 0.8844, Precision: 0.9467

Training LR ...
Accuracy: 0.8481, Precision: 0.9698

Training RF ...
Accuracy: 0.9267, Precision: 0.9426

Training AdaBoost ...
Accuracy: 0.8273, Precision: 0.9493

Training BgC ...
Accuracy: 0.8975, Precision: 0.9504

Training ETC ...
Accuracy: 0.9339, Precision: 0.9418

Training GBDT ...
Accuracy: 0.8799, Precision: 0.9355


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, f1_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Select top 3 models
top_models = ['SVC', 'ETC', 'RF']

for model_name in top_models:
    clf = clfs[model_name]
    print(f"\n=== {model_name} ===")

    # Predict on test set
    y_pred = clf.predict(xv_test)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label=1)
    rec = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    specificity = tn / (tn + fp)
    sensitivity = rec  # same as recall for positive class

    # Print metrics
    print("Confusion Matrix:")
    print(cm)
    print("\nMetrics:")
    print(f"Accuracy   : {acc:.4f}")
    print(f"Precision  : {prec:.4f}")
    print(f"Recall     : {rec:.4f}")
    print(f"F1 Score   : {f1:.4f}")
    print(f"Sensitivity: {sensitivity:.4f}")
    print(f"Specificity: {specificity:.4f}")



=== SVC ===
Confusion Matrix:
[[ 290  292]
 [  70 4917]]

Metrics:
Accuracy   : 0.9350
Precision  : 0.9439
Recall     : 0.9860
F1 Score   : 0.9645
Sensitivity: 0.9860
Specificity: 0.4983

=== ETC ===
Confusion Matrix:
[[ 278  304]
 [  64 4923]]

Metrics:
Accuracy   : 0.9339
Precision  : 0.9418
Recall     : 0.9872
F1 Score   : 0.9640
Sensitivity: 0.9872
Specificity: 0.4777

=== RF ===
Confusion Matrix:
[[ 285  297]
 [ 111 4876]]

Metrics:
Accuracy   : 0.9267
Precision  : 0.9426
Recall     : 0.9777
F1 Score   : 0.9598
Sensitivity: 0.9777
Specificity: 0.4897


In [24]:
from sklearn.ensemble import ExtraTreesClassifier

# Step 1: Train ETC on training data
etc = ExtraTreesClassifier(random_state=42)
etc.fit(xv_train, y_train)

# Step 2: Predict sentiment for all reviews in the full dataset
all_reviews_vectorized = tfidf.transform(df['final_review'])
df['predicted_sentiment'] = etc.predict(all_reviews_vectorized)

# Step 3: Group by sub_category and product to compute sentiment score (mean predicted sentiment)
sentiment_scores = df.groupby(['sub_category', 'product_id', 'product_name'])['predicted_sentiment'].mean().reset_index()
sentiment_scores.rename(columns={'predicted_sentiment': 'sentiment_score'}, inplace=True)

# Step 4: Rank products within each sub_category
sentiment_scores['rank_in_subcategory'] = sentiment_scores.groupby('sub_category')['sentiment_score'] \
                                                           .rank(method='dense', ascending=False)

# Step 5: Get top 5 products per sub_category
sentiment_scores = sentiment_scores.sort_values(['sub_category', 'rank_in_subcategory'])
top5_per_subcategory = sentiment_scores.groupby('sub_category').head(5)

# Step 6: Display the results
print(top5_per_subcategory[['product_id','sub_category', 'product_name', 'sentiment_score', 'rank_in_subcategory']])


     product_id sub_category  \
1      123902.0      Blazers   
12     408400.0      Blazers   
17     739379.0      Blazers   
5      248009.0      Blazers   
24     816884.0      Blazers   
33     207193.0      Dresses   
47     597497.0      Dresses   
40     341049.0      Dresses   
52     697189.0      Dresses   
43     390866.0      Dresses   
62     224131.0      Hoodies   
67     286699.0      Hoodies   
65     260660.0      Hoodies   
75     582534.0      Hoodies   
81     867764.0      Hoodies   
93     348872.0      Jackets   
113    951879.0      Jackets   
110    905147.0      Jackets   
112    938121.0      Jackets   
90     187514.0      Jackets   
140    732923.0        Jeans   
131    496635.0        Jeans   
143    810777.0        Jeans   
132    502213.0        Jeans   
123    342909.0        Jeans   
170    724429.0        Pants   
158    438736.0        Pants   
162    579650.0        Pants   
152    213881.0        Pants   
180    995275.0        Pants   
213    8

In [26]:
from sklearn.ensemble import ExtraTreesClassifier

# Train the Extra Trees Classifier
etc = ExtraTreesClassifier(random_state=42)
etc.fit(xv_train, y_train)

# Predict sentiment for all reviews
all_reviews_vectorized = tfidf.transform(df['final_review'])
df['predicted_sentiment'] = etc.predict(all_reviews_vectorized)

# Group by product_id and product_name to compute average sentiment score
overall_scores = df.groupby(['product_id', 'product_name'])['predicted_sentiment'].mean().reset_index()
overall_scores.rename(columns={'predicted_sentiment': 'sentiment_score'}, inplace=True)

# Rank all products based on sentiment score
overall_scores['overall_rank'] = overall_scores['sentiment_score'].rank(method='dense', ascending=False)

# Sort and get top 10 products
top_products = overall_scores.sort_values('sentiment_score', ascending=False).head(300)

# Display result with product_id
print(top_products[['product_id', 'product_name', 'sentiment_score', 'overall_rank']])



     product_id                                   product_name  \
9      123902.0              Vintage Blazer with Luxury Fabric   
36     207193.0                Ripped Dress with Luxury Fabric   
250    840307.0             Textured Shirt with Premium Fabric   
40     224131.0            Acid Wash Hoodie with Cotton Fabric   
86     345402.0            Acid Wash Shirt with Premium Fabric   
..          ...                                            ...   
291    970321.0  Oversized Cargo Pants with Lightweight Fabric   
45     233768.0          Vintage Blazer with Breathable Fabric   
144    517518.0              Ripped Hoodie with Stretch Fabric   
183    659309.0   Slim Fit Sweatpants with Eco-Friendly Fabric   
223    769499.0          Textured Tank Top with Stretch Fabric   

     sentiment_score  overall_rank  
9           0.987179           1.0  
36          0.974684           2.0  
250         0.972973           3.0  
40          0.971831           4.0  
86          0.970149  

In [27]:
top_products[['product_id', 'product_name', 'sentiment_score', 'overall_rank']].to_csv('top_ranked_products.csv', index=False)