In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv("../data/processed/cleaned_reviews.csv")

X = df['cleaned_review']
y_category = df['category']

y_fake = df['label'].map({'CG': 0, 'OR': 1})

In [4]:
df['cleaned_review'].isna().sum()

np.int64(1)

In [10]:
# Make sure df is the cleaned dataframe
df = df.dropna(subset=['cleaned_review'])
df = df[df['cleaned_review'].str.strip() != ""]
df = df.reset_index(drop=True)

# Recreate labels AFTER cleaning
X = df['cleaned_review']
y_category = df['category']
y_fake = df['label'].map({'CG': 0, 'OR': 1})

In [11]:
df['cleaned_review'].isna().sum()

np.int64(0)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    min_df=5
)

X_tfidf = tfidf.fit_transform(X)

In [13]:
print(X_tfidf.shape)
print(len(y_category))
print(len(y_fake))

(40431, 10000)
40431
40431


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_category, test_size=0.2, random_state=42
)

model_category = LogisticRegression(max_iter=1000)
model_category.fit(X_train, y_train)

y_pred = model_category.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.781501174724867
                              precision    recall  f1-score   support

                     Books_5       0.75      0.75      0.75       915
Clothing_Shoes_and_Jewelry_5       0.79      0.83      0.81       753
               Electronics_5       0.80      0.80      0.80       760
          Home_and_Kitchen_5       0.69      0.74      0.72       847
              Kindle_Store_5       0.81      0.78      0.79       971
             Movies_and_TV_5       0.92      0.88      0.90       735
              Pet_Supplies_5       0.90      0.83      0.87       867
       Sports_and_Outdoors_5       0.65      0.63      0.64       802
Tools_and_Home_Improvement_5       0.70      0.75      0.72       711
            Toys_and_Games_5       0.82      0.83      0.82       726

                    accuracy                           0.78      8087
                   macro avg       0.78      0.78      0.78      8087
                weighted avg       0.78      0.78      0.78 

In [15]:
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_tfidf, y_fake, test_size=0.2, random_state=42
)

model_fake = LogisticRegression(max_iter=1000)
model_fake.fit(X_train_f, y_train_f)

y_pred_f = model_fake.predict(X_test_f)

print("Accuracy:", accuracy_score(y_test_f, y_pred_f))
print(classification_report(y_test_f, y_pred_f))

Accuracy: 0.8972424879436132
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4018
           1       0.89      0.91      0.90      4069

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087

