# Please double-check for errors or misinterpretations ~Kailash

In [2]:
from bigdata_a3_utils import *
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from pathlib import Path
import gc
import numpy as np

In [3]:
def get_needed_cols(base_path, category):
    """Load one category's cleaned pickle and keep only the modelling columns.

    Parameters
    ----------
    base_path : pathlib.Path
        Directory that contains the ``cleaned_data_<category>.pkl`` files.
    category : str
        Category name used to build the pickle file name.

    Returns
    -------
    pandas.DataFrame
        The loaded frame reduced to the ``rating`` and ``text`` columns, kept
        in the order in which they appear in the pickle.

    Raises
    ------
    KeyError
        If the pickle does not contain both ``rating`` and ``text``. Failing
        here names the offending category instead of letting a later concat
        silently fill the missing column with NaN.
    """
    # columns we want
    needed = ["rating", "text"]
    # read in pickle file
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by a trusted pipeline.
    print(f"Reading in {category} data")
    df = pd.read_pickle(base_path / f"cleaned_data_{category}.pkl")
    print(f"Finished reading in {category} data")

    missing = [col for col in needed if col not in df.columns]
    if missing:
        raise KeyError(
            f"{category} data is missing required column(s): {missing}"
        )

    # Rebind instead of mutating in place; the wide frame becomes unreachable
    # and is reclaimed by the collect() below.
    columns_to_drop = df.columns.difference(needed)
    df = df.drop(columns=columns_to_drop)
    print("Dropped unneeded columns")
    gc.collect()
    return df

In [4]:
# Folder that holds the per-category cleaned pickles.
base_path = Path(r"F:\Saeed\frames")

# VALID_CATEGORIES is not defined in this notebook; presumably it comes from
# the star-import of bigdata_a3_utils.
categories = VALID_CATEGORIES

# Keep a seeded 20% sample of every category. The sample is taken directly on
# the returned frame so the full frame is never bound to a name and can be
# freed before the next category is read.
dfs = []
for category in categories:
    dfs.append(
        get_needed_cols(base_path, category).sample(frac=0.2, random_state=42)
    )
    gc.collect()

# Stack the samples into one frame with a fresh 0..n-1 index.
temp_unified = pd.concat(dfs, ignore_index=True)

Reading in All_Beauty data
Finished reading in All_Beauty data
Dropped unneeded columns
Reading in Amazon_Fashion data
Finished reading in Amazon_Fashion data
Dropped unneeded columns
Reading in Appliances data
Finished reading in Appliances data
Dropped unneeded columns
Reading in Arts_Crafts_and_Sewing data
Finished reading in Arts_Crafts_and_Sewing data
Dropped unneeded columns
Reading in Automotive data
Finished reading in Automotive data
Dropped unneeded columns
Reading in Baby_Products data
Finished reading in Baby_Products data
Dropped unneeded columns
Reading in Beauty_and_Personal_Care data
Finished reading in Beauty_and_Personal_Care data
Dropped unneeded columns
Reading in Books data
Finished reading in Books data
Dropped unneeded columns
Reading in CDs_and_Vinyl data
Finished reading in CDs_and_Vinyl data
Dropped unneeded columns
Reading in Cell_Phones_and_Accessories data
Finished reading in Cell_Phones_and_Accessories data
Dropped unneeded columns
Reading in Clothing_Shoe

In [5]:
# Binary sentiment label: 1 (positive) when rating > 3, otherwise 0.
# Ratings of exactly 3 and missing ratings (NaN > 3 is False) end up in class 0.
# The vectorized comparison replaces a per-row Python lambda; the explicit
# int64 cast keeps the dtype that .apply produced on every platform.
temp_unified['sentiment'] = (temp_unified['rating'] > 3).astype('int64')

In [6]:
# Show the first row as a quick look at the frame's columns.
temp_unified.head(1)

Unnamed: 0,rating,text,sentiment
0,5.0,Best hand cream ever! I love it! Light fragran...,1


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    temp_unified['text'],
    temp_unified['sentiment'],
    test_size=0.2,
    random_state=42,
)
# Note: a commented-out manual split (unseeded np.random.shuffle over row
# indices, sliced 80/20) used to follow here; it was dead code and is removed.

In [8]:
# Learn the vocabulary and IDF weights from the training text only, then
# apply that same fitted vectorizer to the test text.
vectorizer = TfidfVectorizer(
    lowercase=True,  # fold case before tokenizing
    min_df=5,        # ignore terms found in fewer than 5 training documents
    max_df=0.8,      # ignore terms found in more than 80% of training documents
)
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [9]:
# Logistic-regression classifier trained on the TF-IDF training vectors.
# max_iter is raised to 1000 from scikit-learn's default of 100 — presumably
# to give the solver room to converge on this input; TODO confirm that no
# ConvergenceWarning is emitted.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectors, y_train)

In [10]:
# Predict class labels for the test-set vectors with the fitted model.
y_pred = model.predict(X_test_vectors)

In [11]:
# Score the test-set predictions. f1_score is called with its default binary
# averaging, so the F1 value describes the positive class (label 1).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
# Rows are true labels and columns are predicted labels, in sorted label order.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [12]:
# Headline metrics, rounded to two decimals, followed by the raw matrix.
print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')

# Per-class precision / recall / F1. target_names are matched to the labels
# in sorted order, so 0 is shown as 'Negative' and 1 as 'Positive'.
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print('Classification Report:', report, sep='\n')

Accuracy: 0.90
F1 Score: 0.94
Confusion Matrix:
[[ 3150326  1260348]
 [  718175 14990549]]
Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.71      0.76   4410674
    Positive       0.92      0.95      0.94  15708724

    accuracy                           0.90  20119398
   macro avg       0.87      0.83      0.85  20119398
weighted avg       0.90      0.90      0.90  20119398

