In [1]:
import json
import pandas as pd

# ---------- LOAD REVIEWS ----------

review_data = []

with open("../data/Handmade_Products.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            review_data.append(json.loads(line))
        except:
            continue  # skip any broken lines safely

reviews = pd.DataFrame(review_data)

print("Reviews shape:", reviews.shape)


# ---------- LOAD METADATA ----------

meta_data = []

with open("../data/meta_Handmade_Products.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            meta_data.append(json.loads(line))
        except:
            continue

meta = pd.DataFrame(meta_data)

print("Metadata shape:", meta.shape)


# ---------- BASIC CLEANING ----------

# Rename columns for consistency (if needed)
if "text" in reviews.columns:
    reviews.rename(columns={"text": "reviewText"}, inplace=True)

if "rating" in reviews.columns:
    reviews.rename(columns={"rating": "overall"}, inplace=True)

# Keep important columns only
reviews = reviews[[
    "overall",
    "reviewText",
    "helpful_vote",
    "verified_purchase",
    "asin"
]]

meta.rename(columns={"parent_asin": "asin"}, inplace=True)

meta = meta[["asin", "title"]]  # adjust if needed


# ---------- MERGE REVIEWS + META ----------

df = pd.merge(reviews, meta, on="asin", how="left")

print("Merged shape:", df.shape)


# ---------- OPTIONAL: CREATE LABEL (Example Heuristic) ----------

# Example: fake review proxy (low helpful votes + unverified purchase)
df["label"] = (
    (df["helpful_vote"] == 0) &
    (df["verified_purchase"] == False)
).astype(int)

print(df["label"].value_counts())

df.head()


Reviews shape: (664162, 10)
Metadata shape: (164817, 14)
Merged shape: (664162, 6)
label
0    639619
1     24543
Name: count, dtype: int64


Unnamed: 0,overall,reviewText,helpful_vote,verified_purchase,asin,title,label
0,5.0,I bought one for myself and one for my grandda...,1,True,B08GPJ1MSN,Orgone Chakra Necklace With Adjustable Cord - ...,0
1,5.0,I’ve ordered three bows so far. Have not been ...,0,True,B084TWHS7W,Yellow Floral Green Gingham Spring Wreath Bow,0
2,5.0,As pictured. Used a frame from the dollar stor...,0,True,B07V3NRQC4,Live Simply Enjoy the Ride Vintage Bicycle Ins...,0
3,5.0,"This is beyond beautiful. So shiny, the size ...",2,True,B071ZMDK26,"Sterling Silver Moon Phase Charm Necklace, 18""",0
4,5.0,Oh wow what a pleasant surprise! This smells g...,1,True,B01MPVZ4YP,FLOWER DANCE Fragrance Oil 1oz Beaumondes Perf...,0


In [2]:
from sklearn.model_selection import train_test_split

X = df
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (531329, 7)
Test shape: (132833, 7)


In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

# from src.text_pipeline import TextPipeline
# from src.metadata_pipeline import MetadataPipeline
from src.fusion_model import FusionModel

In [None]:
'''text_pipeline = TextPipeline(max_features=3000)

train_text = text_pipeline.preprocess(X_train)
test_text = text_pipeline.preprocess(X_test)

X_text_train = text_pipeline.fit_transform(train_text)
X_text_test = text_pipeline.transform(test_text)'''

In [None]:
# print("Text feature shape:", X_text_train.shape)

Text feature shape: (531329, 3000)


In [None]:
''' meta_pipeline = MetadataPipeline()

train_meta = meta_pipeline.extract_features(X_train)
test_meta = meta_pipeline.extract_features(X_test)

X_meta_train = meta_pipeline.fit_transform(train_meta)
X_meta_test = meta_pipeline.transform(test_meta)

print("Metadata feature shape:", X_meta_train.shape)
'''

Metadata feature shape: (531329, 4)


In [7]:
fusion = FusionModel()

X_train_fused = fusion.fuse_features(X_text_train, X_meta_train)
X_test_fused = fusion.fuse_features(X_text_test, X_meta_test)

print("Fused train shape:", X_train_fused.shape)
print("Fused test shape:", X_test_fused.shape)

Fused train shape: (531329, 3004)
Fused test shape: (132833, 3004)


In [8]:
fusion.train(X_train_fused, y_train)

In [9]:
fusion.evaluate(X_test_fused, y_test)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    127924
           1       0.87      1.00      0.93      4909

    accuracy                           0.99    132833
   macro avg       0.93      1.00      0.96    132833
weighted avg       1.00      0.99      0.99    132833



In [None]:
# 
''' from sklearn.linear_model import LogisticRegression

baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X_text_train, y_train)

baseline_preds = baseline_model.predict(X_text_test)

from sklearn.metrics import classification_report
print("Text Only Model:")
print(classification_report(y_test, baseline_preds)) '''

Text Only Model:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    127924
           1       0.69      0.03      0.05      4909

    accuracy                           0.96    132833
   macro avg       0.83      0.51      0.52    132833
weighted avg       0.95      0.96      0.95    132833

