#  Task 3: Product Recommendation Model  



In [3]:
# STEP 1: IMPORT LIBRARIES

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

import joblib

print("Libraries imported successfully!")


Libraries imported successfully!


In [4]:
import pandas as pd

merged = pd.read_csv(
    "https://raw.githubusercontent.com/DavBelM/Multimodal_Data_Preprocessing/main/data/processed/merged_customer_data.csv"
)

merged.head(), merged.shape


(   customer_id social_media_platform  engagement_score  \
 0          190               Twitter                82   
 1          150              Facebook                96   
 2          162               Twitter                89   
 3          151                TikTok                61   
 4          137              LinkedIn                93   
 
    purchase_interest_score review_sentiment  transaction_id_mean  \
 0                      4.8          Neutral               1085.5   
 1                      1.6         Positive               1044.0   
 2                      2.6         Positive               1097.5   
 3                      1.3          Neutral               1001.0   
 4                      3.5          Neutral               1020.0   
 
    transaction_id_sum  transaction_id_std  purchase_amount_mean  \
 0                2171           77.074639                 367.0   
 1                2088            2.828427                 283.0   
 2                2195  

In [5]:
merged.info()
merged.head()
merged.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                109 non-null    int64  
 1   social_media_platform      109 non-null    object 
 2   engagement_score           109 non-null    int64  
 3   purchase_interest_score    109 non-null    float64
 4   review_sentiment           109 non-null    object 
 5   transaction_id_mean        109 non-null    float64
 6   transaction_id_sum         109 non-null    int64  
 7   transaction_id_std         65 non-null     float64
 8   purchase_amount_mean       109 non-null    float64
 9   purchase_amount_sum        109 non-null    int64  
 10  purchase_amount_std        65 non-null     float64
 11  customer_rating_mean       109 non-null    float64
 12  customer_rating_sum        109 non-null    float64
 13  customer_rating_std        65 non-null     float64

Unnamed: 0,0
customer_id,0
social_media_platform,0
engagement_score,0
purchase_interest_score,0
review_sentiment,0
transaction_id_mean,0
transaction_id_sum,0
transaction_id_std,44
purchase_amount_mean,0
purchase_amount_sum,0


In [6]:
# Target
y = merged['product_category_<lambda>']

# Features: drop ID and aggregated target
X = merged.drop(columns=[
    'product_category_<lambda>',
    'customer_id',
    'purchase_date_<lambda>'
])


In [7]:
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols   = [col for col in X.columns if X[col].dtype != 'object']

categorical_cols, numerical_cols


(['social_media_platform', 'review_sentiment'],
 ['engagement_score',
  'purchase_interest_score',
  'transaction_id_mean',
  'transaction_id_sum',
  'transaction_id_std',
  'purchase_amount_mean',
  'purchase_amount_sum',
  'purchase_amount_std',
  'customer_rating_mean',
  'customer_rating_sum',
  'customer_rating_std',
  'customer_value_score',
  'avg_transaction_value',
  'transaction_consistency',
  'social_engagement_score',
  'avg_customer_rating'])

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))


Accuracy: 0.8181818181818182
Macro F1: 0.8171428571428571
              precision    recall  f1-score   support

       Books       0.80      0.80      0.80         5
    Clothing       0.67      0.50      0.57         4
 Electronics       0.86      0.86      0.86         7
   Groceries       0.75      1.00      0.86         3
      Sports       1.00      1.00      1.00         3

    accuracy                           0.82        22
   macro avg       0.81      0.83      0.82        22
weighted avg       0.81      0.82      0.81        22



In [12]:
import joblib

joblib.dump(model, "product_model.pkl")


['product_model.pkl']

In [13]:
from google.colab import files
files.download("product_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>