In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import pickle
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures in this notebook
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Pipeline banner: title framed by two 70-character rules
print("\n".join([
    "=" * 70,
    "PHASE 1: PRODUCT RECOMMENDATION MODEL PIPELINE",
    "=" * 70,
]))


PHASE 1: PRODUCT RECOMMENDATION MODEL PIPELINE


In [None]:


# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
# Reads both source CSVs from the working directory, reports their shapes and
# shows the first rows of each frame.
print("\n[STEP 1] Loading Datasets...")

# Load customer social profiles
social_df = pd.read_csv('customer_social_profiles.csv')
# Status marker is a real check mark; the previous literal was the UTF-8 bytes
# of that character mis-decoded as cp1252.
print(f"✓ Social Profiles loaded: {social_df.shape[0]} rows, {social_df.shape[1]} columns")

# Load customer transactions
trans_df = pd.read_csv('customer_transactions.csv')
print(f"✓ Transactions loaded: {trans_df.shape[0]} rows, {trans_df.shape[1]} columns")

# Rich preview of the first rows of each dataset
display(social_df.head())
display(trans_df.head())


[STEP 1] Loading Datasets...
✓ Social Profiles loaded: 155 rows, 5 columns
✓ Transactions loaded: 150 rows, 6 columns


Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,A178,LinkedIn,74,4.9,Positive
1,A190,Twitter,82,4.8,Neutral
2,A150,Facebook,96,1.6,Positive
3,A162,Twitter,89,2.6,Positive
4,A197,Twitter,92,2.3,Neutral


Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3


In [None]:

# ============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================
def _print_dataset_overview(df, title, customer_id_col):
    """Print a plain-text overview of one dataset.

    The report contains, in order: the first rows, the column dtypes, the
    ``describe()`` summary, the per-column null counts and the number of
    distinct values in ``customer_id_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarise.
    title : str
        Heading printed above the report.
    customer_id_col : str
        Name of the column whose distinct values are counted as customers.
    """
    print(f"\n📊 {title}:")
    print(df.head())
    print("\nData Types:")
    print(df.dtypes)
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())
    print(f"\nUnique Customers: {df[customer_id_col].nunique()}")


print("\n[STEP 2] Exploratory Data Analysis")
print("-" * 70)

# Same report for both frames; only the heading and the id column differ.
_print_dataset_overview(social_df, "SOCIAL PROFILES DATASET", "customer_id_new")
_print_dataset_overview(trans_df, "TRANSACTIONS DATASET", "customer_id_legacy")

# Check product categories
print("\n🎯 Product Categories Distribution:")
print(trans_df['product_category'].value_counts())


[STEP 2] Exploratory Data Analysis
----------------------------------------------------------------------

📊 SOCIAL PROFILES DATASET:
  customer_id_new social_media_platform  engagement_score  \
0            A178              LinkedIn                74   
1            A190               Twitter                82   
2            A150              Facebook                96   
3            A162               Twitter                89   
4            A197               Twitter                92   

   purchase_interest_score review_sentiment  
0                      4.9         Positive  
1                      4.8          Neutral  
2                      1.6         Positive  
3                      2.6         Positive  
4                      2.3          Neutral  

Data Types:
customer_id_new             object
social_media_platform       object
engagement_score             int64
purchase_interest_score    float64
review_sentiment            object
dtype: object

Summary Statisti

In [None]:
# ============================================================================
# STEP 3: DATA CLEANING
# ============================================================================
# Builds cleaned copies of both frames (social_clean, trans_clean); the frames
# loaded in STEP 1 are left untouched.
print("\n[STEP 3] Data Cleaning")
print("-" * 70)

# Clean Social Profiles
print("\n🧹 Cleaning Social Profiles...")
social_clean = social_df.copy()
print(f"Duplicates found: {social_clean.duplicated().sum()}")
# Duplicates are reported but kept.
# NOTE(review): duplicated() without `subset` only flags rows that repeat in
# every column, so the rows counted above are exact repeats rather than one
# customer appearing on several platforms. They are retained here to preserve
# existing behaviour; confirm whether they should be dropped.

# Clean Transactions
print("\n🧹 Cleaning Transactions...")
trans_clean = trans_df.copy()
print(f"Missing customer_rating values: {trans_clean['customer_rating'].isnull().sum()}")

# Fill missing ratings with median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves trans_clean unchanged under Copy-on-Write.
median_rating = trans_clean['customer_rating'].median()
trans_clean['customer_rating'] = trans_clean['customer_rating'].fillna(median_rating)
print(f"✓ Filled missing ratings with median: {median_rating}")

# Convert date to datetime
trans_clean['purchase_date'] = pd.to_datetime(trans_clean['purchase_date'])
print("✓ Converted purchase_date to datetime")



[STEP 3] Data Cleaning
----------------------------------------------------------------------

🧹 Cleaning Social Profiles...
Duplicates found: 5

🧹 Cleaning Transactions...
Missing customer_rating values: 10
✓ Filled missing ratings with median: 3.0
✓ Converted purchase_date to datetime
