In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("="*70)
print("PHASE 1: PRODUCT RECOMMENDATION MODEL PIPELINE")
print("="*70)


  from pandas.core import (


PHASE 1: PRODUCT RECOMMENDATION MODEL PIPELINE


In [6]:


# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("\n[STEP 1] Loading Datasets...")

# Load customer social profiles
social_df = pd.read_csv('customer_social_profiles.csv')
print(f"âœ“ Social Profiles loaded: {social_df.shape[0]} rows, {social_df.shape[1]} columns")

# Load customer transactions
trans_df = pd.read_csv('customer_transactions.csv')
print(f"âœ“ Transactions loaded: {trans_df.shape[0]} rows, {trans_df.shape[1]} columns")

display(social_df.head())
display(trans_df.head())


[STEP 1] Loading Datasets...
âœ“ Social Profiles loaded: 155 rows, 5 columns
âœ“ Transactions loaded: 150 rows, 6 columns


Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,A178,LinkedIn,74,4.9,Positive
1,A190,Twitter,82,4.8,Neutral
2,A150,Facebook,96,1.6,Positive
3,A162,Twitter,89,2.6,Positive
4,A197,Twitter,92,2.3,Neutral


Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3


In [7]:

# ============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================
print("\n[STEP 2] Exploratory Data Analysis")
print("-" * 70)

print("\nðŸ“Š SOCIAL PROFILES DATASET:")
print(social_df.head())
print("\nData Types:")
print(social_df.dtypes)
print("\nSummary Statistics:")
print(social_df.describe())
print("\nMissing Values:")
print(social_df.isnull().sum())
print(f"\nUnique Customers: {social_df['customer_id_new'].nunique()}")

print("\nðŸ“Š TRANSACTIONS DATASET:")
print(trans_df.head())
print("\nData Types:")
print(trans_df.dtypes)
print("\nSummary Statistics:")
print(trans_df.describe())
print("\nMissing Values:")
print(trans_df.isnull().sum())
print(f"\nUnique Customers: {trans_df['customer_id_legacy'].nunique()}")

# Check product categories
print("\nðŸŽ¯ Product Categories Distribution:")
print(trans_df['product_category'].value_counts())


[STEP 2] Exploratory Data Analysis
----------------------------------------------------------------------

ðŸ“Š SOCIAL PROFILES DATASET:
  customer_id_new social_media_platform  engagement_score  \
0            A178              LinkedIn                74   
1            A190               Twitter                82   
2            A150              Facebook                96   
3            A162               Twitter                89   
4            A197               Twitter                92   

   purchase_interest_score review_sentiment  
0                      4.9         Positive  
1                      4.8          Neutral  
2                      1.6         Positive  
3                      2.6         Positive  
4                      2.3          Neutral  

Data Types:
customer_id_new             object
social_media_platform       object
engagement_score             int64
purchase_interest_score    float64
review_sentiment            object
dtype: object

Summary Statisti

In [8]:
# ============================================================================
# STEP 3: DATA CLEANING
# ============================================================================
print("\n[STEP 3] Data Cleaning")
print("-" * 70)

# Clean Social Profiles
print("\nðŸ§¹ Cleaning Social Profiles...")
social_clean = social_df.copy()
print(f"Duplicates found: {social_clean.duplicated().sum()}")
# Keep duplicates as customers may have multiple social platforms

# Clean Transactions
print("\nðŸ§¹ Cleaning Transactions...")
trans_clean = trans_df.copy()
print(f"Missing customer_rating values: {trans_clean['customer_rating'].isnull().sum()}")

# Fill missing ratings with median
median_rating = trans_clean['customer_rating'].median()
trans_clean['customer_rating'].fillna(median_rating, inplace=True)
print(f"âœ“ Filled missing ratings with median: {median_rating}")

# Convert date to datetime
trans_clean['purchase_date'] = pd.to_datetime(trans_clean['purchase_date'])
print("âœ“ Converted purchase_date to datetime")



[STEP 3] Data Cleaning
----------------------------------------------------------------------

ðŸ§¹ Cleaning Social Profiles...
Duplicates found: 5

ðŸ§¹ Cleaning Transactions...
Missing customer_rating values: 10
âœ“ Filled missing ratings with median: 3.0
âœ“ Converted purchase_date to datetime


In [9]:
# ============================================================================
# STEP 4: KEY INSIGHT - ID MISMATCH
# ============================================================================
print("\n[STEP 4] Identifying ID Pattern")
print("-" * 70)

# Check ID formats
print("Social Profile IDs (sample):", social_clean['customer_id_new'].head().tolist())
print("Transaction IDs (sample):", trans_clean['customer_id_legacy'].head().tolist())

print("\nðŸ’¡ KEY FINDING: ID Format Mismatch Detected!")
print("   Social IDs use format: A### (e.g., A150)")
print("   Transaction IDs use format: ### (e.g., 150)")
print("\n   Solution: Remove 'A' prefix from social IDs for merging")

# Create unified customer ID in social data
social_clean['customer_id'] = social_clean['customer_id_new'].str.replace('A', '').astype(int)
trans_clean['customer_id'] = trans_clean['customer_id_legacy']

print(f"âœ“ Unified customer_id created in both datasets")


[STEP 4] Identifying ID Pattern
----------------------------------------------------------------------
Social Profile IDs (sample): ['A178', 'A190', 'A150', 'A162', 'A197']
Transaction IDs (sample): [151, 192, 114, 171, 160]

ðŸ’¡ KEY FINDING: ID Format Mismatch Detected!
   Social IDs use format: A### (e.g., A150)
   Transaction IDs use format: ### (e.g., 150)

   Solution: Remove 'A' prefix from social IDs for merging
âœ“ Unified customer_id created in both datasets


In [10]:
# ============================================================================
# STEP 5: FEATURE ENGINEERING (BEFORE MERGE)
# ============================================================================
print("\n[STEP 5] Feature Engineering - Pre-Merge")
print("-" * 70)

# Aggregate social media features by customer
print("\nðŸ“ˆ Aggregating social media metrics by customer...")
social_agg = social_clean.groupby('customer_id').agg({
    'engagement_score': ['mean', 'max', 'std'],
    'purchase_interest_score': ['mean', 'max'],
    'social_media_platform': 'count'  # Number of platforms
}).reset_index()

# Flatten column names
social_agg.columns = ['customer_id', 'avg_engagement', 'max_engagement', 'std_engagement',
                      'avg_purchase_interest', 'max_purchase_interest', 'num_platforms']

# Fill NaN std with 0 (single platform users)
social_agg['std_engagement'].fillna(0, inplace=True)

# Encode sentiment (most common sentiment per customer)
sentiment_mode = social_clean.groupby('customer_id')['review_sentiment'].agg(
    lambda x: x.value_counts().index[0]
).reset_index()
sentiment_mode.columns = ['customer_id', 'dominant_sentiment']

# Merge sentiment
social_agg = social_agg.merge(sentiment_mode, on='customer_id', how='left')

print(f"âœ“ Created {social_agg.shape[1]-1} social media features")
print(social_agg.head())

# Aggregate transaction features by customer
print("\nðŸ“ˆ Aggregating transaction metrics by customer...")
trans_agg = trans_clean.groupby('customer_id').agg({
    'purchase_amount': ['mean', 'sum', 'count'],
    'customer_rating': 'mean',
    'product_category': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0]
}).reset_index()

# Flatten column names
trans_agg.columns = ['customer_id', 'avg_purchase_amount', 'total_spent', 
                     'purchase_frequency', 'avg_rating', 'most_purchased_category']

print(f"âœ“ Created {trans_agg.shape[1]-1} transaction features")
print(trans_agg.head())



[STEP 5] Feature Engineering - Pre-Merge
----------------------------------------------------------------------

ðŸ“ˆ Aggregating social media metrics by customer...
âœ“ Created 7 social media features
   customer_id  avg_engagement  max_engagement  std_engagement  \
0          100       77.000000              81        5.656854   
1          101       68.000000              68        0.000000   
2          102       51.000000              51        0.000000   
3          103       64.333333              77       10.969655   
4          104       83.000000              91        7.549834   

   avg_purchase_interest  max_purchase_interest  num_platforms  \
0               4.400000                    4.4              2   
1               1.000000                    1.0              1   
2               4.800000                    4.8              1   
3               2.866667                    3.6              3   
4               2.933333                    4.6              3   

  d

In [11]:
# ============================================================================
# STEP 6: MERGE DATASETS
# ============================================================================
print("\n[STEP 6] Merging Datasets")
print("-" * 70)

# Merge on customer_id
merged_df = social_agg.merge(trans_agg, on='customer_id', how='inner')
print(f"âœ“ Merged dataset shape: {merged_df.shape}")
print(f"âœ“ Customers in merged data: {merged_df['customer_id'].nunique()}")

# Display merged data
print("\nðŸ“‹ Merged Dataset Preview:")
print(merged_df.head(10))
print("\nMerged Data Info:")
print(merged_df.info())



[STEP 6] Merging Datasets
----------------------------------------------------------------------
âœ“ Merged dataset shape: (61, 13)
âœ“ Customers in merged data: 61

ðŸ“‹ Merged Dataset Preview:
   customer_id  avg_engagement  max_engagement  std_engagement  \
0          100       77.000000              81        5.656854   
1          101       68.000000              68        0.000000   
2          102       51.000000              51        0.000000   
3          103       64.333333              77       10.969655   
4          104       83.000000              91        7.549834   
5          105       51.000000              52        1.414214   
6          106       99.000000              99        0.000000   
7          107       77.000000              96       17.349352   
8          111       98.000000              98        0.000000   
9          113       74.000000              98       33.941125   

   avg_purchase_interest  max_purchase_interest  num_platforms  \
0          

In [12]:
# ============================================================================
# STEP 7: POST-MERGE FEATURE ENGINEERING
# ============================================================================
print("\n[STEP 7] Feature Engineering - Post-Merge")
print("-" * 70)

# Create interaction features
merged_df['engagement_x_interest'] = merged_df['avg_engagement'] * merged_df['avg_purchase_interest']
merged_df['spending_per_purchase'] = merged_df['total_spent'] / merged_df['purchase_frequency']
merged_df['engagement_diversity'] = merged_df['std_engagement'] / (merged_df['avg_engagement'] + 1)

print("âœ“ Created 3 interaction features:")
print("  - engagement_x_interest")
print("  - spending_per_purchase")
print("  - engagement_diversity")


[STEP 7] Feature Engineering - Post-Merge
----------------------------------------------------------------------
âœ“ Created 3 interaction features:
  - engagement_x_interest
  - spending_per_purchase
  - engagement_diversity
