In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Data Viz & Regular Expression Libraries :

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Scikit-Learn ML Libraries :

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report

# Boosting Algorithm Libraries :

# from xgboost                          import XGBClassifier
# from catboost                         import CatBoostClassifier
# from lightgbm                         import LGBMClassifier
from sklearn.ensemble                 import RandomForestClassifier, VotingClassifier
from sklearn.metrics                  import accuracy_score
from sklearn.model_selection          import StratifiedKFold,KFold, train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('data/processed/train.csv')


  df = pd.read_csv('data/processed/train.csv')


# Reduce memory 

In [5]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [4]:

df = reduce_memory_usage(df)

Mem. usage decreased to 781.94 Mb (53.6% reduction)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4606311 entries, 0 to 4606310
Data columns (total 48 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   date                                  object 
 1   customer_id                           int32  
 2   employee_index                        object 
 3   country_of_residence                  object 
 4   gender                                object 
 5   age                                   object 
 6   registration_date                     object 
 7   new_customer                          float16
 8   seniority                             object 
 9   primary_customer                      float16
 10  last_primary_date                     object 
 11  customer_type                         object 
 12  relation_type                         object 
 13  residence_index                       object 
 14  foreigner_index                       object 
 15  spouse_index   

# Clean dataset

In [8]:
def clean_dataset(df):
    """
    Clean and preprocess the banking dataset
    
    Parameters:
    df: pandas DataFrame - Raw dataset
    
    Returns:
    df: pandas DataFrame - Cleaned dataset
    """
    
    print("🧹 CLEANING DATASET...")
    print("="*40)
    
    # =============================
    # 1. HANDLE MISSING VALUES
    # =============================
    print("1️⃣ Handling missing values...")
    
    # Fill missing values for payroll indicators
    # These columns indicate if customer has payroll/pension products
    missing_before = df[['payroll_final_label', 'pensions_2_final_label']].isnull().sum()
    df.fillna(value={
        'payroll_final_label': 0,
        'pensions_2_final_label': 0
    }, inplace=True)
    
    print(f"   ✅ Filled payroll_final_label: {missing_before['payroll_final_label']} missing → 0")
    print(f"   ✅ Filled pensions_2_final_label: {missing_before['pensions_2_final_label']} missing → 0")
    
    # =============================
    # 2. CREATE CUSTOMER TENURE FEATURE
    # =============================
    print("\n2️⃣ Creating customer tenure feature...")
    
    # Convert date columns to datetime
    df['date'] = pd.to_datetime(df['date'])
    df['registration_date'] = pd.to_datetime(df['registration_date'])
    
    # Calculate days since registration (customer tenure)
    days_column = (df['date'] - df['registration_date']).dt.days
    
    # Insert the new column at position 6
    df.insert(loc=6, column='customer_tenure_days', value=days_column)
    
    print(f"   ✅ Created 'customer_tenure_days' feature")
    print(f"   📊 Range: {days_column.min()} to {days_column.max()} days")
    
    # Drop the original registration_date column to save memory
    df.drop(columns=['registration_date'], inplace=True)
    print(f"   🗑️ Dropped 'registration_date' column")
    
    # =============================
    # 3. CONVERT LAST_PRIMARY_DATE TO BINARY INDICATOR
    # =============================
    print("\n3️⃣ Converting last_primary_date to binary indicator...")
    
    # Convert last_primary_date to binary: 1 if date exists, 0 if null
    # This indicates if customer was ever a primary customer
    original_nulls = df['last_primary_date'].isnull().sum()
    df['was_primary_customer'] = df['last_primary_date'].apply(
        lambda x: 1 if pd.notnull(x) else 0
    )
    
    print(f"   ✅ Created 'was_primary_customer' binary feature")
    print(f"   📊 {original_nulls:,} nulls → 0, {len(df) - original_nulls:,} dates → 1")
    
    # Drop the original last_primary_date column
    df.drop(columns=['last_primary_date'], inplace=True)
    print(f"   🗑️ Dropped 'last_primary_date' column")
    
    # =============================
    # 4. REMOVE CONSTANT/DUPLICATE COLUMNS
    # =============================
    print("\n4️⃣ Removing constant and duplicate columns...")
    
    # Remove address_type if it has the same value for all customers
    if 'address_type' in df.columns:
        unique_values = df['address_type'].nunique()
        if unique_values <= 1:
            df.drop(columns=['address_type'], inplace=True)
            print(f"   🗑️ Dropped 'address_type' (constant value)")
        else:
            print(f"   ✅ Kept 'address_type' ({unique_values} unique values)")
    
    # Remove province_code as it's duplicate of province_name
    if 'province_code' in df.columns and 'province_name' in df.columns:
        df.drop(columns=['province_code'], inplace=True)
        print(f"   🗑️ Dropped 'province_code' (duplicate of province_name)")
    
    # =============================
    # 5. CLEAN NUMERIC COLUMNS
    # =============================
    print("\n5️⃣ Cleaning numeric columns...")
    
    # Convert age column - handle 'NA' strings
    if 'age' in df.columns:
        age_before = df['age'].dtype
        df['age'] = pd.to_numeric(df['age'], errors='coerce')
        age_nulls = df['age'].isnull().sum()
        print(f"   ✅ Cleaned 'age': {age_before} → numeric ({age_nulls:,} nulls)")
    
    # Convert seniority column - handle 'NA' strings and negative values
    if 'seniority' in df.columns:
        seniority_before = df['seniority'].dtype
        df['seniority'] = pd.to_numeric(df['seniority'], errors='coerce')
        
        # Handle special negative values (often -999999 means missing)
        negative_values = (df['seniority'] < 0).sum()
        if negative_values > 0:
            df['seniority'] = df['seniority'].where(df['seniority'] >= 0, np.nan)
            print(f"   ⚠️ Converted {negative_values:,} negative seniority values to NaN")
        
        seniority_nulls = df['seniority'].isnull().sum()
        print(f"   ✅ Cleaned 'seniority': {seniority_before} → numeric ({seniority_nulls:,} nulls)")
    
    # =============================
    # 6. SUMMARY STATISTICS
    # =============================
    print("\n📊 CLEANUP SUMMARY:")
    print("-" * 40)
    print(f"   Final shape: {df.shape}")
    print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    print(f"   Null values: {df.isnull().sum().sum():,}")
    
    # Show data types
    print(f"\n📋 DATA TYPES AFTER CLEANUP:")
    for dtype in df.dtypes.value_counts().index:
        count = df.dtypes.value_counts()[dtype]
        print(f"   {dtype}: {count} columns")
    
    return df


In [8]:
df = clean_dataset(df)

🧹 CLEANING DATASET...
1️⃣ Handling missing values...
   ✅ Filled payroll_final_label: 0 missing → 0
   ✅ Filled pensions_2_final_label: 0 missing → 0

2️⃣ Creating customer tenure feature...
   ✅ Created 'customer_tenure_days' feature
   📊 Range: -3.0 to 7498.0 days
   🗑️ Dropped 'registration_date' column

3️⃣ Converting last_primary_date to binary indicator...
   ✅ Created 'was_primary_customer' binary feature
   📊 4,600,165 nulls → 0, 6,146 dates → 1
   🗑️ Dropped 'last_primary_date' column

4️⃣ Removing constant and duplicate columns...
   🗑️ Dropped 'address_type' (constant value)
   🗑️ Dropped 'province_code' (duplicate of province_name)

5️⃣ Cleaning numeric columns...
   ✅ Cleaned 'age': object → numeric (27,734 nulls)
   ⚠️ Converted 14 negative seniority values to NaN
   ✅ Cleaned 'seniority': object → numeric (27,748 nulls)

📊 CLEANUP SUMMARY:
----------------------------------------
   Final shape: (4606311, 46)
   Memory usage: 2935.7 MB
   Null values: 5,979,487

📋 DATA T

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4606311 entries, 0 to 4606310
Data columns (total 46 columns):
 #   Column                                Dtype         
---  ------                                -----         
 0   date                                  datetime64[ns]
 1   customer_id                           int32         
 2   employee_index                        object        
 3   country_of_residence                  object        
 4   gender                                object        
 5   age                                   float64       
 6   customer_tenure_days                  float64       
 7   new_customer                          float16       
 8   seniority                             float64       
 9   primary_customer                      float16       
 10  customer_type                         object        
 11  relation_type                         object        
 12  residence_index                       object        
 13  foreigner_in

# Filter data for TypeI 

In [15]:
payment_account_labels = [
    'current_accounts_final_label',
    'payroll_accounts_final_label',
    'junior_accounts_final_label',
    'more_particular_accounts_final_label',
    'particular_accounts_final_label',
    'particular_plus_accounts_final_label',
    'home_account_final_label',
    'payroll_final_label',
    'e_account_final_label'
]

customer_features = [
    'date', 'customer_id', 'employee_index', 'country_of_residence', 'gender',
    'age', 'customer_tenure_days', 'seniority', 'residence_index',  # Added customer_tenure_days
    'foreigner_index', 'spouse_index', 'channel', 'deceased_index', 
    'province_name', 'segment', 'was_primary_customer'  # Added was_primary_customer
]


In [11]:
def filter_data(df) :
    # Lấy khách hàng không có tài khoản thanh toán nào (không có giá trị -1)
    mask = ~(df[payment_account_labels] == -1).any(axis=1)
    
    # Các cột cần giữ lại
    columns_to_keep = customer_features + payment_account_labels
    
    # Lọc dữ liệu
    df = df.loc[mask, columns_to_keep]
    
    print(f"Tổng số khách hàng: {len(df):,}")
    print(f"Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: {len(df):,}")
    
    # Kiểm tra distribution
    print("\n📊 Distribution check:")
    for col in payment_account_labels:
        unique_vals = df[col].unique()
        print(f"  {col}: {unique_vals}")
    
    return df

In [14]:
df = filter_data(df)

Tổng số khách hàng: 442,736
Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: 442,736

📊 Distribution check:
  current_accounts_final_label: [0 1]
  payroll_accounts_final_label: [0 1]
  junior_accounts_final_label: [0 1]
  more_particular_accounts_final_label: [0 1]
  particular_accounts_final_label: [0 1]
  particular_plus_accounts_final_label: [0 1]
  home_account_final_label: [0]
  payroll_final_label: [0 1]
  e_account_final_label: [0 1]


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 442736 entries, 3 to 4606305
Data columns (total 25 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   date                                  442736 non-null  datetime64[ns]
 1   customer_id                           442736 non-null  int32         
 2   employee_index                        415002 non-null  object        
 3   country_of_residence                  415002 non-null  object        
 4   gender                                414987 non-null  object        
 5   age                                   415002 non-null  float64       
 6   customer_tenure_days                  415002 non-null  float64       
 7   seniority                             414995 non-null  float64       
 8   residence_index                       415002 non-null  object        
 9   foreigner_index                       415002 non-null  object  

# Clean memory 

In [16]:

import gc

def cleanup_memory():
    """
    Clean up unused memory and optimize DataFrame
    """
    print("🧹 MEMORY CLEANUP...")
    print("="*40)
    
    # Force garbage collection
    collected = gc.collect()
    print(f"   🗑️ Garbage collected: {collected} objects")
    
    # Get memory info
    import psutil
    import os
    
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    memory_mb = memory_info.rss / 1024 / 1024
    
    print(f"   💾 Current memory usage: {memory_mb:.1f} MB")
    
    return memory_mb

memory_before = cleanup_memory()

🧹 MEMORY CLEANUP...
   🗑️ Garbage collected: 2127 objects
   💾 Current memory usage: 1390.0 MB


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 442736 entries, 3 to 4606305
Data columns (total 25 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   date                                  442736 non-null  datetime64[ns]
 1   customer_id                           442736 non-null  int32         
 2   employee_index                        415002 non-null  object        
 3   country_of_residence                  415002 non-null  object        
 4   gender                                414987 non-null  object        
 5   age                                   415002 non-null  float64       
 6   customer_tenure_days                  415002 non-null  float64       
 7   seniority                             414995 non-null  float64       
 8   residence_index                       415002 non-null  object        
 9   foreigner_index                       415002 non-null  object  

In [19]:
# Cột target (label) - tài khoản thanh toán
tar_cols = [
    'current_accounts_final_label',
    'payroll_accounts_final_label',
    'junior_accounts_final_label',
    'more_particular_accounts_final_label',
    'particular_accounts_final_label',
    'particular_plus_accounts_final_label',
    'home_account_final_label',
    'payroll_final_label',
    'e_account_final_label'
]

# Cột số (numeric)
num_cols = [
    'age',
    'seniority',
    'new_customer',
    'address_type',
    'province_code',
    'activity_index',
    'income'
]

# Cột phân loại (categorical / object / category)
cat_cols = [
    'employee_index',
    'country_of_residence',
    'gender',
    'primary_customer',
    'customer_type',
    'relation_type',
    'residence_index',
    'foreigner_index',
    'spouse_index',
    'channel',
    'deceased_index',
    'province_name',
    'segment'
]


# Feature Engineering

In [21]:
def create_enhanced_features(df):
    df_enhanced = df.copy()
    
    print("🔧 CREATING ENHANCED FEATURES...")
    print("="*50)
    
    # Check available columns first
    available_cols = df_enhanced.columns.tolist()
    print(f"📋 Available columns: {len(available_cols)}")
    
    # =============================
    # 1. TIME-BASED FEATURES
    # =============================
    print("📅 Creating time-based features...")
    
    # Extract từ date column
    if 'date' in df_enhanced.columns:
        df_enhanced['year'] = df_enhanced['date'].dt.year
        df_enhanced['month'] = df_enhanced['date'].dt.month
        df_enhanced['quarter'] = df_enhanced['date'].dt.quarter
        df_enhanced['day_of_week'] = df_enhanced['date'].dt.dayofweek
        df_enhanced['is_weekend'] = (df_enhanced['day_of_week'] >= 5).astype(int)
        df_enhanced['is_month_end'] = (df_enhanced['date'].dt.day >= 25).astype(int)
        df_enhanced['is_quarter_end'] = df_enhanced['date'].dt.month.isin([3, 6, 9, 12]).astype(int)
    
    # Customer tenure features (using existing customer_tenure_days)
    if 'customer_tenure_days' in df_enhanced.columns:
        df_enhanced['years_since_registration'] = df_enhanced['customer_tenure_days'] / 365.25
        
        # Customer tenure categories
        df_enhanced['tenure_category'] = pd.cut(
            df_enhanced['customer_tenure_days'],
            bins=[-1, 90, 365, 1095, 2190, np.inf],
            labels=['Very_New', 'New', 'Medium', 'Long', 'Very_Long']
        )
    
    # =============================
    # 2. DEMOGRAPHIC FEATURES
    # =============================
    print("👥 Creating demographic features...")
    
    # Age-based features
    if 'age' in df_enhanced.columns:
        df_enhanced['age_group'] = pd.cut(
            df_enhanced['age'],
            bins=[0, 25, 35, 45, 55, 65, 100],
            labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
        )
        
        age_young = (df_enhanced['age'] >= 18) & (df_enhanced['age'] <= 30)
        age_middle = (df_enhanced['age'] >= 31) & (df_enhanced['age'] <= 50)
        age_senior = (df_enhanced['age'] >= 60)
        
        df_enhanced['is_young_adult'] = age_young.fillna(False).astype(int)
        df_enhanced['is_middle_aged'] = age_middle.fillna(False).astype(int)
        df_enhanced['is_senior'] = age_senior.fillna(False).astype(int)
        df_enhanced['age_squared'] = df_enhanced['age'] ** 2
    
    # =============================
    # 3. BANKING RELATIONSHIP FEATURES  
    # =============================
    print("🏦 Creating banking relationship features...")
    
    # Seniority-based features
    if 'seniority' in df_enhanced.columns:
        df_enhanced['seniority_years'] = df_enhanced['seniority'] / 12
        df_enhanced['seniority_category'] = pd.cut(
            df_enhanced['seniority'],
            bins=[-1, 0, 6, 12, 24, 60, np.inf],
            labels=['New', 'Very_Short', 'Short', 'Medium', 'Long', 'Very_Long']
        )
        
        seniority_new = df_enhanced['seniority'] <= 6
        seniority_established = df_enhanced['seniority'] >= 24
        
        df_enhanced['is_new_relationship'] = seniority_new.fillna(False).astype(int)
        df_enhanced['is_established_relationship'] = seniority_established.fillna(False).astype(int)
    
    # Customer status features (check if columns exist)
    df_enhanced['is_employee'] = 0
    if 'employee_index' in df_enhanced.columns:
        df_enhanced['is_employee'] = (df_enhanced['employee_index'] == 1).astype(int)
    
    df_enhanced['is_primary_customer_flag'] = 0  
    if 'was_primary_customer' in df_enhanced.columns:
        df_enhanced['is_primary_customer_flag'] = df_enhanced['was_primary_customer']
    
    # Geographic features
    if 'country_of_residence' in df_enhanced.columns:
        domestic = df_enhanced['country_of_residence'] == 'ES'
        df_enhanced['is_domestic'] = domestic.fillna(False).astype(int)
    
    if 'foreigner_index' in df_enhanced.columns:
        foreigner = df_enhanced['foreigner_index'] == 1
        df_enhanced['is_foreigner'] = foreigner.fillna(False).astype(int)
    
    # =============================
    # 4. PRODUCT PORTFOLIO FEATURES
    # =============================
    print("📊 Creating product portfolio features...")
    
    # Define tar_cols as payment_account_labels
    tar_cols = payment_account_labels
    
    # Total products owned
    df_enhanced['total_products'] = df_enhanced[tar_cols].sum(axis=1)
    df_enhanced['has_any_product'] = (df_enhanced['total_products'] > 0).astype(int)
    df_enhanced['is_single_product'] = (df_enhanced['total_products'] == 1).astype(int)
    df_enhanced['is_multi_product'] = (df_enhanced['total_products'] > 1).astype(int)
    
    # Product diversity
    df_enhanced['product_diversity_ratio'] = df_enhanced['total_products'] / len(tar_cols)
    
    # Product category features
    current_products = ['current_accounts_final_label']
    savings_products = ['payroll_accounts_final_label', 'junior_accounts_final_label']
    premium_products = ['particular_plus_accounts_final_label', 'more_particular_accounts_final_label']
    
    df_enhanced['has_current_account'] = df_enhanced[current_products].sum(axis=1)
    df_enhanced['has_savings_account'] = df_enhanced[savings_products].sum(axis=1)
    df_enhanced['has_premium_account'] = df_enhanced[premium_products].sum(axis=1)
    
    # =============================
    # 5. INTERACTION FEATURES
    # =============================
    print("🔗 Creating interaction features...")
    
    # Age-Seniority interactions
    if 'age' in df_enhanced.columns and 'seniority' in df_enhanced.columns:
        age_filled = df_enhanced['age'].fillna(0)
        seniority_filled = df_enhanced['seniority'].fillna(0)
        
        df_enhanced['age_seniority_interaction'] = (age_filled * seniority_filled) / 100
        df_enhanced['seniority_per_age'] = seniority_filled / (age_filled + 1)
    
    # Age-Tenure interactions
    if 'age' in df_enhanced.columns and 'customer_tenure_days' in df_enhanced.columns:
        age_filled = df_enhanced['age'].fillna(0)
        tenure_filled = df_enhanced['customer_tenure_days'].fillna(0)
        
        df_enhanced['age_tenure_ratio'] = age_filled / (tenure_filled/365 + 1)
    
    # =============================
    # 6. BEHAVIORAL FEATURES
    # =============================
    print("🎯 Creating behavioral features...")
    
    # Channel preference
    if 'channel' in df_enhanced.columns:
        channel_mapping = {
            'KAT': 'Traditional',
            'KFC': 'Phone', 
            'KHE': 'Digital',
            'KHM': 'Mobile',
            'KHN': 'Online'
        }
        df_enhanced['channel_type'] = df_enhanced['channel'].map(channel_mapping).fillna('Other')
        df_enhanced['is_digital_channel'] = df_enhanced['channel_type'].isin(['Digital', 'Mobile', 'Online']).astype(int)
    
    # Customer segment enhancement
    if 'segment' in df_enhanced.columns:
        df_enhanced['is_vip_segment'] = df_enhanced['segment'].str.contains('VIP', na=False).astype(int)
        df_enhanced['is_university_segment'] = df_enhanced['segment'].str.contains('UNIVERSITY', na=False).astype(int)
    
    # =============================
    # 7. RISK & STABILITY FEATURES
    # =============================
    print("⚖️ Creating risk and stability features...")
    
    # Customer stability score
    stability_score = 0
    
    if 'seniority' in df_enhanced.columns:
        seniority_stable = df_enhanced['seniority'] >= 12
        stability_score += seniority_stable.fillna(False).astype(int)
    
    if 'age' in df_enhanced.columns:
        age_stable = df_enhanced['age'] >= 30
        stability_score += age_stable.fillna(False).astype(int)
    
    stability_score += df_enhanced['is_primary_customer_flag']
    stability_score += df_enhanced['is_employee']
    
    df_enhanced['customer_stability_score'] = stability_score
    df_enhanced['is_stable_customer'] = (stability_score >= 2).astype(int)
    
    # Potential value score
    potential_score = 0
    
    if 'age' in df_enhanced.columns:
        age_prime = (df_enhanced['age'] >= 25) & (df_enhanced['age'] <= 55)
        potential_score += age_prime.fillna(False).astype(int)
    
    potential_score += df_enhanced['is_digital_channel'] if 'is_digital_channel' in df_enhanced.columns else 0
    potential_score += df_enhanced['is_domestic'] if 'is_domestic' in df_enhanced.columns else 0
    
    df_enhanced['customer_potential_score'] = potential_score
    df_enhanced['is_high_potential'] = (potential_score >= 2).astype(int)
    
    print(f"✅ Feature engineering completed!")
    print(f"Original features: {df.shape[1]}")
    print(f"Enhanced features: {df_enhanced.shape[1]}")
    print(f"New features added: {df_enhanced.shape[1] - df.shape[1]}")
    
    return df_enhanced

# =============================
# APPLY FEATURE ENGINEERING
# =============================
print("🔍 Checking available columns before feature engineering:")
print(f"Available columns: {df.columns.tolist()}")

df_enhanced = create_enhanced_features(df)

# =============================
# VERIFY DATA TYPES
# =============================
print("\n🔍 CHECKING NEW FEATURES DATA TYPES...")
new_features = [col for col in df_enhanced.columns if col not in df.columns]
print(f"New features created: {len(new_features)}")

for feature in new_features[:15]:  # Show first 15 new features
    print(f"   {feature}: {df_enhanced[feature].dtype}")

print(f"\n📊 SAMPLE OF ENHANCED DATA:")
if len(new_features) > 0:
    sample_features = new_features[:5] if len(new_features) >= 5 else new_features
    print(df_enhanced[sample_features].head())
else:
    print("No new features were created.")

🔍 Checking available columns before feature engineering:
Available columns: ['date', 'customer_id', 'employee_index', 'country_of_residence', 'gender', 'age', 'customer_tenure_days', 'seniority', 'residence_index', 'foreigner_index', 'spouse_index', 'channel', 'deceased_index', 'province_name', 'segment', 'was_primary_customer', 'current_accounts_final_label', 'payroll_accounts_final_label', 'junior_accounts_final_label', 'more_particular_accounts_final_label', 'particular_accounts_final_label', 'particular_plus_accounts_final_label', 'home_account_final_label', 'payroll_final_label', 'e_account_final_label']
🔧 CREATING ENHANCED FEATURES...
📋 Available columns: 25
📅 Creating time-based features...
👥 Creating demographic features...
🏦 Creating banking relationship features...
📊 Creating product portfolio features...
🔗 Creating interaction features...
🎯 Creating behavioral features...
⚖️ Creating risk and stability features...
✅ Feature engineering completed!
Original features: 25
Enhanc

In [22]:
del df

In [23]:
df_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 442736 entries, 3 to 4606305
Data columns (total 66 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   date                                  442736 non-null  datetime64[ns]
 1   customer_id                           442736 non-null  int32         
 2   employee_index                        415002 non-null  object        
 3   country_of_residence                  415002 non-null  object        
 4   gender                                414987 non-null  object        
 5   age                                   415002 non-null  float64       
 6   customer_tenure_days                  415002 non-null  float64       
 7   seniority                             414995 non-null  float64       
 8   residence_index                       415002 non-null  object        
 9   foreigner_index                       415002 non-null  object  

In [24]:
# =============================
# FEATURE LISTS FOR MODELING
# =============================

# Numeric features (engineered features)
numeric_features = [
    'year', 'month', 'quarter', 'day_of_week', 'is_weekend', 'is_month_end', 'is_quarter_end',
    'days_since_registration', 'registration_year', 'registration_month', 'years_since_registration',
    'is_young_adult', 'is_middle_aged', 'is_senior', 'age_squared',
    'income_vs_median', 'is_high_income', 'is_low_income', 'log_income',
    'seniority_years', 'is_new_relationship', 'is_established_relationship',
    'is_primary_customer', 'is_new_customer', 'is_active', 'is_domestic', 'is_foreigner',
    'total_products', 'has_any_product', 'is_single_product', 'is_multi_product',
    'product_diversity_ratio', 'has_current_account', 'has_savings_account', 'has_premium_account',
    'age_income_interaction', 'income_per_age', 'seniority_income_interaction', 'income_growth_proxy',
    'young_high_income', 'senior_established', 'is_digital_channel',
    'is_vip_segment', 'is_university_segment', 'customer_stability_score',
    'is_stable_customer', 'customer_potential_score', 'is_high_potential'
]

# Categorical features
categorical_features = [
    'tenure_category', 'age_group', 'income_quartile', 'seniority_category', 'channel_type'
]

# Target columns
target_cols = [
    'current_accounts_final_label',
    'payroll_accounts_final_label',
    'junior_accounts_final_label',
    'more_particular_accounts_final_label',
    'particular_accounts_final_label',
    'particular_plus_accounts_final_label',
    'home_account_final_label',
    'payroll_final_label',
    'e_account_final_label'
]

print(f"✅ FEATURE LISTS DEFINED:")
print(f"   Numeric features: {len(numeric_features)}")
print(f"   Categorical features: {len(categorical_features)}")
print(f"   Target columns: {len(target_cols)}")
print(f"   Total features: {len(numeric_features) + len(categorical_features)}")


✅ FEATURE LISTS DEFINED:
   Numeric features: 48
   Categorical features: 5
   Target columns: 9
   Total features: 53


In [25]:
# =============================
# PREPARE DATA FOR MODELING
# =============================

# Features to exclude from modeling
exclude_features = [
    'date', 'customer_id', 'registration_date', 'last_primary_date'
]

# Filter features that exist in dataframe
available_num_cols = [col for col in numeric_features if col in df_enhanced.columns and col not in exclude_features]
available_cat_cols = [col for col in categorical_features if col in df_enhanced.columns and col not in exclude_features]

print(f"🎯 FINAL FEATURE SELECTION:")
print(f"Available numeric features: {len(available_num_cols)}")
print(f"Available categorical features: {len(available_cat_cols)}")
print(f"Target labels: {len(target_cols)}")

# Prepare X and y
X = df_enhanced[available_num_cols + available_cat_cols]
y = df_enhanced[target_cols]

print(f"\n📋 DATA SHAPE:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Check for any remaining issues
print(f"\n🔍 DATA QUALITY CHECK:")
print(f"Missing values in X: {X.isnull().sum().sum()}")
print(f"Missing values in y: {y.isnull().sum().sum()}")

🎯 FINAL FEATURE SELECTION:
Available numeric features: 32
Available categorical features: 4
Target labels: 9

📋 DATA SHAPE:
X shape: (442736, 36)
y shape: (442736, 9)

🔍 DATA QUALITY CHECK:
Missing values in X: 169929
Missing values in y: 0


# Test dataset

In [29]:
test_10_2015 = pd.read_csv('data/processed/test_10_2015.csv')
test_11_2015 = pd.read_csv('data/processed/test_11_2015.csv')
test_12_2015 = pd.read_csv('data/processed/test_12_2015.csv')

  test_10_2015 = pd.read_csv('data/processed/test_10_2015.csv')
  test_11_2015 = pd.read_csv('data/processed/test_11_2015.csv')
  test_12_2015 = pd.read_csv('data/processed/test_12_2015.csv')


In [27]:
test_10_2015 = reduce_memory_usage(test_10_2015)
test_11_2015 = reduce_memory_usage(test_11_2015)
test_12_2015 = reduce_memory_usage(test_12_2015)

Mem. usage decreased to 903.96 Mb (53.6% reduction)
Mem. usage decreased to 950.77 Mb (53.6% reduction)
Mem. usage decreased to 998.31 Mb (53.6% reduction)


In [30]:
test_10_2015 = clean_dataset(test_10_2015)
test_11_2015 = clean_dataset(test_11_2015)
test_12_2015 = clean_dataset(test_12_2015)


🧹 CLEANING DATASET...
1️⃣ Handling missing values...
   ✅ Filled payroll_final_label: 0 missing → 0
   ✅ Filled pensions_2_final_label: 0 missing → 0

2️⃣ Creating customer tenure feature...
   ✅ Created 'customer_tenure_days' feature
   📊 Range: -3.0 to 7590.0 days
   🗑️ Dropped 'registration_date' column

3️⃣ Converting last_primary_date to binary indicator...
   ✅ Created 'was_primary_customer' binary feature
   📊 5,316,273 nulls → 0, 8,870 dates → 1
   🗑️ Dropped 'last_primary_date' column

4️⃣ Removing constant and duplicate columns...
   🗑️ Dropped 'address_type' (constant value)
   🗑️ Dropped 'province_code' (duplicate of province_name)

5️⃣ Cleaning numeric columns...
   ✅ Cleaned 'age': object → numeric (9,750 nulls)
   ⚠️ Converted 14 negative seniority values to NaN
   ✅ Cleaned 'seniority': object → numeric (9,764 nulls)

📊 CLEANUP SUMMARY:
----------------------------------------
   Final shape: (5325143, 46)
   Memory usage: 4408.8 MB
   Null values: 6,810,911

📋 DATA TYP

In [31]:
test_10_2015 = filter_data(test_10_2015)
test_11_2015 = filter_data(test_11_2015)
test_12_2015 = filter_data(test_12_2015)


Tổng số khách hàng: 1,017,191
Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: 1,017,191

📊 Distribution check:
  current_accounts_final_label: [0 1]
  payroll_accounts_final_label: [0 1]
  junior_accounts_final_label: [0 1]
  more_particular_accounts_final_label: [0 1]
  particular_accounts_final_label: [0 1]
  particular_plus_accounts_final_label: [0 1]
  home_account_final_label: [0]
  payroll_final_label: [0 1]
  e_account_final_label: [0 1]
Tổng số khách hàng: 1,223,624
Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: 1,223,624

📊 Distribution check:
  current_accounts_final_label: [0 1]
  payroll_accounts_final_label: [0 1]
  junior_accounts_final_label: [0 1]
  more_particular_accounts_final_label: [0 1]
  particular_accounts_final_label: [0 1]
  particular_plus_accounts_final_label: [0 1]
  home_account_final_label: [0]
  payroll_final_label: [0 1]
  e_account_final_label: [0 1]
Tổng số khách hàng: 1,425,848
Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: 1,425,848

📊 Distr

In [32]:
test_10_2015 = create_enhanced_features(test_10_2015)
test_11_2015 = create_enhanced_features(test_11_2015)
test_12_2015 = create_enhanced_features(test_12_2015)

🔧 CREATING ENHANCED FEATURES...
📋 Available columns: 25
📅 Creating time-based features...
👥 Creating demographic features...
🏦 Creating banking relationship features...
📊 Creating product portfolio features...
🔗 Creating interaction features...
🎯 Creating behavioral features...
⚖️ Creating risk and stability features...
✅ Feature engineering completed!
Original features: 25
Enhanced features: 66
New features added: 41
🔧 CREATING ENHANCED FEATURES...
📋 Available columns: 25
📅 Creating time-based features...
👥 Creating demographic features...
🏦 Creating banking relationship features...
📊 Creating product portfolio features...
🔗 Creating interaction features...
🎯 Creating behavioral features...
⚖️ Creating risk and stability features...
✅ Feature engineering completed!
Original features: 25
Enhanced features: 66
New features added: 41
🔧 CREATING ENHANCED FEATURES...
📋 Available columns: 25
📅 Creating time-based features...
👥 Creating demographic features...
🏦 Creating banking relationship

# Evaluate function

In [24]:
def apk(actual, predicted, k=7):
    """
    Computes the average precision at k.
    
    Parameters:
    actual : list - A list of actual relevant items
    predicted : list - A list of predicted items ordered by rank
    k : int - The maximum number of predicted elements
    
    Returns:
    score : double - The average precision at k
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=7):
    """
    Computes the mean average precision at k.
    
    Parameters:
    actual : list of lists - A list of lists of actual relevant items
    predicted : list of lists - A list of lists of predicted items ordered by rank
    k : int - The maximum number of predicted elements
    
    Returns:
    score : double - The mean average precision at k
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def evaluate_recommendations(actual_products, predicted_products, k=7):
    """
    Evaluate recommendation system using MAP@k
    
    Parameters:
    actual_products : dict - {user_id: [list of actual products]}
    predicted_products : dict - {user_id: [list of predicted products]}
    k : int - Number of recommendations to consider
    
    Returns:
    map_score : float - MAP@k score
    """
    actual_list = []
    predicted_list = []
    
    for user_id in actual_products.keys():
        if user_id in predicted_products:
            actual_list.append(actual_products[user_id])
            predicted_list.append(predicted_products[user_id])
        else:
            actual_list.append(actual_products[user_id])
            predicted_list.append([])  # No predictions for this user
    
    map_score = mapk(actual_list, predicted_list, k)
    
    print(f" MAP@{k}: {map_score:.4f}")
    print(f" Users evaluated: {len(actual_list):,}")
    print(f" Coverage: {len([p for p in predicted_list if p]) / len(predicted_list):.2%}")
    
    return map_score

In [23]:

def precision_at_k(actual, predicted, k=7):
    """
    Computes the precision at k.

    Parameters:
    actual : list - A list of actual relevant items
    predicted : list - A list of predicted items ordered by rank
    k : int - The maximum number of predicted elements

    Returns:
    precision : double - The precision at k
    """
    if len(predicted) > k:
        predicted = predicted[:k]
    
    if not predicted:
        return 0.0
    
    # Đếm số dự đoán đúng trong top-k
    correct = len(set(predicted) & set(actual))
    return correct / k


def evaluate_recommendations_with_precision(actual_products, predicted_products, k=7):
    """
    Evaluate recommendation system using MAP@k and Precision@k.

    Parameters:
    actual_products : dict - {user_id: [list of actual products]}
    predicted_products : dict - {user_id: [list of predicted products]}
    k : int - Number of recommendations to consider

    Returns:
    metrics : dict - {'MAP@k': ..., 'Precision@k': ...}
    """
    actual_list = []
    predicted_list = []

    for user_id in actual_products.keys():
        if user_id in predicted_products:
            actual_list.append(actual_products[user_id])
            predicted_list.append(predicted_products[user_id])
        else:
            actual_list.append(actual_products[user_id])
            predicted_list.append([])  # No predictions for this user

    # MAP@k
    map_score = mapk(actual_list, predicted_list, k)

    # Precision@k trung bình
    precision_scores = [
        precision_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)
    ]
    avg_precision = np.mean(precision_scores)

    print(f" MAP@{k}: {map_score:.4f}")
    print(f" Precision@{k}: {avg_precision:.4f}")
    print(f" Users evaluated: {len(actual_list):,}")
    print(f" Coverage: {len([p for p in predicted_list if p]) / len(predicted_list):.2%}")

    return {f"MAP@{k}": map_score, f"Precision@{k}": avg_precision}

# Data Pipline

In [35]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Updated preprocessors
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),   # Fill missing values with mean
    ('scaler', StandardScaler())                   # Scale numerical data
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))     # One-hot encode categorical data
])

# Updated ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, available_num_cols),
        ('cat', categorical_transformer, available_cat_cols)
    ],
    remainder='drop'  # Drop any remaining columns
)

print(f"✅ Preprocessor updated with:")
print(f"   - {len(available_num_cols)} numeric features")
print(f"   - {len(available_cat_cols)} categorical features")

✅ Preprocessor updated with:
   - 32 numeric features
   - 4 categorical features


In [36]:
# =============================
# MULTI-LABEL CLASSIFICATION MODELS - FIXED
# =============================

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score
import time

# Define all models for multi-label classification
models = {
    'Random Forest': MultiOutputClassifier(
        RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )
    )
}

# Add XGBoost if available - FIXED with proper parameters
try:
    from xgboost import XGBClassifier
    models['XGBoost'] = MultiOutputClassifier(
        XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            verbosity=0,
            base_score=0.5,  # Fix for the logistic loss error
            objective='binary:logistic',  # Explicitly set objective
            eval_metric='logloss'  # Set evaluation metric
        )
    )
    print("✅ XGBoost added to models (with fixed parameters)")
except ImportError:
    print("⚠️ XGBoost not available - install with: pip install xgboost")

# Add LightGBM if available - with proper parameters
try:
    from lightgbm import LGBMClassifier
    models['LightGBM'] = MultiOutputClassifier(
        LGBMClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            verbose=-1,
            objective='binary',  # Explicitly set objective
            boosting_type='gbdt'
        )
    )
    print("✅ LightGBM added to models")
except ImportError:
    print("⚠️ LightGBM not available - install with: pip install lightgbm")

# Create complete pipelines for all models
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

print(f"\n🤖 MODELS PREPARED: {list(pipelines.keys())}")
print(f"📊 Total models to train: {len(pipelines)}")

✅ XGBoost added to models (with fixed parameters)
✅ LightGBM added to models

🤖 MODELS PREPARED: ['Random Forest', 'XGBoost', 'LightGBM']
📊 Total models to train: 3


In [41]:
# =============================
# COMPREHENSIVE MODEL EVALUATION FUNCTION - UPDATED WITH MAP@2,3,4,5
# =============================

def evaluate_model_comprehensive(model_pipeline, model_name, processed_test_data, tar_cols):
    """
    Comprehensive evaluation function for recommendation system
    
    Parameters:
    model_pipeline : sklearn.pipeline.Pipeline - Trained model pipeline
    model_name : str - Name of the model
    processed_test_data : dict - Dictionary of processed test datasets
    tar_cols : list - List of target column names
    
    Returns:
    results_df : pd.DataFrame - Comprehensive results DataFrame
    detailed_results : dict - Detailed results for further analysis
    """
    
    print(f"\n🧪 EVALUATING {model_name.upper()}...")
    print("="*60)
    
    all_results = []
    detailed_results = {
        'model_name': model_name,
        'monthly_results': {},
        'overall_metrics': {},
        'all_actual_products': {},
        'all_predicted_products': {}
    }
    
    customer_offset = 0
    
    for month_name, data in processed_test_data.items():
        print(f"\n📊 Evaluating on {month_name}...")
        print("-" * 40)
        
        X_test_month = data['X']
        y_test_month = data['y']
        
        if y_test_month is not None:
            # Make predictions
            start_time = time.time()
            y_pred_month = model_pipeline.predict(X_test_month)
            prediction_time = time.time() - start_time
            
            # Get prediction probabilities for ranking
            y_pred_proba = model_pipeline.predict_proba(X_test_month)
            
            # Calculate standard classification metrics
            test_accuracy = accuracy_score(y_test_month, y_pred_month)
            hamming_loss_score = hamming_loss(y_test_month, y_pred_month)
            jaccard_score_macro = jaccard_score(y_test_month, y_pred_month, average='macro')
            
            print(f"   📈 Standard Metrics:")
            print(f"      Accuracy: {test_accuracy:.4f}")
            print(f"      Hamming Loss: {hamming_loss_score:.4f}")
            print(f"      Jaccard (Macro): {jaccard_score_macro:.4f}")
            
            # =============================
            # PREPARE RECOMMENDATION DATA
            # =============================
            actual_products = {}
            predicted_products = {}
            
            for i in range(len(y_test_month)):
                customer_id = i
                
                # Get actual products
                actual = []
                for j, product in enumerate(tar_cols):
                    if y_test_month.iloc[i, j] == 1:
                        actual.append(product)
                actual_products[customer_id] = actual
                
                # Get predicted products (ranked by probability)
                product_scores = []
                for j, product in enumerate(tar_cols):
                    # Get probability of class 1
                    prob = y_pred_proba[j][i][1] if len(y_pred_proba[j][i]) > 1 else y_pred_proba[j][i][0]
                    product_scores.append((product, prob))
                
                # Sort by probability (descending)
                product_scores.sort(key=lambda x: x[1], reverse=True)
                predicted = [product for product, score in product_scores]
                predicted_products[customer_id] = predicted
                
                # Add to overall data with offset
                detailed_results['all_actual_products'][customer_id + customer_offset] = actual
                detailed_results['all_predicted_products'][customer_id + customer_offset] = predicted
            
            customer_offset += len(actual_products)
            
            # =============================
            # CALCULATE RECOMMENDATION METRICS - UPDATED K VALUES
            # =============================
            print(f"\n   🎯 Recommendation Metrics:")
            
            month_metrics = {}
            for k in [2, 3, 4, 5]:  # CHANGED: Test với k=2,3,4,5
                # Calculate MAP@k and Precision@k
                actual_list = list(actual_products.values())
                predicted_list = list(predicted_products.values())
                
                # MAP@k
                map_score = mapk(actual_list, predicted_list, k)
                
                # Precision@k
                precision_scores = [precision_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)]
                avg_precision = np.mean(precision_scores)
                
                month_metrics[f'MAP@{k}'] = map_score
                month_metrics[f'Precision@{k}'] = avg_precision
                
                print(f"      k={k}: MAP={map_score:.4f}, Precision={avg_precision:.4f}")
            
            # =============================
            # MONTHLY STATISTICS
            # =============================
            total_customers = len(actual_products)
            customers_with_products = len([p for p in actual_products.values() if p])
            avg_products_per_customer = np.mean([len(p) for p in actual_products.values()])
            
            # Product distribution
            product_counts = {}
            for products in actual_products.values():
                for product in products:
                    product_counts[product] = product_counts.get(product, 0) + 1
            
            # Store monthly result
            month_result = {
                'Model': model_name,
                'Month': month_name,
                'Total_Customers': total_customers,
                'Customers_With_Products': customers_with_products,
                'Coverage_Rate': customers_with_products / total_customers,
                'Avg_Products_Per_Customer': avg_products_per_customer,
                'Test_Accuracy': test_accuracy,
                'Hamming_Loss': hamming_loss_score,
                'Jaccard_Macro': jaccard_score_macro,
                'Prediction_Time': prediction_time,
                **month_metrics  # Add all MAP@k and Precision@k metrics
            }
            
            all_results.append(month_result)
            
            # Store detailed results
            detailed_results['monthly_results'][month_name] = {
                'actual_products': actual_products,
                'predicted_products': predicted_products,
                'metrics': month_metrics,
                'stats': month_result,
                'product_distribution': product_counts
            }
            
            print(f"   ⏱️ Time: {prediction_time:.2f}s")
        
        else:
            print(f"   ⚠️ No ground truth available for {month_name}")
    
    # =============================
    # CALCULATE OVERALL METRICS - UPDATED K VALUES
    # =============================
    if detailed_results['all_actual_products'] and detailed_results['all_predicted_products']:
        print(f"\n🎯 Overall Performance:")
        
        overall_metrics = {}
        for k in [2, 3, 4, 5]:  # CHANGED: Overall metrics với k=2,3,4,5
            actual_list = list(detailed_results['all_actual_products'].values())
            predicted_list = list(detailed_results['all_predicted_products'].values())
            
            # Overall MAP@k and Precision@k
            map_score = mapk(actual_list, predicted_list, k)
            precision_scores = [precision_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)]
            avg_precision = np.mean(precision_scores)
            
            overall_metrics[f'Overall_MAP@{k}'] = map_score
            overall_metrics[f'Overall_Precision@{k}'] = avg_precision
            
            print(f"   Overall k={k}: MAP={map_score:.4f}, Precision={avg_precision:.4f}")
        
        # Add overall row
        overall_result = {
            'Model': model_name,
            'Month': 'OVERALL',
            'Total_Customers': len(detailed_results['all_actual_products']),
            'Customers_With_Products': len([p for p in detailed_results['all_actual_products'].values() if p]),
            'Coverage_Rate': len([p for p in detailed_results['all_actual_products'].values() if p]) / len(detailed_results['all_actual_products']),
            'Avg_Products_Per_Customer': np.mean([len(p) for p in detailed_results['all_actual_products'].values()]),
            'Test_Accuracy': np.mean([r['Test_Accuracy'] for r in all_results]),
            'Hamming_Loss': np.mean([r['Hamming_Loss'] for r in all_results]),
            'Jaccard_Macro': np.mean([r['Jaccard_Macro'] for r in all_results]),
            'Prediction_Time': sum([r['Prediction_Time'] for r in all_results]),
        }
        
        # Add overall metrics - UPDATED K VALUES
        for k in [2, 3, 4, 5]:  # CHANGED: Add k=2,3,4,5 to overall results
            overall_result[f'MAP@{k}'] = overall_metrics[f'Overall_MAP@{k}']
            overall_result[f'Precision@{k}'] = overall_metrics[f'Overall_Precision@{k}']
        
        all_results.append(overall_result)
        detailed_results['overall_metrics'] = overall_metrics
    
    # Create DataFrame
    results_df = pd.DataFrame(all_results)
    
    return results_df, detailed_results

In [42]:
# =============================
# PREPARE TEST DATA FORMAT FOR EVALUATION
# =============================

def format_test_data_for_evaluation(test_df, available_num_cols, available_cat_cols, tar_cols):
    """
    Format processed test data để có structure phù hợp với evaluation function
    """
    print(f"📋 Formatting test data: {len(test_df):,} samples")
    
    # Thêm missing features nếu cần
    test_formatted = test_df.copy()
    
    # Add missing numeric features
    for feature in available_num_cols:
        if feature not in test_formatted.columns:
            test_formatted[feature] = 0
            print(f"   ⚠️ Added missing numeric: {feature}")
    
    # Add missing categorical features  
    for feature in available_cat_cols:
        if feature not in test_formatted.columns:
            test_formatted[feature] = 'Unknown'
            print(f"   ⚠️ Added missing categorical: {feature}")
    
    # Prepare X và y
    X_test = test_formatted[available_num_cols + available_cat_cols]
    
    # Check target columns
    if all(col in test_formatted.columns for col in tar_cols):
        y_test = test_formatted[tar_cols]
        print(f"   ✅ Ground truth available: {y_test.shape}")
    else:
        y_test = None
        print(f"   ⚠️ No ground truth available")
    
    return {'X': X_test, 'y': y_test}

# Format test datasets
processed_test_data = {
    'October 2015': format_test_data_for_evaluation(
        test_10_2015, available_num_cols, available_cat_cols, tar_cols
    ),
    'November 2015': format_test_data_for_evaluation(
        test_11_2015, available_num_cols, available_cat_cols, tar_cols
    ),
    'December 2015': format_test_data_for_evaluation(
        test_12_2015, available_num_cols, available_cat_cols, tar_cols
    )
}

print(f"\n✅ TEST DATA FORMATTED FOR EVALUATION!")
for month, data in processed_test_data.items():
    x_shape = data['X'].shape
    y_shape = data['y'].shape if data['y'] is not None else 'None'
    print(f"   {month}: X={x_shape}, y={y_shape}")

📋 Formatting test data: 1,017,191 samples
   ✅ Ground truth available: (1017191, 9)
📋 Formatting test data: 1,223,624 samples
   ✅ Ground truth available: (1223624, 9)
📋 Formatting test data: 1,425,848 samples
   ✅ Ground truth available: (1425848, 9)

✅ TEST DATA FORMATTED FOR EVALUATION!
   October 2015: X=(1017191, 36), y=(1017191, 9)
   November 2015: X=(1223624, 36), y=(1223624, 9)
   December 2015: X=(1425848, 36), y=(1425848, 9)


In [43]:
# =============================
# TRAIN ALL MODELS AND EVALUATE COMPREHENSIVELY
# =============================




X_train = df_enhanced[available_num_cols + available_cat_cols]
y_train = df_enhanced[tar_cols]

# Storage for all results
all_model_results = []
all_detailed_results = {}
training_results = {}

print("🚀 TRAINING AND EVALUATING ALL MODELS...")
print("="*70)

for model_name, pipeline in pipelines.items():
    print(f"\n🤖 TRAINING {model_name.upper()}...")
    print("-" * 50)
    
    # =============================
    # TRAIN MODEL
    # =============================
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Training metrics
    print("📊 Calculating training metrics...")
    y_pred_train = pipeline.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    
    print(f"   ✅ Training completed in {training_time:.2f}s")
    print(f"   📈 Train Accuracy: {train_accuracy:.4f}")
    
    # Store training results
    training_results[model_name] = {
        'pipeline': pipeline,
        'train_accuracy': train_accuracy,
        'training_time': training_time
    }
    
    # =============================
    # EVALUATE ON TEST SETS
    # =============================
    model_results_df, detailed_results = evaluate_model_comprehensive(
        pipeline, model_name, processed_test_data, tar_cols
    )
    
    # Add training info to results
    model_results_df['Training_Time'] = training_time
    model_results_df['Train_Accuracy'] = train_accuracy
    
    # Store results
    all_model_results.append(model_results_df)
    all_detailed_results[model_name] = detailed_results
    
    print(f"\n✅ {model_name} evaluation completed!")
    print(f"🔧 Pipeline saved for future use")

# =============================
# COMBINE ALL RESULTS
# =============================
print(f"\n🎉 ALL MODELS TRAINED AND EVALUATED!")
final_results_df = pd.concat(all_model_results, ignore_index=True)

print(f"📊 SUMMARY:")
print(f"   Models evaluated: {len(pipelines)}")
print(f"   Total result rows: {len(final_results_df)}")
print(f"   Test datasets: {len(processed_test_data)}")

🚀 TRAINING AND EVALUATING ALL MODELS...

🤖 TRAINING RANDOM FOREST...
--------------------------------------------------
📊 Calculating training metrics...
   ✅ Training completed in 51.28s
   📈 Train Accuracy: 0.9982

🧪 EVALUATING RANDOM FOREST...

📊 Evaluating on October 2015...
----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9964
      Hamming Loss: 0.0005
      Jaccard (Macro): 0.5829

   🎯 Recommendation Metrics:
      k=2: MAP=0.0140, Precision=0.0149
      k=3: MAP=0.0150, Precision=0.0111
      k=4: MAP=0.0153, Precision=0.0086
      k=5: MAP=0.0154, Precision=0.0070
   ⏱️ Time: 12.56s

📊 Evaluating on November 2015...
----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9971
      Hamming Loss: 0.0004
      Jaccard (Macro): 0.5918

   🎯 Recommendation Metrics:
      k=2: MAP=0.0130, Precision=0.0137
      k=3: MAP=0.0138, Precision=0.0101
      k=4: MAP=0.0141, Precision=0.0078
      k=5: MAP=0.0141, Precision=0.0063
   ⏱️ Time: 15.11s

📊 Evaluating on December 2015...
----------------------------------------
   📈 Standard Metrics:
      Accuracy: 0.9975
      Hamming Loss: 0.0003
      Jaccard (Macro): 0.5630

   🎯 Recommendation Metrics:
      k=2: MAP=0.0103, Precision=0.0109
      k=3: MAP=0.0110, Precision=0.0080
      k=4: MAP=0.0112, Precision=0.0062
      k=5: MAP=0.0112, Precision=0.0050
   ⏱️ Time: 17.60s

🎯 Overall Performance:
   Overall k=2: MAP=0.0122, Precision=0.0130
   Overall k=3: MAP=0.0130, Precision=0.0095
   Overall k=4: MAP=0.0133, Precision=0.0074
   Overall k=5: MAP=0.0133, Precision=0.0060

✅ Random Forest evaluation completed!
🔧 Pipeline saved for future use

🤖 TRAINING XGBOOST...
---

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9984
      Hamming Loss: 0.0003
      Jaccard (Macro): 0.6893

   🎯 Recommendation Metrics:
      k=2: MAP=0.0301, Precision=0.0168
      k=3: MAP=0.0302, Precision=0.0116
      k=4: MAP=0.0303, Precision=0.0088
      k=5: MAP=0.0303, Precision=0.0070
   ⏱️ Time: 2.32s

📊 Evaluating on November 2015...
----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9987
      Hamming Loss: 0.0002
      Jaccard (Macro): 0.6788

   🎯 Recommendation Metrics:
      k=2: MAP=0.0277, Precision=0.0153
      k=3: MAP=0.0278, Precision=0.0105
      k=4: MAP=0.0279, Precision=0.0080
      k=5: MAP=0.0279, Precision=0.0064
   ⏱️ Time: 2.92s

📊 Evaluating on December 2015...
----------------------------------------
   📈 Standard Metrics:
      Accuracy: 0.9989
      Hamming Loss: 0.0002
      Jaccard (Macro): 0.6557

   🎯 Recommendation Metrics:
      k=2: MAP=0.0219, Precision=0.0120
      k=3: MAP=0.0221, Precision=0.0083
      k=4: MAP=0.0221, Precision=0.0062
      k=5: MAP=0.0221, Precision=0.0050
   ⏱️ Time: 3.29s

🎯 Overall Performance:
   Overall k=2: MAP=0.0261, Precision=0.0144
   Overall k=3: MAP=0.0262, Precision=0.0099
   Overall k=4: MAP=0.0263, Precision=0.0075
   Overall k=5: MAP=0.0263, Precision=0.0060

✅ XGBoost evaluation completed!
🔧 Pipeline saved for future use

🤖 TRAINING LIGHTGBM...
----------



   ✅ Training completed in 7.57s
   📈 Train Accuracy: 0.9923

🧪 EVALUATING LIGHTGBM...

📊 Evaluating on October 2015...
----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9906
      Hamming Loss: 0.0012
      Jaccard (Macro): 0.3378

   🎯 Recommendation Metrics:
      k=2: MAP=0.0275, Precision=0.0154
      k=3: MAP=0.0275, Precision=0.0105
      k=4: MAP=0.0276, Precision=0.0079
      k=5: MAP=0.0276, Precision=0.0064
   ⏱️ Time: 3.71s

📊 Evaluating on November 2015...
----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   📈 Standard Metrics:
      Accuracy: 0.9903
      Hamming Loss: 0.0012
      Jaccard (Macro): 0.3299

   🎯 Recommendation Metrics:
      k=2: MAP=0.0254, Precision=0.0141
      k=3: MAP=0.0254, Precision=0.0096
      k=4: MAP=0.0254, Precision=0.0072
      k=5: MAP=0.0255, Precision=0.0058
   ⏱️ Time: 4.65s

📊 Evaluating on December 2015...
----------------------------------------




   📈 Standard Metrics:
      Accuracy: 0.9913
      Hamming Loss: 0.0011
      Jaccard (Macro): 0.3186

   🎯 Recommendation Metrics:
      k=2: MAP=0.0201, Precision=0.0110
      k=3: MAP=0.0200, Precision=0.0074
      k=4: MAP=0.0201, Precision=0.0056
      k=5: MAP=0.0201, Precision=0.0045
   ⏱️ Time: 5.57s

🎯 Overall Performance:
   Overall k=2: MAP=0.0239, Precision=0.0132
   Overall k=3: MAP=0.0239, Precision=0.0090
   Overall k=4: MAP=0.0239, Precision=0.0068
   Overall k=5: MAP=0.0240, Precision=0.0055

✅ LightGBM evaluation completed!
🔧 Pipeline saved for future use

🎉 ALL MODELS TRAINED AND EVALUATED!
📊 SUMMARY:
   Models evaluated: 3
   Total result rows: 12
   Test datasets: 3


In [45]:
# =============================
# COMPREHENSIVE MODEL COMPARISON AND SUMMARY
# =============================

# Display final results
print("📊 DETAILED RESULTS TABLE:")
print("="*80)
display(final_results_df)

# =============================
# MODEL PERFORMANCE SUMMARY
# =============================

print("\n🏆 MODEL PERFORMANCE SUMMARY:")
print("="*80)

# Overall performance comparison (filter OVERALL rows only)
overall_results = final_results_df[final_results_df['Month'] == 'OVERALL'].copy()

if not overall_results.empty:
    # Sort by MAP@5 (primary metric)
    overall_results_sorted = overall_results.sort_values('MAP@5', ascending=False)
    
    print("\n🎯 RANKING BY MAP@5:")
    print("-" * 50)
    for i, (_, row) in enumerate(overall_results_sorted.iterrows(), 1):
        print(f"{i}. {row['Model']}: MAP@5 = {row['MAP@5']:.4f}")
    
    # Key metrics comparison table
    print(f"\n📋 KEY METRICS COMPARISON:")
    comparison_metrics = ['Model', 'MAP@2', 'MAP@3', 'MAP@4', 'MAP@5', 
                         'Precision@2', 'Precision@3', 'Precision@4', 'Precision@5',
                         'Test_Accuracy', 'Training_Time', 'Prediction_Time']
    
    summary_table = overall_results[comparison_metrics].round(4)
    display(summary_table)

# =============================
# DETAILED PERFORMANCE ANALYSIS
# =============================

print(f"\n🔍 DETAILED PERFORMANCE ANALYSIS:")
print("=" * 80)

for model_name in overall_results['Model'].values:
    print(f"\n🤖 {model_name.upper()} PERFORMANCE:")
    print("-" * 50)
    
    model_data = overall_results[overall_results['Model'] == model_name].iloc[0]
    
    # MAP@k scores
    print(f"   📈 MAP Scores:")
    for k in [2, 3, 4, 5]:
        map_score = model_data[f'MAP@{k}']
        print(f"      MAP@{k}: {map_score:.4f}")
    
    # Precision@k scores  
    print(f"   🎯 Precision Scores:")
    for k in [2, 3, 4, 5]:
        precision_score = model_data[f'Precision@{k}']
        print(f"      Precision@{k}: {precision_score:.4f}")
    
    # Other metrics
    print(f"   ⚡ Performance Metrics:")
    print(f"      Test Accuracy: {model_data['Test_Accuracy']:.4f}")
    print(f"      Training Time: {model_data['Training_Time']:.2f}s")
    print(f"      Prediction Time: {model_data['Prediction_Time']:.2f}s")
    print(f"      Total Customers: {model_data['Total_Customers']:,}")

# =============================
# MONTHLY PERFORMANCE BREAKDOWN
# =============================

print(f"\n📅 MONTHLY PERFORMANCE BREAKDOWN:")
print("=" * 80)

monthly_results = final_results_df[final_results_df['Month'] != 'OVERALL'].copy()

if not monthly_results.empty:
    # Group by month and show MAP@5 for each model
    for month in monthly_results['Month'].unique():
        print(f"\n📊 {month}:")
        month_data = monthly_results[monthly_results['Month'] == month]
        month_sorted = month_data.sort_values('MAP@5', ascending=False)
        
        for _, row in month_sorted.iterrows():
            print(f"   {row['Model']}: MAP@5={row['MAP@5']:.4f}, "
                  f"Precision@5={row['Precision@5']:.4f}, "
                  f"Accuracy={row['Test_Accuracy']:.4f}")

# =============================
# BEST MODEL IDENTIFICATION
# =============================

print(f"\n🏅 BEST MODEL IDENTIFICATION:")
print("=" * 80)

if not overall_results.empty:
    # Best model by different metrics
    best_models = {}
    
    metrics_to_check = ['MAP@2', 'MAP@3', 'MAP@4', 'MAP@5', 
                       'Precision@2', 'Precision@3', 'Precision@4', 'Precision@5',
                       'Test_Accuracy']
    
    for metric in metrics_to_check:
        best_idx = overall_results[metric].idxmax()
        best_model = overall_results.loc[best_idx, 'Model']
        best_score = overall_results.loc[best_idx, metric]
        best_models[metric] = (best_model, best_score)
        print(f"🏆 Best {metric}: {best_model} ({best_score:.4f})")
    
    # Overall winner (most wins)
    model_wins = {}
    for metric, (model, score) in best_models.items():
        model_wins[model] = model_wins.get(model, 0) + 1
    
    overall_winner = max(model_wins.items(), key=lambda x: x[1])
    print(f"\n🎖️ OVERALL WINNER: {overall_winner[0]} ({overall_winner[1]} best scores)")

# =============================
# PERFORMANCE INSIGHTS
# =============================

print(f"\n💡 PERFORMANCE INSIGHTS:")
print("=" * 80)

if not overall_results.empty:
    # MAP@k trend analysis
    print(f"📈 MAP@k Trends:")
    for _, row in overall_results.iterrows():
        model = row['Model']
        map_scores = [row[f'MAP@{k}'] for k in [2, 3, 4, 5]]
        trend = "📈 Improving" if all(map_scores[i] <= map_scores[i+1] for i in range(len(map_scores)-1)) \
                else "📉 Declining" if all(map_scores[i] >= map_scores[i+1] for i in range(len(map_scores)-1)) \
                else "📊 Mixed"
        print(f"   {model}: {trend} (MAP@2={map_scores[0]:.3f} → MAP@5={map_scores[3]:.3f})")
    
    # Speed vs Accuracy analysis
    print(f"\n⚡ Speed vs Accuracy Analysis:")
    for _, row in overall_results.iterrows():
        model = row['Model']
        speed_score = 1 / (row['Training_Time'] + row['Prediction_Time'])  # Inverse of total time
        accuracy_score = row['MAP@5']
        efficiency_ratio = accuracy_score / row['Training_Time'] * 1000  # MAP@5 per second * 1000
        print(f"   {model}: Efficiency Ratio = {efficiency_ratio:.2f} (MAP@5/training_time)")



📊 DETAILED RESULTS TABLE:


Unnamed: 0,Model,Month,Total_Customers,Customers_With_Products,Coverage_Rate,Avg_Products_Per_Customer,Test_Accuracy,Hamming_Loss,Jaccard_Macro,Prediction_Time,MAP@2,Precision@2,MAP@3,Precision@3,MAP@4,Precision@4,MAP@5,Precision@5,Training_Time,Train_Accuracy
0,Random Forest,October 2015,1017191,31170,0.030643,0.035039,0.996449,0.000457,0.582871,12.560119,0.014018,0.014931,0.015,0.011062,0.015285,0.008593,0.015368,0.006958,51.279366,0.998161
1,Random Forest,November 2015,1223624,34541,0.028228,0.031845,0.997064,0.00037,0.591792,15.111525,0.012989,0.013746,0.013819,0.010097,0.014063,0.007825,0.014131,0.006329,51.279366,0.998161
2,Random Forest,December 2015,1425848,31935,0.022397,0.024978,0.997516,0.000309,0.563023,17.599548,0.010338,0.010875,0.010964,0.007951,0.011163,0.006166,0.011202,0.004972,51.279366,0.998161
3,Random Forest,OVERALL,3666663,97646,0.026631,0.030061,0.99701,0.000379,0.579229,45.271192,0.012243,0.012958,0.013036,0.00953,0.013274,0.007393,0.013335,0.005976,51.279366,0.998161
4,XGBoost,October 2015,1017191,31170,0.030643,0.035039,0.998448,0.000275,0.689261,2.321311,0.030052,0.016814,0.030203,0.011567,0.030267,0.008754,0.03027,0.007007,7.906032,0.999808
5,XGBoost,November 2015,1223624,34541,0.028228,0.031845,0.998652,0.000234,0.678838,2.920624,0.027675,0.015315,0.027828,0.010539,0.027871,0.007957,0.027874,0.006369,7.906032,0.999808
6,XGBoost,December 2015,1425848,31935,0.022397,0.024978,0.9989,0.000196,0.655683,3.294385,0.021937,0.01202,0.022065,0.00827,0.0221,0.006242,0.0221,0.004994,7.906032,0.999808
7,XGBoost,OVERALL,3666663,97646,0.026631,0.030061,0.998666,0.000235,0.674594,8.53632,0.026103,0.01445,0.026246,0.009942,0.026291,0.007511,0.026293,0.006011,7.906032,0.999808
8,LightGBM,October 2015,1017191,31170,0.030643,0.035039,0.990609,0.001168,0.337837,3.706887,0.027493,0.015406,0.027544,0.010513,0.027564,0.007917,0.027631,0.006404,7.566023,0.992305
9,LightGBM,November 2015,1223624,34541,0.028228,0.031845,0.990329,0.001189,0.329925,4.652761,0.025368,0.01407,0.02537,0.00955,0.025387,0.00719,0.025455,0.00582,7.566023,0.992305



🏆 MODEL PERFORMANCE SUMMARY:

🎯 RANKING BY MAP@5:
--------------------------------------------------
1. XGBoost: MAP@5 = 0.0263
2. LightGBM: MAP@5 = 0.0240
3. Random Forest: MAP@5 = 0.0133

📋 KEY METRICS COMPARISON:


Unnamed: 0,Model,MAP@2,MAP@3,MAP@4,MAP@5,Precision@2,Precision@3,Precision@4,Precision@5,Test_Accuracy,Training_Time,Prediction_Time
3,Random Forest,0.0122,0.013,0.0133,0.0133,0.013,0.0095,0.0074,0.006,0.997,51.2794,45.2712
7,XGBoost,0.0261,0.0262,0.0263,0.0263,0.0144,0.0099,0.0075,0.006,0.9987,7.906,8.5363
11,LightGBM,0.0239,0.0239,0.0239,0.024,0.0132,0.009,0.0068,0.0055,0.9907,7.566,13.9291



🔍 DETAILED PERFORMANCE ANALYSIS:

🤖 RANDOM FOREST PERFORMANCE:
--------------------------------------------------
   📈 MAP Scores:
      MAP@2: 0.0122
      MAP@3: 0.0130
      MAP@4: 0.0133
      MAP@5: 0.0133
   🎯 Precision Scores:
      Precision@2: 0.0130
      Precision@3: 0.0095
      Precision@4: 0.0074
      Precision@5: 0.0060
   ⚡ Performance Metrics:
      Test Accuracy: 0.9970
      Training Time: 51.28s
      Prediction Time: 45.27s
      Total Customers: 3,666,663

🤖 XGBOOST PERFORMANCE:
--------------------------------------------------
   📈 MAP Scores:
      MAP@2: 0.0261
      MAP@3: 0.0262
      MAP@4: 0.0263
      MAP@5: 0.0263
   🎯 Precision Scores:
      Precision@2: 0.0144
      Precision@3: 0.0099
      Precision@4: 0.0075
      Precision@5: 0.0060
   ⚡ Performance Metrics:
      Test Accuracy: 0.9987
      Training Time: 7.91s
      Prediction Time: 8.54s
      Total Customers: 3,666,663

🤖 LIGHTGBM PERFORMANCE:
-------------------------------------------------

# Recommendation algorithm

## Collaborative filtering 

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from tqdm import tqdm

class CollaborativeFilteringRecommender:
    """
    User-based collaborative filtering for product recommendations.
    
    Uses cosine similarity to find similar users and recommend products.
    Supports both dense and sparse matrices for scalability.
    """

    def __init__(self, use_sparse=True):
        """Initialize with option to use sparse matrices for large datasets."""
        self.user_item_matrix = None
        self.similarity_matrix = None
        self.user_means = None
        self.user_index_map = {}
        self.item_index_map = {}
        self.use_sparse = use_sparse
        
    def create_user_item_matrix(self, df, product_cols, sample_size=None):
        """Create matrix of users and their product interactions."""
        if sample_size and len(df) > sample_size:
            df = df.sample(sample_size, random_state=42)
        latest_data = df.groupby('customer_id').last().reset_index()
        user_item_data = latest_data[['customer_id'] + product_cols].set_index('customer_id')
        self.user_index_map = {user_id: idx for idx, user_id in enumerate(user_item_data.index)}
        self.item_index_map = {item: idx for idx, item in enumerate(product_cols)}
        user_item_data = user_item_data.fillna(0)
        if self.use_sparse:
            self.user_item_matrix = sparse.csr_matrix(user_item_data.values)
            self.matrix_columns = product_cols
            self.matrix_index = user_item_data.index
        else:
            self.user_item_matrix = user_item_data
        return self.user_item_matrix
            
    def compute_similarity_matrix(self, chunk_size=1000):
        """Calculate similarity between users using cosine similarity."""
        if self.use_sparse:
            n_users = self.user_item_matrix.shape[0]
            similarity_matrix = np.zeros((n_users, n_users))
            for i in tqdm(range(0, n_users, chunk_size), desc="Computing similarity"):
                end = min(i + chunk_size, n_users)
                chunk = self.user_item_matrix[i:end]
                chunk_similarities = cosine_similarity(chunk, self.user_item_matrix)
                similarity_matrix[i:end] = chunk_similarities
        else:
            similarity_matrix = cosine_similarity(self.user_item_matrix)
        np.fill_diagonal(similarity_matrix, 0)
        self.similarity_matrix = similarity_matrix
        return similarity_matrix
            
    def fit(self, df, product_cols, sample_size=None):
        """Prepare recommender by creating matrices and computing similarities."""
        self.create_user_item_matrix(df, product_cols, sample_size)
        self.compute_similarity_matrix()
        if self.use_sparse:
            self.user_means = np.array([
                self.user_item_matrix[i].sum() / max(1, self.user_item_matrix[i].count_nonzero()) 
                for i in range(self.user_item_matrix.shape[0])
            ])
        else:
            self.user_means = self.user_item_matrix.mean(axis=1)
                
    def predict_for_user(self, user_idx, top_k=7, n_neighbors=50):
        """Recommend top products for a specific user based on similar users."""
        if self.use_sparse:
            user_items = self.user_item_matrix[user_idx].toarray().flatten()
        else:
            user_items = self.user_item_matrix.iloc[user_idx]
        user_similarities = self.similarity_matrix[user_idx]
        similar_users_idx = np.argsort(user_similarities)[::-1][:n_neighbors]
        predictions = {}
        for item_idx in range(len(self.matrix_columns if self.use_sparse else self.user_item_matrix.columns)):
            item_name = self.matrix_columns[item_idx] if self.use_sparse else self.user_item_matrix.columns[item_idx]
            if user_items[item_idx] == 0:
                numerator = denominator = 0
                for similar_user_idx in similar_users_idx:
                    if similar_user_idx != user_idx:
                        similarity = user_similarities[similar_user_idx]
                        if similarity > 0:
                            if self.use_sparse:
                                rating = self.user_item_matrix[similar_user_idx, item_idx]
                            else:
                                rating = self.user_item_matrix.iloc[similar_user_idx, item_idx]
                            numerator += similarity * rating
                            denominator += similarity
                if denominator > 0:
                    predicted_rating = numerator / denominator
                    predictions[item_name] = predicted_rating
        sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return [item for item, _ in sorted_predictions[:top_k]]
    
    def predict_for_users(self, user_ids, top_k=7, batch_size=100):
        """Generate recommendations for multiple users in batches."""
        predictions = {}
        for i in range(0, len(user_ids), batch_size):
            batch_users = user_ids[i:i+batch_size]
            for user_id in tqdm(batch_users, desc=f"User-based CF predictions (batch {i//batch_size + 1})"):
                if user_id in self.user_index_map:
                    user_idx = self.user_index_map[user_id]
                    predictions[user_id] = self.predict_for_user(user_idx, top_k)
                else:
                    predictions[user_id] = self._get_popular_items(top_k)
        return predictions
    
    def _get_popular_items(self, top_k=7):
        """Return most popular products as fallback recommendations."""
        if self.use_sparse:
            item_popularity = np.array(self.user_item_matrix.sum(axis=0)).flatten()
            popular_indices = np.argsort(item_popularity)[::-1][:top_k]
            return [self.matrix_columns[i] for i in popular_indices]
        else:
            item_popularity = self.user_item_matrix.sum(axis=0).sort_values(ascending=False)
            return item_popularity.head(top_k).index.tolist()

In [3]:
df = pd.read_csv('data/processed/train.csv')


  df = pd.read_csv('data/processed/train.csv')


In [6]:
df= reduce_memory_usage(df)

Mem. usage decreased to 781.94 Mb (53.6% reduction)


In [9]:
df = clean_dataset(df)

🧹 CLEANING DATASET...
1️⃣ Handling missing values...
   ✅ Filled payroll_final_label: 0 missing → 0
   ✅ Filled pensions_2_final_label: 0 missing → 0

2️⃣ Creating customer tenure feature...
   ✅ Created 'customer_tenure_days' feature
   📊 Range: -3.0 to 7498.0 days
   🗑️ Dropped 'registration_date' column

3️⃣ Converting last_primary_date to binary indicator...
   ✅ Created 'was_primary_customer' binary feature
   📊 4,600,165 nulls → 0, 6,146 dates → 1
   🗑️ Dropped 'last_primary_date' column

4️⃣ Removing constant and duplicate columns...
   🗑️ Dropped 'address_type' (constant value)
   🗑️ Dropped 'province_code' (duplicate of province_name)

5️⃣ Cleaning numeric columns...
   ✅ Cleaned 'age': object → numeric (27,734 nulls)
   ⚠️ Converted 14 negative seniority values to NaN
   ✅ Cleaned 'seniority': object → numeric (27,748 nulls)

📊 CLEANUP SUMMARY:
----------------------------------------
   Final shape: (4606311, 46)
   Memory usage: 2935.7 MB
   Null values: 5,979,487

📋 DATA T

In [16]:
df = filter_data(df)

Tổng số khách hàng: 442,736
Khách hàng KHÔNG sở hữu tài khoản thanh toán nào: 442,736

📊 Distribution check:
  current_accounts_final_label: [0 1]
  payroll_accounts_final_label: [0 1]
  junior_accounts_final_label: [0 1]
  more_particular_accounts_final_label: [0 1]
  particular_accounts_final_label: [0 1]
  particular_plus_accounts_final_label: [0 1]
  home_account_final_label: [0]
  payroll_final_label: [0 1]
  e_account_final_label: [0 1]


In [17]:
test_10_2015 = pd.read_csv('data/processed/test_10_2015.csv')

test_10_2015 = filter_data(clean_dataset(reduce_memory_usage(test_10_2015)))

  test_10_2015 = pd.read_csv('data/processed/test_10_2015.csv')


Mem. usage decreased to 903.96 Mb (53.6% reduction)
🧹 CLEANING DATASET...
1️⃣ Handling missing values...
   ✅ Filled payroll_final_label: 0 missing → 0
   ✅ Filled pensions_2_final_label: 0 missing → 0

2️⃣ Creating customer tenure feature...
   ✅ Created 'customer_tenure_days' feature
   📊 Range: -3.0 to 7590.0 days
   🗑️ Dropped 'registration_date' column

3️⃣ Converting last_primary_date to binary indicator...
   ✅ Created 'was_primary_customer' binary feature
   📊 5,316,273 nulls → 0, 8,870 dates → 1
   🗑️ Dropped 'last_primary_date' column

4️⃣ Removing constant and duplicate columns...
   🗑️ Dropped 'address_type' (constant value)
   🗑️ Dropped 'province_code' (duplicate of province_name)

5️⃣ Cleaning numeric columns...
   ✅ Cleaned 'age': object → numeric (9,750 nulls)
   ⚠️ Converted 14 negative seniority values to NaN
   ✅ Cleaned 'seniority': object → numeric (9,764 nulls)

📊 CLEANUP SUMMARY:
----------------------------------------
   Final shape: (5325143, 46)
   Memory us

In [18]:
# Evaluate Collaborative Filtering models (User-Based and Item-Based) on the test set

# Prepare actual products for test users
actual_products_cf = {}
test_latest_cf = test_10_2015.groupby('customer_id').last().reset_index()
for _, row in test_latest_cf.iterrows():
    user_id = row['customer_id']
    owned_products = [col for col in payment_account_labels if row[col] == 1]
    if owned_products:
        actual_products_cf[user_id] = owned_products

# Select users for evaluation (subset for speed, or use all for full evaluation)
eval_users_cf = list(actual_products_cf.keys())[:1000]

In [26]:
user_cf = CollaborativeFilteringRecommender(use_sparse=True)
user_cf.fit(df, payment_account_labels, sample_size=50000)

print("Evaluating User-Based Collaborative Filtering:")
ucf_predictions = user_cf.predict_for_users(eval_users_cf, top_k=2)
ucf_actual = {user: actual_products_cf[user] for user in eval_users_cf if user in actual_products_cf}
ucf_score = evaluate_recommendations_with_precision(ucf_actual, ucf_predictions, k=2)


Computing similarity: 100%|██████████| 43/43 [00:08<00:00,  5.23it/s]


Evaluating User-Based Collaborative Filtering:


User-based CF predictions (batch 1): 100%|██████████| 100/100 [00:00<00:00, 3916.51it/s]
User-based CF predictions (batch 2): 100%|██████████| 100/100 [00:00<00:00, 3070.61it/s]
User-based CF predictions (batch 3): 100%|██████████| 100/100 [00:00<00:00, 4330.50it/s]
User-based CF predictions (batch 4): 100%|██████████| 100/100 [00:00<00:00, 3378.93it/s]
User-based CF predictions (batch 5): 100%|██████████| 100/100 [00:00<00:00, 4155.29it/s]
User-based CF predictions (batch 6): 100%|██████████| 100/100 [00:00<00:00, 2730.51it/s]
User-based CF predictions (batch 7): 100%|██████████| 100/100 [00:00<00:00, 3444.39it/s]
User-based CF predictions (batch 8): 100%|██████████| 100/100 [00:00<00:00, 2482.76it/s]
User-based CF predictions (batch 9): 100%|██████████| 100/100 [00:00<00:00, 3331.72it/s]
User-based CF predictions (batch 10): 100%|██████████| 100/100 [00:00<00:00, 3766.27it/s]

 MAP@2: 0.5853
 Precision@2: 0.3485
 Users evaluated: 1,000
 Coverage: 80.40%



