In [214]:
# ============================================================================
# SALES PIPELINE PREDICTION - COMPLETE IMPLEMENTATION
# ============================================================================
# This code walks through every step with detailed comments
# ============================================================================

import pyodbc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


# For modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, confusion_matrix, 
                             classification_report, roc_curve, auc)
from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully!\n")

✓ All libraries imported successfully!



In [215]:
print("=" * 80)
print("PHASE 1: LOADING AND EXPLORING DATA")
print("=" * 80)

PHASE 1: LOADING AND EXPLORING DATA


In [216]:
# Connect to database
conn = pyodbc.connect(
    "Driver={SQL Server};"
    "Server=ELICE99\\SQLEXPRESS;"
    "Database=CRM_Sales_Opportunity;"
    "Trusted_Connection=yes;"
)

In [217]:
# Query to get data (excluding 'Prospecting' stage)
query = '''
SELECT p.*, 
    a.sector, a.year_established, a.account_tier, a.employees, a.office_location,
    s.manager, s.regional_office
FROM dbo.sales_pipeline p
LEFT JOIN accounts a ON a.account = p.account
LEFT JOIN sales_teams s ON p.sales_agent = s.sales_agent
WHERE deal_stage NOT IN ('Prospecting')
'''

df = pd.read_sql(query, conn)
conn.close()

print(f"✓ Data loaded: {len(df)} records")
print(f"✓ Columns: {df.shape[1]}")

✓ Data loaded: 8300 records
✓ Columns: 17


In [218]:
print("\n" + "=" * 80)
print(" EXPLORATORY DATA ANALYSIS")
print("=" * 80)


 EXPLORATORY DATA ANALYSIS


In [219]:
print("\nFirst few rows:")
print(df.head())


First few rows:
  opportunity_id      sales_agent         product  account deal_stage  \
0       1C1I7A6R      Moses Frase  GTX Plus Basic  Cancity        Won   
1       Z063OYW0  Darcel Schlecht          GTXPro    Isdom        Won   
2       EC4QE1BX  Darcel Schlecht      MG Special  Cancity        Won   
3       MV1LWRNH      Moses Frase       GTX Basic  Codehow        Won   
4       PE84CX4O        Zane Levy       GTX Basic   Hatfan        Won   

  engage_date  close_date  close_value  is_active  deal_duration    sector  \
0  2016-10-20  2017-03-01         1054      False          132.0    Retail   
1  2016-10-25  2017-03-11         4514      False          137.0   Medical   
2  2016-10-25  2017-03-07           50      False          133.0    Retail   
3  2016-10-25  2017-03-09          588      False          135.0  Software   
4  2016-10-25  2017-03-02          517      False          128.0  Services   

   year_established      account_tier  employees office_location  \
0      

In [220]:
print("\nBasic Statistics:")
print(df.describe())


Basic Statistics:
        close_value  deal_duration  year_established     employees
count   8300.000000    6711.000000       7212.000000   7212.000000
mean    1205.486024      47.985397       1995.454104   5737.717277
std     2167.597195      41.057665          9.186596   6850.680603
min        0.000000       1.000000       1979.000000      9.000000
25%        0.000000       8.000000       1988.000000   1238.000000
50%       49.000000      45.000000       1995.000000   3492.000000
75%     1136.000000      85.000000       2002.000000   7523.000000
max    30288.000000     138.000000       2017.000000  34288.000000


In [221]:
df['employees'].max()

np.float64(34288.0)

In [222]:
df['employees'].describe(percentiles=[0.9,0.95,0.99])

count     7212.000000
mean      5737.717277
std       6850.680603
min          9.000000
50%       3492.000000
90%      16499.000000
95%      17479.000000
99%      34288.000000
max      34288.000000
Name: employees, dtype: float64

In [223]:
#log transformation

#since only few companies are this large, we capped the outliers
df["employees_log"]= np.log1p(df['employees'])

In [224]:
#company size category

bins = [0, 50, 250, 1000, 5000, 15000, np.inf]
labels = ['micro', 'small', 'medium', 'large', 'enterprise', 'mega']
df ["company_size"] = pd.cut(df['employees'], bins=bins, labels=labels).astype(str)

In [225]:
print("\n" + "=" * 80)
print("DATA CLEANING & LEAKAGE PREVENTION")
print("=" * 80)


DATA CLEANING & LEAKAGE PREVENTION


In [226]:
# Standardize column names (lowercase, underscores)
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [227]:
# Standardize string values
string_cols = df.select_dtypes(include='object').columns
for col in string_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

print("✓ Column names and values standardized\n")

✓ Column names and values standardized



In [228]:
# CRITICAL: Keep only CLOSED deals (won or lost) for training
print("\nOriginal deal_stage distribution:")
print(df['deal_stage'].value_counts())


Original deal_stage distribution:
deal_stage
won         4238
lost        2473
engaging    1589
Name: count, dtype: int64


In [229]:
# Filter: Keep only won and lost deals for training
df_training = df[df['deal_stage'].isin(['won', 'lost'])].copy()

# Save active deals separately for later predictions
df_active = df[df['deal_stage'] == 'engaging'].copy()

print(f"\nAfter filtering:")
print(f"Training data (won + lost): {len(df_training)} rows")
print(f"Active deals (to predict): {len(df_active)} rows")


After filtering:
Training data (won + lost): 6711 rows
Active deals (to predict): 1589 rows


In [230]:
# CRITICAL: DROP LEAKAGE COLUMNS
print("\nRemoving leakage columns...")
leakage_columns = ['close_date', 'close_value', 'is_active','deal_duration', 'opportunity_id','employees', 'account_tier']
df_training = df_training.drop(columns=leakage_columns, errors='ignore')
df_active = df_active.drop(columns=leakage_columns, errors='ignore')

print("Leakage columns removed: close_date, close_value, is_active, deal_duration, opportunity_id, employees, account_tier")


Removing leakage columns...
Leakage columns removed: close_date, close_value, is_active, deal_duration, opportunity_id, employees, account_tier


In [231]:
print("\n" + "=" * 80)
print("CREATE TARGET VARIABLE")
print("=" * 80)


CREATE TARGET VARIABLE


In [232]:
# Create binary target: 1 = Won, 0 = Lost
df_training['target'] = (df_training['deal_stage'] == 'won').astype(int)

print("\nTarget Distribution:")
print(df_training['target'].value_counts())
print(f"\nwon Rate: {df_training['target'].mean() * 100:.2f}%")

# Drop deal_stage column (no longer needed)
df_training = df_training.drop('deal_stage', axis=1)


Target Distribution:
target
1    4238
0    2473
Name: count, dtype: int64

won Rate: 63.15%


In [233]:
print("\n" + "=" * 80)
print("PHASE 8: TRAIN-TEST SPLIT")
print("=" * 80)


PHASE 8: TRAIN-TEST SPLIT


In [234]:
df_full_train, df_test = train_test_split(df_training, test_size=0.2, random_state=1)
len(df_full_train), len(df_test)

(5368, 1343)

In [235]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train), len(df_test), len(df_val)

(4026, 1343, 1342)

In [236]:
# Separate the target variable (y) from the features (X) for the training and test sets
y_train = df_train.target.values
y_val = df_val.target.values
y_test = df_test.target.values

In [237]:
df_train.target.value_counts(normalize=True)

target
1    0.626428
0    0.373572
Name: proportion, dtype: float64

In [238]:
win_rate = df_train.target.mean() 
win_rate

np.float64(0.626428216592151)

In [239]:
print("\n" + "=" * 80)
print(" FEATURE ENGINEERING")
print("=" * 80)


 FEATURE ENGINEERING


In [240]:
# -------- 5.1: TEMPORAL FEATURES FROM ENGAGE_DATE --------
print("\n5.1: Extracting Temporal Features...")

# Convert to datetime

df_train['engage_date'] = pd.to_datetime(df_train['engage_date'])
df_val['engage_date'] = pd.to_datetime(df_val['engage_date'])
df_test['engage_date'] = pd.to_datetime(df_test['engage_date'])
df_active['engage_date'] = pd.to_datetime(df_active['engage_date'])


# Extract temporal features
df_train['month_engaged'] = df_train['engage_date'].dt.month
df_train['quarter_engaged'] = df_train['engage_date'].dt.quarter
df_train['day_of_week_engaged'] = df_train['engage_date'].dt.dayofweek
df_train['is_weekend'] = (df_train['day_of_week_engaged'].isin([5, 6])).astype(int)
df_train['days_into_year'] = df_train['engage_date'].dt.dayofyear

# Extract temporal features
df_val['month_engaged'] = df_val['engage_date'].dt.month
df_val['quarter_engaged'] = df_val['engage_date'].dt.quarter
df_val['day_of_week_engaged'] = df_val['engage_date'].dt.dayofweek
df_val['is_weekend'] = (df_val['day_of_week_engaged'].isin([5, 6])).astype(int)
df_val['days_into_year'] = df_val['engage_date'].dt.dayofyear

# Extract temporal features
df_test['month_engaged'] = df_test['engage_date'].dt.month
df_test['quarter_engaged'] = df_test['engage_date'].dt.quarter
df_test['day_of_week_engaged'] = df_test['engage_date'].dt.dayofweek
df_test['is_weekend'] = (df_test['day_of_week_engaged'].isin([5, 6])).astype(int)
df_test['days_into_year'] = df_test['engage_date'].dt.dayofyear

# Apply same to active deals
df_active['month_engaged'] = df_active['engage_date'].dt.month
df_active['quarter_engaged'] = df_active['engage_date'].dt.quarter
df_active['day_of_week_engaged'] = df_active['engage_date'].dt.dayofweek
df_active['is_weekend'] = (df_active['day_of_week_engaged'].isin([5, 6])).astype(int)
df_active['days_into_year'] = df_active['engage_date'].dt.dayofyear

print("✓ Temporal features created: month, quarter, day_of_week, is_weekend, days_into_year")


5.1: Extracting Temporal Features...
✓ Temporal features created: month, quarter, day_of_week, is_weekend, days_into_year


In [241]:
# -------- 5.2: DEAL DURATION --------
print("\n5.2: Calculating Deal Duration...")

# Convert to datetime
df['engage_date'] = pd.to_datetime(df['engage_date'])
df['close_date'] = pd.to_datetime(df['close_date'], errors='coerce')

# Get reference date as the max date in dataset
ref_date = df[['engage_date', 'close_date']].max().max()
print(f"  Reference date: {ref_date.strftime('%Y-%m-%d')}")


5.2: Calculating Deal Duration...
  Reference date: 2017-12-31


In [242]:
# Calculate deal duration
df['closed_duration'] = (df['close_date'] - df['engage_date']).dt.days
df['active_duration'] = (ref_date - df['engage_date']).dt.days
df['deal_age'] = df['closed_duration'].fillna(df['active_duration'])

# Apply to training and active data

df_train['deal_age'] = df.loc[df_train.index, 'deal_age'].values
df_val['deal_age'] = df.loc[df_val.index, 'deal_age'].values
df_test['deal_age'] = df.loc[df_test.index, 'deal_age'].values
df_active['deal_age'] = df.loc[df_active.index, 'deal_age'].values

print(f"✓ Deal age calculated")
print(f"  Average: {df_train['deal_age'].mean():.0f} days")
print(f"  Min-Max: {df_train['deal_age'].min():.0f} - {df_train['deal_age'].max():.0f} days")

✓ Deal age calculated
  Average: 48 days
  Min-Max: 1 - 137 days


In [243]:
# --------  INTERACTION FEATURES --------
print("\nCreating Interaction Features...")


Creating Interaction Features...




In [244]:
# Sales Agent Performance
agent_stats = df_train.groupby('sales_agent').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
agent_stats.columns = ['sales_agent', 'agent_win_rate', 'agent_total_deals', 'agent_win']

df_train = df_train.merge(agent_stats, on='sales_agent', how='left')
df_val = df_val.merge(agent_stats, on='sales_agent', how='left')
df_test = df_test.merge(agent_stats, on='sales_agent', how='left')
df_active = df_active.merge(agent_stats, on='sales_agent', how='left')

agent_stats.head()

Unnamed: 0,sales_agent,agent_win_rate,agent_total_deals,agent_win
0,anna_snelling,0.612745,204,125
1,boris_faz,0.666667,90,60
2,cassey_cress,0.617284,162,100
3,cecily_lampkin,0.68,100,68
4,corliss_cosme,0.627586,145,91


In [245]:
# account Performance
account_stats = df_train.groupby('account').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
account_stats.columns = ['account', 'account_win_rate', 'account_deal_count', 'account_total_win']

df_train = df_train.merge(account_stats, on='account', how='left')
df_val = df_val.merge(account_stats, on='account', how='left')
df_test = df_test.merge(account_stats, on='account', how='left')
df_active = df_active.merge(account_stats, on='account', how='left')

account_stats.head()

Unnamed: 0,account,account_win_rate,account_deal_count,account_total_win
0,acme_corporation,0.571429,35,20
1,betasoloin,0.676471,34,23
2,betatech,0.607143,56,34
3,bioholding,0.666667,42,28
4,bioplex,0.62963,27,17


In [246]:
# Sector Performance
sector_stats = df_train.groupby('sector').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
sector_stats.columns = ['sector', 'sector_win_rate', 'sector_deal_count', 'sector_total_win']

df_train = df_train.merge(sector_stats, on='sector', how='left')
df_val = df_val.merge(sector_stats, on='sector', how='left')
df_test = df_test.merge(sector_stats, on='sector', how='left')
df_active = df_active.merge(sector_stats, on='sector', how='left')

sector_stats.head()

Unnamed: 0,sector,sector_win_rate,sector_deal_count,sector_total_win
0,employment,0.647399,173,112
1,entertainment,0.603306,242,146
2,finance,0.611111,378,231
3,marketing,0.634286,350,222
4,medical,0.616179,581,358


In [247]:
# Office Location Performance
office_stats = df_train.groupby('office_location').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
office_stats.columns = ['office_location', 'office_win_rate', 'office_deal_count', 'office_total_win']

df_train = df_train.merge(office_stats, on='office_location', how='left')
df_val = df_val.merge(office_stats, on='office_location', how='left')
df_test = df_test.merge(office_stats, on='office_location', how='left')
df_active = df_active.merge(office_stats, on='office_location', how='left')

office_stats.head()

Unnamed: 0,office_location,office_win_rate,office_deal_count,office_total_win
0,belgium,0.671875,64,43
1,brazil,0.678571,28,19
2,china,0.76,25,19
3,germany,0.78125,32,25
4,italy,0.591837,49,29


In [248]:
# Regional Office Performance
region_stats = df_train.groupby('regional_office').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
region_stats.columns = ['regional_office', 'region_win_rate', 'region_deal_count', 'region_total_win']

df_train = df_train.merge(region_stats, on='regional_office', how='left')
df_val = df_val.merge(region_stats, on='regional_office', how='left')
df_test = df_test.merge(region_stats, on='regional_office', how='left')
df_active = df_active.merge(region_stats, on='regional_office', how='left')

region_stats.head()

Unnamed: 0,regional_office,region_win_rate,region_deal_count,region_total_win
0,central,0.631074,1564,987
1,east,0.609083,1123,684
2,west,0.635549,1339,851


In [249]:
# company_size Performance
company_size_stats = df_train.groupby('company_size').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
company_size_stats.columns = ['company_size', 'company_size_win_rate', 'company_size_deal_count', 'company_size_total_win']

df_train = df_train.merge(company_size_stats, on='company_size', how='left')
df_val = df_val.merge(company_size_stats, on='company_size', how='left')
df_test = df_test.merge(company_size_stats, on='company_size', how='left')
df_active = df_active.merge(company_size_stats, on='company_size', how='left')

company_size_stats.head()

Unnamed: 0,company_size,company_size_win_rate,company_size_deal_count,company_size_total_win
0,enterprise,0.634167,1159,735
1,large,0.627574,1748,1097
2,medium,0.619444,360,223
3,mega,0.595349,430,256
4,micro,0.692308,104,72


In [250]:
# Product Performance
product_stats = df_full_train.groupby('product').agg({
    'target': ['mean', 'count', 'sum']
}).reset_index()
product_stats.columns = ['product', 'product_win_rate', 'product_deal_count', 'product_total_win']

df_full_train = df_train.merge(product_stats, on='product', how='left')
df_train = df_train.merge(product_stats, on='product', how='left')
df_val = df_val.merge(product_stats, on='product', how='left')
df_test = df_test.merge(product_stats, on='product', how='left')
df_active = df_active.merge(product_stats, on='product', how='left')

product_stats.head()

Unnamed: 0,product,product_win_rate,product_deal_count,product_total_win
0,gtk_500,0.5,18,9
1,gtx_basic,0.638261,1150,734
2,gtx_plus_basic,0.629405,823,518
3,gtx_plus_pro,0.631751,611,386
4,gtxpro,0.623608,898,560


In [251]:
print("✓ Aggregation features created:")
print(f"  - agent_win_rate, agent_total_deals")
print(f"  - product_lost_rate, product_deal_count")
print(f"  - sector_win_rate, sector_deal_count")
print(f"  - company_size_win_rate, company_size_deal_count")
print(f"  - region_win_rate, region_deal_count")

✓ Aggregation features created:
  - agent_win_rate, agent_total_deals
  - product_lost_rate, product_deal_count
  - sector_win_rate, sector_deal_count
  - company_size_win_rate, company_size_deal_count
  - region_win_rate, region_deal_count


In [252]:
# Agent-Product performance synergy
df_train['agent_product_synergy'] = df_train['agent_win_rate'] * df_train['product_win_rate']
df_val['agent_product_synergy'] = df_val['agent_win_rate'] * df_val['product_win_rate']

# Agent efficiency metrics
df_train['agent_win_efficiency'] = (1 - df_train['agent_win_rate']) * df_train['agent_total_deals']
df_val['agent_win_efficiency'] = (1 - df_val['agent_win_rate']) * df_val['agent_total_deals']

# Performance relative to sector
df_train['agent_vs_sector'] = df_train['agent_win_rate'] - df_train['sector_win_rate']
df_val['agent_vs_sector'] = df_val['agent_win_rate'] - df_val['sector_win_rate']

# Deal complexity score
df_train['deal_complexity'] = (
    df_train['employees_log'] * df_train['deal_age'] / 
    (df_train['agent_total_deals'] + 1)
)
df_val['deal_complexity'] = (
    df_val['employees_log'] * df_val['deal_age'] / 
    (df_val['agent_total_deals'] + 1)
)

# Office load vs performance
df_train['office_load'] = df_train['office_deal_count'] / (df_train['office_total_win'] + 1)
df_val['office_load'] = df_val['office_deal_count'] / (df_val['office_total_win'] + 1)

# Product-sector fit
df_train['product_sector_fit'] = df_train['product_win_rate'] * df_train['sector_win_rate']
df_val['product_sector_fit'] = df_val['product_win_rate'] * df_val['sector_win_rate']

# Regional performance relative to company size
df_train['region_size_match'] = df_train['region_win_rate'] * df_train['company_size_win_rate']
df_val['region_size_match'] = df_val['region_win_rate'] * df_val['company_size_win_rate']

# Temporal risk factors
df_train['quarter_risk'] = df_train['quarter_engaged'].map({1: 0.8, 2: 0.9, 3: 1.0, 4: 1.2})
df_val['quarter_risk'] = df_val['quarter_engaged'].map({1: 0.8, 2: 0.9, 3: 1.0, 4: 1.2})

df_train['is_quarter_end'] = df_train['month_engaged'].isin([3, 6, 9, 12]).astype(int)
df_val['is_quarter_end'] = df_val['month_engaged'].isin([3, 6, 9, 12]).astype(int)

print("✓ Created 9 advanced interaction features")


✓ Created 9 advanced interaction features


In [253]:
# Remove the target variable from the feature DataFrames
del df_train['target']
del df_val['target']
del df_test['target']

In [254]:
numerical = ['month_engaged',
       'quarter_engaged', 'day_of_week_engaged', 'is_weekend',
       'days_into_year', 'deal_age', 'agent_win_rate', 'agent_total_deals',
       'agent_win', 'account_win_rate', 'account_deal_count',
       'account_total_win', 'sector_win_rate', 'sector_deal_count',
       'sector_total_win', 'office_win_rate', 'office_deal_count',
       'office_total_win', 'region_win_rate', 'region_deal_count',
       'region_total_win', 'company_size_win_rate',
       'company_size_deal_count', 'company_size_total_win',
       'product_win_rate', 'product_deal_count', 'product_total_win',
       'agent_product_synergy', 'agent_win_efficiency', 'agent_vs_sector',
       'deal_complexity', 'office_load', 'product_sector_fit',
       'region_size_match', 'quarter_risk', 'is_quarter_end','year_established','employees_log']

categorical = ['sales_agent', 'product', 'account', 'engage_date', 'sector',
        'office_location', 'manager', 'regional_office',
        'company_size',]

In [255]:
print("\n1. Applying Target Encoding for Categorical Features...")
print("-" * 50)


1. Applying Target Encoding for Categorical Features...
--------------------------------------------------


In [256]:
# Use smoothing to prevent overfitting
encoder = TargetEncoder(cols=categorical, smoothing=2.0)

In [257]:
# Fit on train, transform both
df_train_encoded = encoder.fit_transform(df_train[categorical], y_train)
df_val_encoded = encoder.transform(df_val[categorical])

print(f"✓ Encoded {len(categorical)} categorical features")

✓ Encoded 9 categorical features


In [258]:
print("\n4. Scaling Features...")
print("-" * 50)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_train_encoded)
X_val_scaled = scaler.transform(df_val_encoded)

print("✓ Features scaled")


4. Scaling Features...
--------------------------------------------------
✓ Features scaled


In [270]:
print("\n5. Applying SMOTE Resampling...")
print("-" * 50)

from imblearn.over_sampling import BorderlineSMOTE

print(f"Before SMOTE: {np.bincount(y_train)}")

smote = BorderlineSMOTE(
    sampling_strategy=1,  # Make lost class 80% of won class
    random_state=42,
    k_neighbors=5
)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"After SMOTE:  {np.bincount(y_train_resampled)}")
print(f"✓ Training set size: {len(X_train_resampled)}")


5. Applying SMOTE Resampling...
--------------------------------------------------
Before SMOTE: [1504 2522]


After SMOTE:  [2522 2522]
✓ Training set size: 5044


In [260]:
print("\n" + "=" * 80)
print(" FEATURE IMPORTANCE: CORRELATION")
print("=" * 80)


 FEATURE IMPORTANCE: CORRELATION


In [261]:
print("\n" + "="*80)
print("TRAINING OPTIMIZED MODELS")
print("="*80)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

models_optimized = {}
results_optimized = {}


TRAINING OPTIMIZED MODELS


In [262]:
# MODEL 1: Optimized XGBoost
print("\n1. Training Optimized XGBoost...")
print("-" * 50)

xgb_opt = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.03,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=0.2,
    scale_pos_weight=3,  # For class imbalance
    reg_alpha=1,
    reg_lambda=2,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=50,
    verbosity=0
)
xgb_opt.fit(
    X_train_resampled, 
    y_train_resampled,
    eval_set=[(X_val_scaled, y_val)],
    verbose=False
)

y_pred_xgb = xgb_opt.predict(X_val_scaled)
y_proba_xgb = xgb_opt.predict_proba(X_val_scaled)[:, 1]

models_optimized['XGBoost'] = xgb_opt
results_optimized['XGBoost'] = {
    'predictions': y_pred_xgb,
    'probabilities': y_proba_xgb
}

print("✓ XGBoost trained")



1. Training Optimized XGBoost...
--------------------------------------------------
✓ XGBoost trained


In [263]:
# MODEL 2: LightGBM (often performs better on imbalanced data)
print("\n2. Training LightGBM...")
print("-" * 50)

lgbm_opt = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=7,
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.7,
    class_weight='balanced',
    random_state=42,
    verbose=-1
)

lgbm_opt.fit(X_train_resampled, y_train_resampled)

y_pred_lgbm = lgbm_opt.predict(X_val_scaled)
y_proba_lgbm = lgbm_opt.predict_proba(X_val_scaled)[:, 1]

models_optimized['LightGBM'] = lgbm_opt
results_optimized['LightGBM'] = {
    'predictions': y_pred_lgbm,
    'probabilities': y_proba_lgbm
}

print("✓ LightGBM trained")


2. Training LightGBM...
--------------------------------------------------
✓ LightGBM trained


In [264]:
# MODEL 3: Optimized Gradient Boosting
print("\n3. Training Optimized Gradient Boosting...")
print("-" * 50)

gb_opt = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    min_samples_split=20,
    min_samples_leaf=10,
    subsample=0.8,
    max_features='sqrt',
    random_state=42
)

gb_opt.fit(X_train_resampled, y_train_resampled)

y_pred_gb = gb_opt.predict(X_val_scaled)
y_proba_gb = gb_opt.predict_proba(X_val_scaled)[:, 1]

models_optimized['GradientBoosting'] = gb_opt
results_optimized['GradientBoosting'] = {
    'predictions': y_pred_gb,
    'probabilities': y_proba_gb
}

print("✓ Gradient Boosting trained")



3. Training Optimized Gradient Boosting...
--------------------------------------------------
✓ Gradient Boosting trained


In [265]:
# MODEL 4: Optimized Logistic Regression
print("\n4. Training Optimized Logistic Regression...")
print("-" * 50)

lr_opt = LogisticRegression(
    
    C=0.1,
    max_iter=1000,
    random_state=42,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5
)

lr_opt.fit(X_train_resampled, y_train_resampled)

y_pred_lr = lr_opt.predict(X_val_scaled)
y_proba_lr = lr_opt.predict_proba(X_val_scaled)[:, 1]

models_optimized['LogisticRegression'] = lr_opt
results_optimized['LogisticRegression'] = {
    'predictions': y_pred_lr,
    'probabilities': y_proba_lr
}

print("✓ Logistic Regression trained")



4. Training Optimized Logistic Regression...
--------------------------------------------------
✓ Logistic Regression trained


In [266]:
print("\n" + "="*80)
print("MODEL EVALUATION - OPTIMIZED MODELS")
print("="*80)


MODEL EVALUATION - OPTIMIZED MODELS


In [267]:
evaluation_results = []

for model_name, results in results_optimized.items():
    y_pred = results['predictions']
    y_proba = results['probabilities']
    
    accuracy = accuracy_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_proba)
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall = recall_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)
    
    evaluation_results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'ROC-AUC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })
    
    print(f"\n{model_name}")
    print("-" * 50)
    print(f"Accuracy:   {accuracy:.4f}")
    print(f"ROC-AUC:    {roc_auc:.4f}  ← MAIN METRIC")
    print(f"Precision:  {precision:.4f}")
    print(f"Recall:     {recall:.4f}")
    print(f"F1-Score:   {f1:.4f}")



XGBoost
--------------------------------------------------
Accuracy:   0.6259
ROC-AUC:    0.5275  ← MAIN METRIC
Precision:  0.6322
Recall:     0.9704
F1-Score:   0.7656

LightGBM
--------------------------------------------------
Accuracy:   0.5380
ROC-AUC:    0.5209  ← MAIN METRIC
Precision:  0.6377
Recall:     0.6166
F1-Score:   0.6270

GradientBoosting
--------------------------------------------------
Accuracy:   0.5641
ROC-AUC:    0.5279  ← MAIN METRIC
Precision:  0.6401
Recall:     0.7030
F1-Score:   0.6701

LogisticRegression
--------------------------------------------------
Accuracy:   0.5708
ROC-AUC:    0.5119  ← MAIN METRIC
Precision:  0.6312
Recall:     0.7657
F1-Score:   0.6920


In [268]:
# Create comparison dataframe
eval_comparison = pd.DataFrame(evaluation_results)
eval_comparison = eval_comparison.sort_values('ROC-AUC', ascending=False)

In [269]:
print("\n" + "="*80)
print("MODEL COMPARISON (Sorted by ROC-AUC)")
print("="*80)
print(eval_comparison.to_string(index=False))


MODEL COMPARISON (Sorted by ROC-AUC)
             Model  Accuracy  ROC-AUC  Precision   Recall  F1-Score
  GradientBoosting  0.564083 0.527932   0.640086 0.702959  0.670051
           XGBoost  0.625931 0.527489   0.632228 0.970414  0.765640
          LightGBM  0.538003 0.520917   0.637699 0.616568  0.626955
LogisticRegression  0.570790 0.511874   0.631220 0.765680  0.691979
