# 03: Driver Analysis and Segmentation

## Question 2: Which segments have higher/lower quality?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, average_precision_score
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned dataset from a local pickle file (presumably written by an
# earlier cleaning step -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('df_cleaned.pkl')
# Mean of the `is_good` column over all rows; it is printed here and later
# passed to segment_analysis as the reference rate for computing lift.
baseline_rate = df['is_good'].mean()
print(f"Baseline GoodQualityRate: {baseline_rate:.4f} ({baseline_rate*100:.2f}%)")

## 6.1 Univariate Segmentation Table

In [None]:
def _two_proportion_pvalue(count_a, n_a, count_b, n_b):
    """Return the two-sided p-value of a pooled two-proportion z-test.

    NaN is returned when the test is undefined: one of the two samples is
    empty, or the pooled rate is exactly 0 or 1 (zero variance).
    """
    if n_a == 0 or n_b == 0:
        return float('nan')
    pooled = (count_a + count_b) / (n_a + n_b)
    variance = pooled * (1 - pooled) * (1 / n_a + 1 / n_b)
    if variance <= 0:
        return float('nan')
    z_stat = (count_a / n_a - count_b / n_b) / np.sqrt(variance)
    return float(2 * stats.norm.sf(abs(z_stat)))


def segment_analysis(df, segment_col, baseline_rate):
    """Summarise lead quality for every value of ``segment_col``.

    Args:
        df: DataFrame with ``segment_col`` plus the indicator columns
            ``is_good``, ``is_closed`` and ``is_bad`` (summed per segment).
        segment_col: Name of the column whose distinct values define the
            segments. Missing values form one segment labelled ``'missing'``.
        baseline_rate: Reference good-quality rate used for ``lift``.

    Returns:
        DataFrame with one row per segment, sorted by ``GoodQualityRate``
        descending, with columns ``segment``, ``leads``, ``GoodQualityRate``,
        ``CloseRate``, ``BadRate``, ``lift``, ``ci_lower``, ``ci_upper``
        (95% normal-approximation interval clipped to [0, 1]), ``p_value``
        and ``significant`` (``p_value < 0.05``). An input without rows
        yields an empty frame that still has these columns.
    """
    columns = [
        'segment', 'leads', 'GoodQualityRate', 'CloseRate', 'BadRate',
        'lift', 'ci_lower', 'ci_upper', 'p_value', 'significant',
    ]

    values = df[segment_col]
    missing_mask = values.isna()
    # Handle missing values once via isna(): NaN never compares equal to
    # itself, and this also avoids duplicate 'missing' rows when a column
    # mixes several null representations.
    segments = [(str(value), values == value) for value in values.dropna().unique()]
    if missing_mask.any():
        segments.append(('missing', missing_mask))

    total_n = len(df)
    total_good = df['is_good'].sum()
    z_crit = stats.norm.ppf(0.975)  # loop-invariant critical value (95% CI)

    results = []
    for segment_name, mask in segments:
        segment_df = df[mask]
        n = len(segment_df)
        if n == 0:
            continue

        good_count = segment_df['is_good'].sum()
        good_rate = good_count / n
        close_rate = segment_df['is_closed'].sum() / n
        bad_rate = segment_df['is_bad'].sum() / n

        lift = good_rate / baseline_rate if baseline_rate > 0 else 0

        se = np.sqrt(good_rate * (1 - good_rate) / n)
        ci_lower = max(0, good_rate - z_crit * se)
        ci_upper = min(1, good_rate + z_crit * se)

        # Test the segment against the REST of the data rather than against
        # the full population: the full population contains the segment, so
        # those two samples are not independent. "segment != rest" is the
        # same hypothesis as "segment != overall rate".
        p_value = _two_proportion_pvalue(
            good_count, n, total_good - good_count, total_n - n
        )

        results.append({
            'segment': segment_name,
            'leads': n,
            'GoodQualityRate': good_rate,
            'CloseRate': close_rate,
            'BadRate': bad_rate,
            'lift': lift,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'p_value': p_value,
            # NaN (undefined test) compares False, i.e. not significant.
            'significant': p_value < 0.05,
        })

    # Passing the columns explicitly keeps the schema (and the sort below)
    # valid even when no segment produced a row.
    result_df = pd.DataFrame(results, columns=columns)
    return result_df.sort_values('GoodQualityRate', ascending=False)

# Candidate segmentation columns, listed in the order they are analyzed.
candidate_dimensions = (
    'dc_pages',
    'design',
    'bg_color',
    'publisher_zone',
    'is_call_center',
    'address_score_bin',
    'phone_score_bin',
    'is_branded',
    'debt_bin',
    'state',
    'traffic_type',
)

# Keep only the candidates that are actually present in the loaded dataset.
dimensions_to_analyze = [
    column for column in candidate_dimensions if column in df.columns
]

print("Dimensions to analyze:")
print(dimensions_to_analyze)

In [None]:
# Per-dimension segment tables, keyed by dimension name.
segment_results = {}

for dim in dimensions_to_analyze:
    print(f"\n{'='*60}")
    print(f"Dimension: {dim}")
    print('='*60)

    # Compute the table once, keep it for later cells and print it in full.
    result = segment_results[dim] = segment_analysis(df, dim, baseline_rate)
    print(result.to_string())

    # The table is sorted by GoodQualityRate descending, so the first rows are
    # the best segments and the last rows the worst.
    highlights = (
        (f"\nTop 3 High-Quality Segments:", result.head(3)),
        (f"\nTop 3 Low-Quality Segments:", result.tail(3)),
    )
    for heading, rows in highlights:
        print(heading)
        for _, row in rows.iterrows():
            # "***" flags segments whose p-value is below 0.05.
            sig_mark = "***" if row['significant'] else ""
            print(f"  {row['segment']}: {row['GoodQualityRate']:.4f} ({row['GoodQualityRate']*100:.2f}%) "
                  f"lift={row['lift']:.2f}x, n={row['leads']} {sig_mark}")

## 6.2 Multivariate Models

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Model features: every candidate column that exists in the dataset.
feature_cols = [
    col
    for col in ['dc_pages', 'design', 'bg_color', 'publisher_zone', 'is_call_center',
                'address_score_bin', 'phone_score_bin', 'is_branded', 'debt_bin',
                'state', 'traffic_type', 'ad_size']
    if col in df.columns
]

print(f"Features used: {feature_cols}")

df_model = df[feature_cols + ['is_good']].copy()

# Treat every feature as a categorical label with an explicit 'missing' level.
# Going through object dtype avoids the error fillna() raises on Categorical
# columns that have no 'missing' category, and casting to str gives numeric
# and boolean columns a uniform label type.
for col in feature_cols:
    raw = df_model[col]
    df_model[col] = raw.astype(object).where(raw.notna(), 'missing').astype(str)

# `columns=` forces one-hot encoding of ALL features. Without it, get_dummies
# only encodes object/category columns, and the `prefix` list then no longer
# matches the number of encoded columns (ValueError).
# dtype=float keeps the matrix numeric regardless of the pandas version.
df_encoded = pd.get_dummies(
    df_model[feature_cols],
    columns=feature_cols,
    prefix=feature_cols,
    dtype=float,
)

# Full one-hot matrix (no reference level dropped) and the binary target.
X = df_encoded.values
y = df_model['is_good'].values

print(f"\nFeature dimensions: {X.shape}")
print(f"Target distribution: {y.sum()} / {len(y)} ({y.mean()*100:.2f}%)")

In [None]:
from statsmodels.api import Logit, add_constant

# Build a reference-coded design matrix for the logistic regression.
# A full set of one-hot columns per feature sums to 1, which is perfectly
# collinear with the intercept and makes the Logit Hessian singular, so the
# first level of every feature is dropped and becomes the reference level.
# Keeping the matrix as a DataFrame gives the fit named parameters.
logit_features = pd.get_dummies(
    df_model[feature_cols].astype(str),
    columns=feature_cols,
    prefix=feature_cols,
    drop_first=True,
    dtype=float,
)
# has_constant='add' guarantees a 'const' column exists so it can be dropped
# by name below.
X_with_const = add_constant(logit_features, has_constant='add')
logit_model = Logit(y, X_with_const)
# NOTE(review): rare levels in high-cardinality features can still cause
# (quasi-)separation; consider grouping them if the fit fails to converge.
logit_result = logit_model.fit(disp=0, maxiter=1000)

print("=" * 60)
print("Logistic Regression Model Results")
print("=" * 60)
print(logit_result.summary())

# Coefficients are log-odds relative to each feature's dropped reference
# level; the intercept is excluded from the ranking.
coefs = logit_result.params.drop('const')
p_values = logit_result.pvalues.drop('const')
feature_importance = pd.DataFrame({
    'feature': coefs.index,
    'coef': coefs.values,
    'p_value': p_values.values
})
feature_importance['abs_coef'] = np.abs(feature_importance['coef'])
feature_importance = feature_importance.sort_values('abs_coef', ascending=False)

print("\nTop 10 Important Features (by absolute coefficient):")
print(feature_importance.head(10).to_string())

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Second probability column -- presumably the positive (`is_good`) class;
# verify against rf_model.classes_.
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)

for line in (
    "=" * 60,
    "Random Forest Model Results",
    "=" * 60,
    f"AUC: {auc:.4f}",
    f"PR-AUC: {pr_auc:.4f}",
):
    print(line)

# One importance score per encoded dummy column, largest first.
rf_importance = pd.DataFrame({
    'feature': df_encoded.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Important Features (Random Forest):")
print(rf_importance.head(10).to_string())

## 6.3 Driver Summary

In [None]:
# Collect, across all dimensions, the best and worst segments that have
# enough volume (>= 50 leads) to act on.
summary_columns = ['dimension', 'segment', 'rate', 'lift', 'leads']
all_high_quality = []
all_low_quality = []

for dim, result_df in segment_results.items():
    # Each table is sorted by GoodQualityRate descending: head = best, tail = worst.
    # For dimensions with fewer than six segments head(3) and tail(3) overlap,
    # so a segment is additionally required to sit on the matching side of the
    # baseline (lift above / below 1). Otherwise the same segment would be
    # recommended both to scale and to cut.
    candidates = (
        (result_df.head(3), all_high_quality, lambda lift: lift > 1),
        (result_df.tail(3), all_low_quality, lambda lift: lift < 1),
    )
    for rows, bucket, on_matching_side in candidates:
        for _, row in rows.iterrows():
            if row['leads'] < 50 or not on_matching_side(row['lift']):
                continue
            bucket.append({
                'dimension': dim,
                'segment': row['segment'],
                'rate': row['GoodQualityRate'],
                'lift': row['lift'],
                'leads': row['leads']
            })

# Explicit columns keep sort_values working when a list ends up empty
# (an empty DataFrame would otherwise have no 'rate' column -> KeyError).
high_df = pd.DataFrame(all_high_quality, columns=summary_columns).sort_values('rate', ascending=False)
low_df = pd.DataFrame(all_low_quality, columns=summary_columns).sort_values('rate', ascending=True)

print("=" * 60)
print("Driver Summary")
print("=" * 60)
print("\nTop 5 High-Quality Segments (Recommend to Scale):")
print(high_df.head(5).to_string())

print("\nTop 5 Low-Quality Segments (Recommend to Cut):")
print(low_df.head(5).to_string())
print("=" * 60)