# 04: 9.6% Target Scenario Simulation

## Question 3: Can we increase quality from 8.0% to 9.6%?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned lead table and pin the two reference rates that the
# remaining cells read as notebook-level globals.
df = pd.read_pickle('df_cleaned.pkl')
baseline_rate = df['is_good'].mean()
target_rate = 0.096

# Emit the whole header in one write; each list entry is one output line.
print("\n".join([
    "=" * 60,
    "Scenario Simulation: Can we reach 9.6% target?",
    "=" * 60,
    f"\nCurrent Baseline GoodQualityRate: {baseline_rate:.4f} ({baseline_rate*100:.2f}%)",
    f"Target GoodQualityRate: {target_rate:.4f} ({target_rate*100:.2f}%)",
    # Absolute gap in percentage points, then the same gap relative to baseline.
    f"Required improvement: {(target_rate - baseline_rate)*100:.2f} percentage points",
    f"Improvement magnitude: {((target_rate - baseline_rate)/baseline_rate*100):.2f}%",
    "=" * 60,
]))

## 7.1 Confirm Current Baseline

In [None]:
# Compare the measured baseline with the 8.0% figure quoted in the question.
print(
    f"Current GoodQualityRate: {baseline_rate:.4f} ({baseline_rate*100:.2f}%)",
    f"Question mentions 8.0% baseline: 0.0800 (8.00%)",
    f"Difference: {abs(baseline_rate - 0.08)*100:.2f} percentage points",
    sep="\n",
)

# Raw counts of each outcome flag over the full lead table.
all_leads = len(df)
good_quality_count, closed_count, bad_count = (
    df[flag].sum() for flag in ('is_good', 'is_closed', 'is_bad')
)

# Each core metric is its flag count divided by the total number of leads.
GoodQualityRate, CloseRate, BadRate = (
    count / all_leads
    for count in (good_quality_count, closed_count, bad_count)
)

print(f"\nThree core metrics:")
print("\n".join(
    f"{rank}. {label}: {value:.4f} ({value*100:.2f}%)"
    for rank, (label, value) in enumerate(
        [
            ("GoodQualityRate", GoodQualityRate),
            ("CloseRate", CloseRate),
            ("BadRate", BadRate),
        ],
        start=1,
    )
))

## 7.2 Find Scalable High-Quality Supply

In [None]:
def find_high_quality_segments(df, segment_col, target_rate, min_volume=50, baseline=None):
    """Return the segments of ``segment_col`` whose good-quality rate meets the target.

    Args:
        df: Lead table containing ``segment_col`` and an ``is_good`` column.
        segment_col: Name of the column to segment by. Rows where it is missing
            are treated as one segment labelled ``'missing'``.
        target_rate: Minimum ``is_good`` mean a segment needs to be reported.
        min_volume: Segments with fewer rows than this are skipped.
        baseline: Rate used as the denominator of ``lift``. When omitted, the
            notebook-level ``baseline_rate`` is used.

    Returns:
        DataFrame with columns ``segment``, ``rate``, ``volume``,
        ``volume_share`` and ``lift``, sorted by ``rate`` descending. The frame
        is empty (but keeps those columns) when no segment qualifies.
    """
    if baseline is None:
        baseline = baseline_rate

    columns = ['segment', 'rate', 'volume', 'volume_share', 'lift']
    total = len(df)
    results = []

    for segment in df[segment_col].unique():
        # NaN never compares equal to itself, so missing values need isna().
        if pd.isna(segment):
            mask = df[segment_col].isna()
            segment_name = 'missing'
        else:
            mask = df[segment_col] == segment
            segment_name = str(segment)

        n = int(mask.sum())
        if n < min_volume:
            continue

        rate = df.loc[mask, 'is_good'].mean()
        if rate >= target_rate:
            results.append({
                'segment': segment_name,
                'rate': rate,
                'volume': n,
                'volume_share': n / total,
                'lift': rate / baseline,
            })

    # Explicit columns keep the 'rate' column present when nothing qualifies;
    # without them sort_values raises KeyError on the empty frame.
    return pd.DataFrame(results, columns=columns).sort_values('rate', ascending=False)

# Scan every candidate dimension that exists in the data; the mapping keeps
# the order in which the dimensions are listed here.
high_quality_segments = {
    column: find_high_quality_segments(df, column, target_rate)
    for column in (
        'dc_pages',
        'publisher_zone',
        'is_call_center',
        'address_score_bin',
        'phone_score_bin',
        'traffic_type',
    )
    if column in df.columns
}

print("=" * 60, "High-Quality Segments (rate >= 9.6%)", "=" * 60, sep="\n")
for dim, result_df in high_quality_segments.items():
    # Dimensions with no qualifying segment are left out of the report.
    if not len(result_df):
        continue
    print(f"\nDimension: {dim}", result_df.to_string(), sep="\n")

## 7.3 Scenario Simulation

### Scenario A: Cut the Tail (Remove Worst Traffic)

In [None]:
def scenario_a_cut_tail(df, cut_percentages=(5, 10, 15, 20), target=None):
    """Simulate dropping the lowest-ranked share of leads and re-measuring quality.

    Leads are ranked by ``is_good`` ascending and the first ``cut_pct`` percent
    are removed.

    NOTE(review): the ranking key is the outcome label itself, so every cut
    removes only bad leads for as long as any remain. The resulting rates are a
    best-case bound, not something a real traffic filter can reproduce without
    knowing outcomes in advance. Confirm this is the intended reading.

    Args:
        df: Lead table with an ``is_good`` column.
        cut_percentages: Percentages of total volume to remove, one result row
            per entry.
        target: Rate a scenario must reach to be flagged. When omitted, the
            notebook-level ``target_rate`` is used.

    Returns:
        DataFrame with one row per cut percentage and the columns
        ``cut_percentage``, ``new_rate``, ``remaining_volume``,
        ``volume_drop_pct`` and ``reached_target``.
    """
    if target is None:
        target = target_rate

    total = len(df)
    # The ordering does not depend on the cut size, so sort once up front.
    ranked = df['is_good'].sort_values().reset_index(drop=True)

    results = []
    for cut_pct in cut_percentages:
        cut_n = int(total * cut_pct / 100)
        remaining = ranked.iloc[cut_n:]

        new_rate = remaining.mean()
        remaining_volume = len(remaining)
        # An empty table has nothing to drop; avoid dividing by zero.
        volume_drop = (total - remaining_volume) / total * 100 if total else 0.0

        results.append({
            'cut_percentage': cut_pct,
            'new_rate': new_rate,
            'remaining_volume': remaining_volume,
            'volume_drop_pct': volume_drop,
            'reached_target': bool(new_rate >= target),
        })

    return pd.DataFrame(results)

# Run Scenario A with its default cut percentages and print the result table
# framed by separator rules.
scenario_a_results = scenario_a_cut_tail(df)
print(
    "=" * 60,
    "Scenario A: Cut the Tail",
    "=" * 60,
    scenario_a_results.to_string(),
    "=" * 60,
    sep="\n",
)

### Scenario B: Structural Reallocation (Budget Reallocation)

In [None]:
def _mean_or_default(values, default):
    """Return the mean of ``values``, or ``default`` when it has no usable entries."""
    return values.mean() if values.count() else default


def scenario_b_reallocation(df, max_increase=0.3, max_decrease=0.3, target=None, baseline=None):
    """Estimate the blended rate after moving volume from weak to strong publisher zones.

    Zones with at least 50 leads are split into "high" (rate at or above the
    target) and "low" (rate below 80% of the baseline); everything else is
    "other". Volume is then moved from the low group to the high group, and the
    moved leads are assumed to convert at the high group's current rate.

    Args:
        df: Lead table with ``is_good`` and, ideally, ``publisher_zone``. When
            the zone column is absent no volume is moved.
        max_increase: Largest allowed growth of the high group, as a fraction
            of its current volume.
        max_decrease: Largest share of the low group's volume that may be cut.
        target: Rate that defines a high zone and the success flag. When
            omitted, the notebook-level ``target_rate`` is used.
        baseline: Rate used for the low-zone threshold and as the stand-in rate
            for empty groups. When omitted, the notebook-level
            ``baseline_rate`` is used.

    Returns:
        Dict with ``max_increase``, ``new_rate``, ``reached_target``,
        ``high_volume_increase`` and ``low_volume_decrease``.
    """
    if target is None:
        target = target_rate
    if baseline is None:
        baseline = baseline_rate

    total_volume = len(df)
    is_good = df['is_good']

    if 'publisher_zone' in df.columns:
        zone_stats = df.groupby('publisher_zone')['is_good'].agg(['mean', 'count']).reset_index()
        # Ignore zones too small for their rate to be trusted.
        zone_stats = zone_stats[zone_stats['count'] >= 50]

        high_segments = zone_stats.loc[zone_stats['mean'] >= target, 'publisher_zone'].tolist()
        low_segments = zone_stats.loc[zone_stats['mean'] < baseline * 0.8, 'publisher_zone'].tolist()

        high_mask = df['publisher_zone'].isin(high_segments)
        low_mask = df['publisher_zone'].isin(low_segments)
    else:
        # Without zone information every lead belongs to the "other" group.
        high_mask = pd.Series(False, index=df.index)
        low_mask = pd.Series(False, index=df.index)

    high_values = is_good[high_mask]
    low_values = is_good[low_mask]
    other_values = is_good[~(high_mask | low_mask)]

    high_volume_current = high_values.count()
    low_volume_current = low_values.count()
    other_volume = total_volume - high_volume_current - low_volume_current

    # An empty group has an undefined mean; fall back to the baseline so a
    # zero-volume group contributes 0 rather than NaN to the blended total.
    high_rate = _mean_or_default(high_values, baseline)
    low_rate = _mean_or_default(low_values, baseline)
    other_rate = _mean_or_default(other_values, baseline)

    # The shift is limited by both what the low group can give up and what the
    # high group is allowed to absorb.
    max_increase_volume = int(high_volume_current * max_increase)
    cut_volume = min(low_volume_current * max_decrease, max_increase_volume)

    new_high_volume = high_volume_current + cut_volume
    new_low_volume = low_volume_current - cut_volume

    new_total_good = (new_high_volume * high_rate +
                      new_low_volume * low_rate +
                      other_volume * other_rate)
    new_rate = new_total_good / total_volume if total_volume else float('nan')

    return {
        'max_increase': max_increase,
        'new_rate': new_rate,
        'reached_target': bool(new_rate >= target),
        'high_volume_increase': cut_volume,
        'low_volume_decrease': cut_volume
    }

# Run Scenario B with a 30% growth cap on the high-quality zones and list every
# field of the returned summary, one "key: value" line each.
scenario_b_result = scenario_b_reallocation(df, max_increase=0.3)
print(
    "=" * 60,
    "Scenario B: Structural Reallocation (High-Quality Segments Max +30%)",
    "=" * 60,
    sep="\n",
)
print(*(f"{key}: {value}" for key, value in scenario_b_result.items()), sep="\n")
print("=" * 60)

### Scenario C: Rule-Based Filtering (Address/Phone Gating)

In [None]:
def _find_score_column(df, keyword):
    """Return the last column naming ``keyword`` and 'score' but not 'bin', else None."""
    match = None
    for col in df.columns:
        # str() keeps non-string column labels from raising on .lower().
        name = str(col).lower()
        if keyword in name and 'score' in name and 'bin' not in name:
            match = col
    return match


def _gate_summary(df, mask, label, target):
    """Summarise the leads kept by ``mask``: rate, volume, volume drop and target flag."""
    kept = df.loc[mask, 'is_good']
    total = len(df)
    volume = len(kept)
    rate = kept.mean()
    return {
        'filter': label,
        'new_rate': rate,
        'remaining_volume': volume,
        # An empty table has nothing to drop; avoid dividing by zero.
        'volume_drop_pct': (total - volume) / total * 100 if total else 0.0,
        'reached_target': bool(rate >= target),
    }


def scenario_c_gating(df, phone_threshold=4, address_threshold=4, target=None):
    """Simulate keeping only leads whose phone and/or address score passes a threshold.

    The score columns are located by name: any column containing 'phone' (or
    'address') and 'score' but not 'bin'. Up to three gates are evaluated,
    depending on which columns exist: phone only, address only, and both.
    Rows whose score is missing fail the ``>=`` comparison and are filtered out.

    Args:
        df: Lead table with an ``is_good`` column and optional score columns.
        phone_threshold: Minimum phone score to keep a lead.
        address_threshold: Minimum address score to keep a lead.
        target: Rate a gate must reach to be flagged. When omitted, the
            notebook-level ``target_rate`` is used.

    Returns:
        DataFrame with one row per evaluated gate and the columns ``filter``,
        ``new_rate``, ``remaining_volume``, ``volume_drop_pct`` and
        ``reached_target``. Empty when neither score column is present.
    """
    if target is None:
        target = target_rate

    phone_score_col = _find_score_column(df, 'phone')
    address_score_col = _find_score_column(df, 'address')
    has_phone = phone_score_col is not None
    has_address = address_score_col is not None

    results = []

    if has_phone:
        phone_mask = df[phone_score_col] >= phone_threshold
        results.append(_gate_summary(
            df, phone_mask, f'PhoneScore >= {phone_threshold}', target))

    if has_address:
        address_mask = df[address_score_col] >= address_threshold
        results.append(_gate_summary(
            df, address_mask, f'AddressScore >= {address_threshold}', target))

    if has_phone and has_address:
        results.append(_gate_summary(
            df,
            phone_mask & address_mask,
            f'PhoneScore >= {phone_threshold} AND AddressScore >= {address_threshold}',
            target,
        ))

    return pd.DataFrame(results)

# Run Scenario C with the default score thresholds and print the result table
# framed by separator rules.
scenario_c_results = scenario_c_gating(df)
print(
    "=" * 60,
    "Scenario C: Rule-Based Filtering (Score Gating)",
    "=" * 60,
    scenario_c_results.to_string(),
    "=" * 60,
    sep="\n",
)

## 7.4 Business Summary: Is It Worth It?

In [None]:
print("=" * 60)
print("Scenario Simulation Summary")
print("=" * 60)

# Gather every simulated outcome as (label, rate, reached_target) so the best
# result can be reported whether or not any scenario hits the target.
candidates = [
    (
        f"Scenario A: Remove worst {row['cut_percentage']}% traffic",
        row['new_rate'],
        bool(row['reached_target']),
    )
    for idx, row in scenario_a_results.iterrows()
]
candidates.append((
    # Take the cap from the result so the label cannot drift from the run.
    f"Scenario B: Budget reallocation (high-quality segments +{scenario_b_result['max_increase']:.0%})",
    scenario_b_result['new_rate'],
    bool(scenario_b_result['reached_target']),
))
candidates.extend(
    (f"Scenario C: {row['filter']}", row['new_rate'], bool(row['reached_target']))
    for idx, row in scenario_c_results.iterrows()
)

reaching = [item for item in candidates if item[2]]
target_label = f"{target_rate * 100:.1f}%"

if reaching:
    # Highest rate among the scenarios that reach the target; ties keep the
    # earliest scenario.
    winner = max(reaching, key=lambda item: item[1])
    best_scenario = winner[0]
    best_rate = winner[1]
    print(f"\n✓ Can reach {target_label} target")
    print(f"Best scenario: {best_scenario}")
    print(f"Estimated new quality: {best_rate:.4f} ({best_rate*100:.2f}%)")
else:
    best_scenario = None
    # Report the best simulated rate, not just the baseline; undefined (NaN)
    # rates from empty filters are ignored.
    simulated_rates = [rate for label, rate, reached in candidates if pd.notna(rate)]
    best_rate = max([baseline_rate] + simulated_rates)
    print(f"\n✗ Cannot reach {target_label} target")
    print(f"Current maximum achievable: {best_rate:.4f} ({best_rate*100:.2f}%)")
    print(f"Bottleneck: Insufficient high-quality segment supply or limited scalability")

# NOTE(review): $30 -> $33 is a 10% rise. The earlier label said 20%, which
# matches the quality lift from 8.0% to 9.6% rather than the CPL change.
# Confirm whether the intended target CPL is $33 or $36.
current_cpl = 30
target_cpl = 33
cpl_increase = (target_cpl - current_cpl) / current_cpl

print(f"\nCPL Impact Analysis:")
print(f"Current CPL: ${current_cpl}")
print(f"Target CPL: ${target_cpl} ({cpl_increase:.0%} increase)")
print(f"Quality improvement: {((target_rate - baseline_rate)/baseline_rate*100):.2f}%")
print(f"If quality improvement can bring CPL increase, need to evaluate volume change impact")

print("=" * 60)