# 02: Lead Quality Trend Analysis

## Question 1: Is lead quality improving/declining over time? Is it statistically significant?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which
# also hides any convergence or divide-by-zero warnings raised by the
# statistical code below -- confirm that is intended.
warnings.filterwarnings('ignore')

# Load the analysis dataset from a local pickle file and report its dimensions.
# NOTE(review): unpickling can execute arbitrary code; only load a file from a
# trusted source. Presumably this file is written by an earlier cleaning
# notebook -- TODO confirm.
df = pd.read_pickle('df_cleaned.pkl')
print(f"Data shape: {df.shape}")

## 5.1 Aggregate by Day and Week, Plot Trends

In [None]:
# Per-date totals: sum and non-null count of is_good, plus sums of is_closed
# and is_bad. The flattened column names follow the aggregation order.
daily_agg_spec = {'is_good': ['sum', 'count'], 'is_closed': 'sum', 'is_bad': 'sum'}
daily_stats = df.groupby('date').agg(daily_agg_spec).reset_index()
daily_stats.columns = ['date', 'good_count', 'total_count', 'closed_count', 'bad_count']

# Daily rates: each count as a share of that date's total_count.
daily_rate_sources = [
    ('GoodQualityRate', 'good_count'),
    ('CloseRate', 'closed_count'),
    ('BadRate', 'bad_count'),
]
for rate_col, count_col in daily_rate_sources:
    daily_stats[rate_col] = daily_stats[count_col] / daily_stats['total_count']

# Smoothed series: unweighted mean of the current and up to 6 preceding daily
# rates (rows, not calendar days). min_periods=1 keeps the first rows defined.
daily_rolling_targets = [
    ('GoodQualityRate', 'GoodQualityRate_7d'),
    ('CloseRate', 'CloseRate_7d'),
    ('BadRate', 'BadRate_7d'),
]
for rate_col, rolling_col in daily_rolling_targets:
    daily_stats[rolling_col] = daily_stats[rate_col].rolling(window=7, min_periods=1).mean()

from scipy.stats import binom
def calc_ci(n, p, alpha=0.05):
    """Return a normal-approximation confidence interval for a proportion.

    Args:
        n: Number of observations the proportion is based on.
        p: Observed proportion.
        alpha: Two-sided significance level; the interval has nominal
            coverage ``1 - alpha``.

    Returns:
        A ``(lower, upper)`` tuple clipped to the range [0, 1]. When ``n`` is
        zero there is nothing to estimate and ``(0, 0)`` is returned.
    """
    if n == 0:
        return (0, 0)
    # Half-width = critical z value times the standard error of the proportion.
    half_width = stats.norm.ppf(1 - alpha/2) * np.sqrt(p * (1 - p) / n)
    return (max(0, p - half_width), min(1, p + half_width))

# Confidence interval for each day's GoodQualityRate. Each interval is
# computed once and then split into its two bounds, instead of calling
# calc_ci twice per row through a row-wise DataFrame.apply.
daily_ci_bounds = [
    calc_ci(day_total, day_rate)
    for day_total, day_rate in zip(daily_stats['total_count'], daily_stats['GoodQualityRate'])
]
daily_stats['GoodQualityRate_ci_lower'] = [lower for lower, _ in daily_ci_bounds]
daily_stats['GoodQualityRate_ci_upper'] = [upper for _, upper in daily_ci_bounds]

print("Daily statistics (first 10 days):")
print(daily_stats.head(10))

In [None]:
# Per-week totals: sum and non-null count of is_good, plus sums of is_closed
# and is_bad. The flattened column names follow the aggregation order.
weekly_agg_spec = {'is_good': ['sum', 'count'], 'is_closed': 'sum', 'is_bad': 'sum'}
weekly_stats = df.groupby('week').agg(weekly_agg_spec).reset_index()
weekly_stats.columns = ['week', 'good_count', 'total_count', 'closed_count', 'bad_count']

# Weekly rates: each count as a share of that week's total_count.
weekly_rate_sources = [
    ('GoodQualityRate', 'good_count'),
    ('CloseRate', 'closed_count'),
    ('BadRate', 'bad_count'),
]
for rate_col, count_col in weekly_rate_sources:
    weekly_stats[rate_col] = weekly_stats[count_col] / weekly_stats['total_count']

print("Weekly statistics:")
print(weekly_stats)

In [None]:
# One stacked panel per metric: raw daily rate plus its 7-row rolling mean.
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
ax1, ax2, ax3 = axes

plot_dates = daily_stats['date']
panel_specs = [
    (ax1, 'GoodQualityRate', 'GoodQualityRate_7d', 'GoodQualityRate Trend (Daily)'),
    (ax2, 'CloseRate', 'CloseRate_7d', 'CloseRate Trend (Daily)'),
    (ax3, 'BadRate', 'BadRate_7d', 'BadRate Trend (Daily)'),
]

for panel, rate_col, rolling_col, panel_title in panel_specs:
    panel.plot(plot_dates, daily_stats[rate_col], 'o-', alpha=0.6, label='Daily Rate', markersize=4)
    panel.plot(plot_dates, daily_stats[rolling_col], '-', linewidth=2, label='7-Day Rolling Mean', color='red')
    if panel is ax1:
        # Only the GoodQualityRate panel has interval columns to shade.
        panel.fill_between(plot_dates, daily_stats['GoodQualityRate_ci_lower'],
                           daily_stats['GoodQualityRate_ci_upper'], alpha=0.2, label='95% CI')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_ylabel('Rate', fontsize=12)
    if panel is ax3:
        # The x-axis label is only placed on the bottom panel.
        panel.set_xlabel('Date', fontsize=12)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('trend_daily.png', dpi=300, bbox_inches='tight')
plt.show()

## 5.2 Statistical Significance Tests

### Method 1: Two-Segment Comparison (First Half vs Second Half)

In [None]:
from scipy.stats import fisher_exact

# Order leads by date, then cut the row sequence (not the calendar range) at
# its midpoint so both segments hold roughly the same number of leads.
df_sorted = df.sort_values('date').reset_index(drop=True)
mid_point = len(df_sorted) // 2
first_half, second_half = df_sorted.iloc[:mid_point], df_sorted.iloc[mid_point:]

# Size, good-lead count and good-lead rate of each segment.
n_first, n_second = len(first_half), len(second_half)
count_first, count_second = first_half['is_good'].sum(), second_half['is_good'].sum()
rate_first, rate_second = first_half['is_good'].mean(), second_half['is_good'].mean()

print("=" * 60)
print("Two-Segment Comparison Analysis (First Half vs Second Half)")
print("=" * 60)
print(f"\nFirst Half:")
print(f"  Sample size: {n_first}")
print(f"  GoodQualityRate: {rate_first:.4f} ({rate_first*100:.2f}%)")
print(f"  Good quality count: {count_first}")

print(f"\nSecond Half:")
print(f"  Sample size: {n_second}")
print(f"  GoodQualityRate: {rate_second:.4f} ({rate_second*100:.2f}%)")
print(f"  Good quality count: {count_second}")

# Absolute difference in rate, followed by the change relative to the first half.
print(f"\nDifference: {rate_second - rate_first:.4f} ({((rate_second - rate_first)/rate_first*100):.2f}%)")

# Two-proportion z-test on the good-lead share of the two segments.
counts = np.array([count_first, count_second])
nobs = np.array([n_first, n_second])
z_stat, p_value = proportions_ztest(counts, nobs)

print(f"\nTwo-Proportion Z-Test:")
print(f"  z-statistic: {z_stat:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Significance: {'Significant' if p_value < 0.05 else 'Not significant'} (α=0.05)")

# Fisher's exact test on the 2x2 table of [good, not good] per segment.
contingency_table = [[good, total - good] for good, total in zip(counts, nobs)]
oddsratio, p_fisher = fisher_exact(contingency_table)
print(f"\nFisher Exact Test:")
print(f"  Odds ratio: {oddsratio:.4f}")
print(f"  p-value: {p_fisher:.4f}")
print(f"  Significance: {'Significant' if p_fisher < 0.05 else 'Not significant'} (α=0.05)")
print("=" * 60)

### Method 2: Trend Regression (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from statsmodels.api import Logit, add_constant

# Regress the per-lead is_good flag on day_index to test for a time trend.
X = df_sorted[['day_index']].values
y = df_sorted['is_good'].values

# statsmodels' Logit does not add an intercept on its own. Fitting on day_index
# alone would force P(good) = 0.5 at day_index 0, so the slope would mostly
# absorb the baseline rate instead of measuring a trend. An explicit constant
# column is therefore prepended; has_constant='add' guarantees the column
# layout [const, day_index] regardless of the data.
X_design = add_constant(X, prepend=True, has_constant='add')

logit_model = Logit(y, X_design)
logit_result = logit_model.fit(disp=0)

print("=" * 60)
print("Trend Regression Analysis (Logistic Regression)")
print("=" * 60)
print(logit_result.summary())

# Index 0 is the intercept; index 1 is the day_index slope (change in the
# log-odds of a good lead per unit of day_index).
coef = logit_result.params[1]
p_value_coef = logit_result.pvalues[1]

print(f"\nTime coefficient: {coef:.6f}")
print(f"p-value: {p_value_coef:.4f}")
print(f"Significance: {'Significant' if p_value_coef < 0.05 else 'Not significant'} (α=0.05)")
print(f"Trend direction: {'Increasing' if coef > 0 else 'Decreasing'}")
print("=" * 60)

### Method 3: Trend Test Only in Score Coverage Period

In [None]:
from statsmodels.api import Logit, add_constant

# Repeat the trend regression on the subset of leads whose address_score_bin
# is not 'missing'. Skipped when the column is absent or the subset has 100
# rows or fewer.
if 'address_score_bin' in df.columns:
    df_with_scores = df[df['address_score_bin'] != 'missing'].copy()

    if len(df_with_scores) > 100:
        print("=" * 60)
        print("Trend Analysis in Score Coverage Period")
        print("=" * 60)
        print(f"Score coverage period sample size: {len(df_with_scores)}")
        print(f"Score coverage period GoodQualityRate: {df_with_scores['is_good'].mean():.4f} ({df_with_scores['is_good'].mean()*100:.2f}%)")

        # Re-base the day index so it starts at 0 on the first date of the subset.
        df_with_scores_sorted = df_with_scores.sort_values('date').reset_index(drop=True)
        df_with_scores_sorted['day_index_score'] = (df_with_scores_sorted['date'] - df_with_scores_sorted['date'].min()).dt.days

        X_score = df_with_scores_sorted[['day_index_score']].values
        y_score = df_with_scores_sorted['is_good'].values

        # statsmodels' Logit does not add an intercept on its own. Without one
        # the model is pinned to P(good) = 0.5 on the first day and the slope
        # no longer measures a trend, so a constant column is prepended.
        # has_constant='add' guarantees the layout [const, day_index_score].
        X_score_design = add_constant(X_score, prepend=True, has_constant='add')

        logit_model_score = Logit(y_score, X_score_design)
        logit_result_score = logit_model_score.fit(disp=0)

        # Index 0 is the intercept; index 1 is the time slope.
        score_coef = logit_result_score.params[1]
        score_p_value = logit_result_score.pvalues[1]

        print(f"\nTime coefficient: {score_coef:.6f}")
        print(f"p-value: {score_p_value:.4f}")
        print(f"Significance: {'Significant' if score_p_value < 0.05 else 'Not significant'} (α=0.05)")
        print("=" * 60)

## 5.3 Trend Analysis Conclusion Summary

In [None]:
# Recap of the headline numbers. Relies on first_half, second_half, p_value
# and p_value_coef, which are set by the earlier significance-test cells.
overall_rate = df['is_good'].mean()
first_half_rate, second_half_rate = (
    segment['is_good'].mean() for segment in (first_half, second_half)
)

print("=" * 60)
print("Trend Analysis Conclusion Summary")
print("=" * 60)
print(f"\nOverall GoodQualityRate: {overall_rate:.4f} ({overall_rate*100:.2f}%)")
print(f"First Half GoodQualityRate: {first_half_rate:.4f} ({first_half_rate*100:.2f}%)")
print(f"Second Half GoodQualityRate: {second_half_rate:.4f} ({second_half_rate*100:.2f}%)")
# Direction is decided by comparing the two point estimates only; the test
# results are reported separately on the lines that follow.
print(f"\nChange direction: {'Improving' if second_half_rate > first_half_rate else 'Declining' if second_half_rate < first_half_rate else 'No significant change'}")
# Absolute change in rate, followed by the change relative to the first half.
print(f"Change magnitude: {abs(second_half_rate - first_half_rate):.4f} ({abs((second_half_rate - first_half_rate)/first_half_rate*100):.2f}%)")
print(f"\nStatistical significance (z-test): p={p_value:.4f}, {'Significant' if p_value < 0.05 else 'Not significant'}")
print(f"Statistical significance (logistic): p={p_value_coef:.4f}, {'Significant' if p_value_coef < 0.05 else 'Not significant'}")
print("=" * 60)