We apply statistical tests to check whether the sample we created is representative of the population, i.e. whether it could plausibly have been drawn from it.

The distribution of every variable in the sample should not be statistically different from the distribution in the population.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp, ttest_ind, levene, chisquare
from statsmodels.stats.proportion import proportions_ztest
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [2]:
# Machine-specific locations of the full crash dataset and of the sample
# that is being checked against it.
POPULATION_CSV = r"D:\#Great Learning\Capstone Project\Datasets\Traffic_Crashes_-_Crashes.csv"
SAMPLE_CSV = r"D:\#Great Learning\Capstone Project\Datasets\crash_data.csv"

# Read the population first, then the sample.
population, sample = (
    pd.read_csv(csv_path) for csv_path in (POPULATION_CSV, SAMPLE_CSV)
)

# One row per tested column is appended to this list by the test cells.
results = list()

# Names of the columns that already have a result row, so that no column
# is reported twice.
processed = set()

# Numerical Distribution Test

In [3]:
# Compare each non-binary numeric column of the sample against the population.
# The two-sample Kolmogorov-Smirnov p-value drives the conclusion; Welch's
# t-test (equal_var=False) and Levene's test are stored as extra diagnostics.
num_cols = population.select_dtypes(include=['int64', 'float64']).columns

for col in num_cols:
    if col in processed:
        continue

    # Numeric columns with at most two distinct values are skipped here and
    # left for the binary proportion section.
    if population[col].nunique() <= 2:
        continue

    # A column the sample does not contain cannot be compared; record that
    # instead of letting sample[col] raise KeyError and abort the whole loop.
    if col not in sample.columns:
        results.append([col, "Numerical", None, None, None,
                        "Column missing in sample", "Cannot assess"])
        processed.add(col)
        continue

    pop = population[col].dropna()
    smp = sample[col].dropna()

    # Skip if insufficient data
    if len(pop) < 3 or len(smp) < 3:
        results.append([col, "Numerical", None, None, None,
                        "Insufficient data", "Cannot assess"])
        processed.add(col)
        continue

    # Primary test: KS test for distribution equality
    ks_stat, ks_p = ks_2samp(pop, smp)

    # Best-effort diagnostics. Each one is guarded separately so that a
    # failure in Levene's test no longer discards a valid t-test p-value,
    # and KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        _, t_p = ttest_ind(pop, smp, equal_var=False)
    except Exception:
        t_p = None

    try:
        _, lev_p = levene(pop, smp)
    except Exception:
        lev_p = None

    results.append([
        col, "Numerical", ks_p, t_p, lev_p,
        f"KS test (stat={ks_stat:.3f})",
        "Same distribution" if ks_p > 0.05 else "Different distribution"
    ])
    processed.add(col)

# Categorical Test (Chi-Square)

In [4]:
# Chi-square goodness-of-fit test for every object-dtype column: the sample's
# category counts are compared with the counts expected if the sample followed
# the population's category proportions. Missing values form their own "NA"
# category.
cat_cols = population.select_dtypes(include='object').columns

for col in cat_cols:
    if col in processed:
        continue

    # A column the sample does not contain cannot be compared; record that
    # instead of letting sample[col] raise KeyError and abort the whole loop.
    if col not in sample.columns:
        results.append([col, "Categorical", None, None, None,
                        "Column missing in sample", "Cannot assess"])
        processed.add(col)
        continue

    # Get value counts for both datasets
    pop_counts = population[col].fillna("NA").value_counts()
    smp_counts = sample[col].fillna("NA").value_counts()

    # Expected frequencies: population proportions scaled to the sample size.
    expected = (pop_counts / len(population) * len(sample)).to_numpy()

    # Observed frequencies aligned on the population's categories (vectorised;
    # categories the sample never contains get a count of 0).
    observed = smp_counts.reindex(pop_counts.index, fill_value=0).to_numpy()

    # Sample rows whose category never occurs in the population. They are not
    # part of `observed`, so observed and expected totals would disagree,
    # which scipy.stats.chisquare does not accept as valid input.
    unseen = smp_counts.sum() - observed.sum()

    if unseen > 0:
        chi_p = None
        test_desc = (f"Chi-square GOF skipped ({unseen} sample rows in "
                     "categories absent from population)")
        conclusion = "Cannot reliably assess"
    # Run the test only with more than one category and when at least 80% of
    # the expected counts exceed 5.
    elif len(expected) > 1 and (expected > 5).sum() >= len(expected) * 0.8:
        chi_stat, chi_p = chisquare(observed, expected)
        test_desc = f"Chi-square GOF (stat={chi_stat:.3f})"
        conclusion = "Same distribution" if chi_p > 0.05 else "Different distribution"
    else:
        chi_p = None
        test_desc = "Chi-square GOF (warning: low expected frequencies)"
        conclusion = "Cannot reliably assess"

    results.append([
        col, "Categorical", chi_p, None, None,
        test_desc, conclusion
    ])
    processed.add(col)

# Binary Proportion Test

In [5]:
# Two-proportion z-test for every still-unprocessed column that has exactly
# two distinct non-null values in the population (this includes numeric
# columns with exactly 2 unique values).
for col in population.columns:
    if col in processed:
        continue

    pop_vals = population[col].dropna()
    pop_categories = pop_vals.unique()
    if len(pop_categories) != 2:
        continue  # not binary in the population; no row is recorded

    # A column the sample does not contain cannot be compared; record that
    # instead of letting sample[col] raise KeyError and abort the whole loop.
    if col not in sample.columns:
        results.append([
            col, "Binary", None, None, None,
            "Column missing in sample",
            "Cannot assess"
        ])
        processed.add(col)
        continue

    smp_vals = sample[col].dropna()
    if smp_vals.nunique() > 2:
        continue  # sample is not binary; no row is recorded

    # Sample values outside the population's two categories would silently be
    # counted as the negative class and distort the proportion.
    if not smp_vals.isin(pop_categories).all():
        results.append([
            col, "Binary", None, None, None,
            "Two-proportion Z-test skipped (sample has values absent from population)",
            "Cannot assess"
        ])
        processed.add(col)
        continue

    # Choose the class whose proportion is compared: the larger value for
    # non-object dtypes, the first value encountered for object dtype.
    positive_class = sorted(pop_categories)[1] if population[col].dtype != 'object' else pop_categories[0]

    pop_success = (pop_vals == positive_class).sum()
    smp_success = (smp_vals == positive_class).sum()

    count = np.array([smp_success, pop_success])
    nobs = np.array([len(smp_vals), len(pop_vals)])

    try:
        z_stat, z_p = proportions_ztest(count, nobs)
    except Exception:
        z_stat, z_p = np.nan, np.nan

    # A NaN p-value (e.g. the sample column has no non-null values, so its
    # nobs is 0) means the test gave no answer. `nan > 0.05` is False, so
    # without this check such a column would be labelled "Different distribution".
    if pd.isna(z_p):
        results.append([
            col, "Binary", None, None, None,
            "Two-proportion Z-test failed",
            "Cannot assess"
        ])
    else:
        results.append([
            col, "Binary", z_p, None, None,
            f"Two-proportion Z-test (z={z_stat:.3f})",
            "Same distribution" if z_p > 0.05 else "Different distribution"
        ])

    processed.add(col)

# Final Result

In [6]:
# Assemble the per-column results into a table, save it as CSV and print a
# summary followed by the full table.
report = pd.DataFrame(results, columns=[
    "Column", "Type", "Main Test P-value", "T-test p",
    "Levene p", "Test Applied", "Conclusion"
])

# If no test produced a p-value, the column holds only None and gets object
# dtype, on which the `>` / `<=` comparisons below raise TypeError. Coercing
# to float (None -> NaN) is a no-op in the normal case.
report["Main Test P-value"] = pd.to_numeric(report["Main Test P-value"], errors="coerce")

# Sort by p-value to highlight most different columns
report_sorted = report.sort_values("Main Test P-value", na_position='last')

report_sorted.to_csv("final_stats_report.csv", index=False)

# Summary statistics; rows without a p-value are counted as "Cannot assess".
main_p = report["Main Test P-value"]
banner = "="*70

print(banner)
print("POPULATION vs SAMPLE COMPARISON REPORT")
print(banner)
print(f"\nTotal columns analyzed: {len(report)}")
print(f"Same distribution (p > 0.05): {(main_p > 0.05).sum()}")
print(f"Different distribution (p ≤ 0.05): {(main_p <= 0.05).sum()}")
print(f"Cannot assess: {main_p.isna().sum()}")
print("\n" + banner)
print("\nDetailed Results:\n")
print(report_sorted.to_string(index=False))
print("\n" + banner)

POPULATION vs SAMPLE COMPARISON REPORT

Total columns analyzed: 47
Same distribution (p > 0.05): 42
Different distribution (p ≤ 0.05): 0
Cannot assess: 5


Detailed Results:

                       Column        Type  Main Test P-value  T-test p  Levene p                                       Test Applied             Conclusion
                  WORK_ZONE_I Categorical           0.142024       NaN       NaN                        Chi-square GOF (stat=3.904)      Same distribution
             STREET_DIRECTION Categorical           0.169021       NaN       NaN                        Chi-square GOF (stat=6.434)      Same distribution
         ROADWAY_SURFACE_COND Categorical           0.180815       NaN       NaN                        Chi-square GOF (stat=8.874)      Same distribution
           BEAT_OF_OCCURRENCE   Numerical           0.391038  0.211358  0.221534                               KS test (stat=0.002)      Same distribution
              TRAFFICWAY_TYPE Categorical         