In [7]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv(r'C:\Users\Sony\Desktop\marketing_clean.csv')

# Display basic info
print("Dataset Shape:", df.shape) 
print("\nFirst 5 rows:")
print(df.head())
print("\nData Summary:")
print(df.describe())

Dataset Shape: (588101, 6)

First 5 rows:
   user_id  test_group  converted  total_ads  most_ads_hour most_ads_day
0  1043981           1          0         54             19       Monday
1  1359756           1          0          2             19       Friday
2  1168777           1          0         35             19       Friday
3  1113243           1          0        109             23       Friday
4  1257407           1          0         15             19       Friday

Data Summary:
            user_id     test_group      converted      total_ads  \
count  5.881010e+05  588101.000000  588101.000000  588101.000000   
mean   1.310692e+06       0.960000       0.025239      24.820876   
std    2.022260e+05       0.195959       0.156850      43.715181   
min    9.000000e+05       0.000000       0.000000       1.000000   
25%    1.143190e+06       1.000000       0.000000       4.000000   
50%    1.313725e+06       1.000000       0.000000      13.000000   
75%    1.484088e+06       1.0

In [8]:
# A/B TEST : CHI-SQUARE TEST
print("\n" + "="*50)
print("A/B TEST ANALYSIS- CHI-SQUARE TEST")
print("="*50)

# Create contingency table
contingency_table = pd.crosstab(df['test_group'], df['converted'])
print("\nContingency Table:")
print(contingency_table)

# Perform chi-square test
chi2, p_value, dof , expected = stats.chi2_contingency(contingency_table)
print(f"\nChi-Square Statistics : {chi2:.4f}")
print(f"p-Value : {p_value:.4f}")
print(f"Degree of freedom: {dof}")

if p_value <0.05:
    print("\n RESULTS : Statistically significant difference(p< 0.05)")
    print("the test group performs diferently than the contral group!")
else :
    print("\nx RESULTS: No statistically significant different(p>= 0.05)")  
    print("No significant difference between test and control groups.")  

# Calculate conversion rates
control_rate= df[df['test_group']==0]['converted'].mean()*100    
test_rate = df[df['test_group']==1]['converted'].mean() * 100
lift = ((test_rate-control_rate)/control_rate) * 100
print(f"\nControl group conversion rate:{control_rate:.2f}%")
print(f"Test Group Conversion Rate: {test_rate:.2f}")
print(f"Lift:{lift:.2f}%")


A/B TEST ANALYSIS- CHI-SQUARE TEST

Contingency Table:
converted        0      1
test_group               
0            23104    420
1           550154  14423

Chi-Square Statistics : 54.0058
p-Value : 0.0000
Degree of freedom: 1

 RESULTS : Statistically significant difference(p< 0.05)
the test group performs diferently than the contral group!

Control group conversion rate:1.79%
Test Group Conversion Rate: 2.55
Lift:43.09%


In [9]:
# TWO-PROPORTION Z-TEST

print("\n" + "="*50)
print("TWO-PROPORTION Z-TEST")
print("="*50)

from statsmodels.stats.proportion import proportions_ztest

control_conversions = df[df['test_group']==0]['converted'].sum()
test_conversions = df[df['test_group']==1]['converted'].sum()
control_n = len(df[df['test_group']==0])
test_n= len(df[df['test_group']==1])

counts= np.array([test_conversions, control_conversions])
nobs= np.array([test_n, control_n])

z_stat, p_value_z= proportions_ztest(counts, nobs)
print(f"Z-statistics:{z_stat:.4f}")
print(f"p-value: {p_value_z:.4f}")

if p_value_z< 0.05:
    print("\n RESULTS: Statistically significant(p<0.05)")
else:
    print("\n RESULT: Not Statistically significant (p>=0.05)")    


TWO-PROPORTION Z-TEST
Z-statistics:7.3701
p-value: 0.0000

 RESULTS: Statistically significant(p<0.05)


In [10]:
# HYPOTHESIS TEST 2:AD FREQUENCY IMPACT
print("\n" + "="*50)
print("AD FREQUENCY IMPACT - T-TEST")
print("="*50)

converted_ads= df[df['converted']==1]['total_ads']
not_converted_ads = df[df['converted']==0]['total_ads']

t_stat, p_value_t = stats.ttest_ind(converted_ads, not_converted_ads)

print(f"T-Statistics: {t_stat:.4f}")
print(f"P-Value: {p_value_t:.4f}")
print(f"\nAverage ads for converted users : {converted_ads.mean():.2f}")
print(f" Average ads for non-converted users: {not_converted_ads.mean():.2f}")

if p_value_t <0.05:
    print("\n RESULT: Significant difference in ad frequency(p<0.05)")
else:
    print("\n RESULT : No significant difference(p>= 0.05)")    

# HYPOTHESIS TEST 3: HOUR IMPACT (ANOVA)

print("\n" + "="*50)
print("HOUR IMPACT-ONE-WAY ANOVA")
print("="*50)

# Group gy hour
hour_groups = [df[df['most_ads_hour']==hour]['converted'].values 
               for hour in df['most_ads_hour'].unique()]

f_stat, p_value_anova = stats.f_oneway(*hour_groups)               

print(f"F-Statistic : {f_stat:.4f}")
print(f"p-Value :{p_value_anova:.4f}")

if p_value_anova < 0.05:
    print("\n RESULT : Hour significantly affects conversion(p<0.05)")
else:
    print("\n RESULT: Hour does not significantly affect conversion(p>=0.05)")    


AD FREQUENCY IMPACT - T-TEST
T-Statistics: 170.8199
P-Value: 0.0000

Average ads for converted users : 83.89
 Average ads for non-converted users: 23.29

 RESULT: Significant difference in ad frequency(p<0.05)

HOUR IMPACT-ONE-WAY ANOVA
F-Statistic : 18.7420
p-Value :0.0000

 RESULT : Hour significantly affects conversion(p<0.05)


In [11]:
# export result

# Metric Summary
metrics_summary = {
    'metric': ['Total users', 'Conversions', 'Conversion Rate','Control Conv Rate', 'Test Conv Rate','Lift%'],

'Value' : [len(df),
           df['converted'].sum(),
           df['converted'].mean()*100,
           df[df['test_group']==0]['converted'].mean() * 100 ,
           df[df['test_group']==1]['converted'].mean() * 100 ,
           ((df[df['test_group']==1]['converted'].mean() -
            df[df['test_group']==0]['converted'].mean())/
            df[df['test_group']==0]['converted'].mean()) * 100]
}
pd.DataFrame(metrics_summary).to_csv('kpi_mrtrics.csv', index=False)