# Import libraries

In [1]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Load your cleaned data
df = pd.read_csv("./Data/clean_website_wata.csv") 
df

Unnamed: 0,Page Views,Session Duration,Bounce Rate,Traffic Source,Time on Page,Previous Visits,Conversion Rate
0,4,3.429316,0.003910,social,8.478174,0,0.01
1,4,1.621052,0.003980,organic,9.636170,2,0.01
2,5,3.629279,0.001805,organic,2.071925,3,0.01
3,5,4.235843,0.002915,paid,1.960654,5,0.01
4,3,4.541868,0.004207,social,3.438712,2,0.01
...,...,...,...,...,...,...,...
1883,9,3.577507,0.003522,paid,4.128122,3,0.01
1884,1,2.724513,0.002072,referral,1.324206,2,0.01
1885,3,0.392856,0.000956,organic,3.824416,1,0.01
1886,3,0.393319,0.002783,paid,5.037584,2,0.01


# 1. Hypothesis Testing (t-test: comparing average session duration between two traffic sources)

In [5]:
source1 = 'organic'
source2 = 'paid'

duration_source1 = df[df['Traffic Source'] == source1]['Session Duration']
duration_source2 = df[df['Traffic Source'] == source2]['Session Duration']

t_stat, p_value = stats.ttest_ind(duration_source1, duration_source2, equal_var=False) # Unequal variances assumed

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print(f"There is a statistically significant difference in session duration between {source1} and {source2}.")
else:
    print(f"There is no statistically significant difference in session duration between {source1} and {source2}.")

T-statistic: 1.8314422743193164
P-value: 0.06737867747221464
There is no statistically significant difference in session duration between organic and paid.


# 2. ANOVA (comparing average conversion rates across multiple traffic sources)

In [6]:
model = ols('Q("Conversion Rate") ~ Q("Traffic Source")', data=df).fit() #statsmodels handles the categorical data for you.
anova_table = sm.stats.anova_lm(model, typ=2)

print("\nANOVA Table (Conversion Rates by Traffic Source):")
print(anova_table)


ANOVA Table (Conversion Rates by Traffic Source):
                       sum_sq      df         F    PR(>F)
Q("Traffic Source")  0.000002     4.0  0.937965  0.440879
Residual             0.000857  1883.0       NaN       NaN


# 3. Chi-Square Test (relationship between Traffic Source and High Bounce Rate)

In [7]:
df['High Bounce'] = (df['Bounce Rate'] > df['Bounce Rate'].median()).astype(int)

contingency_table = pd.crosstab(df['Traffic Source'], df['High Bounce'])

chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f"\nChi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(expected)

if p < 0.05:
    print("There is a statistically significant relationship between Traffic Source and High Bounce Rate.")
else:
    print("There is no statistically significant relationship between Traffic Source and High Bounce Rate.")


Chi-Square Statistic: 3.9808362178242636
P-value: 0.4086055982953637
Degrees of Freedom: 4
Expected Frequencies:
[[103.  103. ]
 [369.5 369.5]
 [201.5 201.5]
 [142.  142. ]
 [128.  128. ]]
There is no statistically significant relationship between Traffic Source and High Bounce Rate.


# 4. Correlation test (Pearson correlation between session duration and page views)

In [8]:
correlation, p_value = stats.pearsonr(df['Session Duration'], df['Page Views'])

print(f"\nPearson Correlation Coefficient: {correlation}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between Session Duration and Page Views.")
else:
    print("There is no statistically significant correlation between Session Duration and Page Views.")


Pearson Correlation Coefficient: -0.002252450277255582
P-value: 0.9220857095303984
There is no statistically significant correlation between Session Duration and Page Views.


# 5. Regression Analysis (OLS regression to predict conversion rate)

In [9]:
model = ols('Q("Conversion Rate") ~ Q("Session Duration") + Q("Page Views") + Q("Previous Visits") + Q("Bounce Rate")', data=df).fit()
print(model.summary())

                             OLS Regression Results                             
Dep. Variable:     Q("Conversion Rate")   R-squared:                       0.071
Model:                              OLS   Adj. R-squared:                  0.069
Method:                   Least Squares   F-statistic:                     35.87
Date:                  Wed, 12 Mar 2025   Prob (F-statistic):           6.34e-29
Time:                          10:56:26   Log-Likelihood:                 11176.
No. Observations:                  1888   AIC:                        -2.234e+04
Df Residuals:                      1883   BIC:                        -2.231e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept 