In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# NOTE(review): this installs a global filter that hides every warning, including
# any emitted by the statistical tests in this notebook; consider narrowing it
# to the specific warning categories that are known to be noise.
warnings.filterwarnings('ignore')

In [4]:
# Load the dataset that every hypothesis test in this notebook reads from.
data = pd.read_csv('final_data_for_hypothesis.csv')

### Hypothesis Testing

**Null Hypothesis (H0):** The mean sales are the same across all store types.

**Alternative Hypothesis (H1):** At least one store type has a different mean sales compared to the others.


In [12]:
# One-way ANOVA test: does mean Sales differ across Store_Type groups?
alpha = 0.05  # significance level used for both the ANOVA decision and Tukey's FWER

# Build one Sales sample per store type straight from the column instead of
# hard-coding the labels, so the ANOVA covers exactly the same groups that
# Tukey's HSD compares below.
sales_by_store_type = [sales for _, sales in data.groupby('Store_Type')['Sales']]
anova_result = stats.f_oneway(*sales_by_store_type)
print("ANOVA p-value:", anova_result.pvalue)

# Evaluate the decision once; it drives both the post-hoc test and the verdict.
is_significant = anova_result.pvalue < alpha

# post-hoc test (Tukey's HSD) -- run only when the omnibus ANOVA rejects H0
if is_significant:
    tukey_result = pairwise_tukeyhsd(data['Sales'], data['Store_Type'], alpha=alpha)
    print(tukey_result)
print()
if is_significant:
    print("There is a significant difference in sales between store types.")
    print("Reject Null Hypothesis")
else:
    print("There is no significant difference in sales between store types.")
    print("Fail to reject Null Hypothesis")

ANOVA p-value: 0.0
     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
group1 group2   meandiff  p-adj    lower      upper    reject
-------------------------------------------------------------
    S1     S2 -10107.2382   0.0 -10340.7664 -9873.7099   True
    S1     S3   9184.3262   0.0   8936.2044   9432.448   True
    S1     S4  19967.3429   0.0  19766.5766 20168.1092   True
    S2     S3  19291.5643   0.0  18992.7332 19590.3955   True
    S2     S4  30074.5811   0.0  29813.7311  30335.431   True
    S3     S4  10783.0167   0.0  10509.0246 11057.0089   True
-------------------------------------------------------------

There is a significant difference in sales between store types.
Reject Null Hypothesis


### Hypothesis Testing

**Null Hypothesis (H0):** The mean sales are the same across all location types.

**Alternative Hypothesis (H1):** At least one location type has a different mean sales compared to the others.

In [16]:
# One-way ANOVA test: does mean Sales differ across Location_Type groups?
alpha = 0.05  # significance level used for both the ANOVA decision and Tukey's FWER

# Build one Sales sample per location type straight from the column instead of
# hard-coding the labels, so the ANOVA covers exactly the same groups that
# Tukey's HSD compares below.
sales_by_location_type = [sales for _, sales in data.groupby('Location_Type')['Sales']]
anova_result = stats.f_oneway(*sales_by_location_type)
print("ANOVA p-value:", anova_result.pvalue)

# Evaluate the decision once; it drives both the post-hoc test and the verdict.
is_significant = anova_result.pvalue < alpha

# post-hoc test (Tukey's HSD) -- run only when the omnibus ANOVA rejects H0
if is_significant:
    tukey_result = pairwise_tukeyhsd(data['Sales'], data['Location_Type'], alpha=alpha)
    print(tukey_result)
print()
if is_significant:
    print("There is a significant difference in sales between location types.")
    print("Reject Null Hypothesis")
else:
    print("There is no significant difference in sales between location types.")
    print("Fail to reject Null Hypothesis")

ANOVA p-value: 0.0
     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
group1 group2   meandiff  p-adj    lower       upper    reject
--------------------------------------------------------------
    L1     L2  15611.5633   0.0  15399.4575  15823.6692   True
    L1     L3  -8294.0324   0.0   -8541.623  -8046.4419   True
    L1     L4 -12290.8921   0.0 -12666.6281 -11915.1561   True
    L1     L5 -16170.5192   0.0 -16507.1958 -15833.8426   True
    L2     L3 -23905.5958   0.0 -24178.3092 -23632.8824   True
    L2     L4 -27902.4555   0.0  -28295.201   -27509.71   True
    L2     L5 -31782.0825   0.0 -32137.6421 -31426.5229   True
    L3     L4  -3996.8597   0.0  -4409.8503  -3583.8691   True
    L3     L5  -7876.4868   0.0  -8254.2894  -7498.6841   True
    L4     L5  -3879.6271   0.0   -4351.444  -3407.8101   True
--------------------------------------------------------------

There is a significant difference in sales between location types.
Reject Null Hypothesis


### Hypothesis Testing

**Null Hypothesis (H0):** The mean sales are the same across all regions.

**Alternative Hypothesis (H1):** At least one region has a different mean sales compared to the others.

In [15]:
# One-way ANOVA test: does mean Sales differ across Region_Code groups?
alpha = 0.05  # significance level used for both the ANOVA decision and Tukey's FWER

# Build one Sales sample per region straight from the column instead of
# hard-coding the labels, so the ANOVA covers exactly the same groups that
# Tukey's HSD compares below.
sales_by_region = [sales for _, sales in data.groupby('Region_Code')['Sales']]
anova_result = stats.f_oneway(*sales_by_region)
print("ANOVA p-value:", anova_result.pvalue)

# Evaluate the decision once; it drives both the post-hoc test and the verdict.
is_significant = anova_result.pvalue < alpha

# post-hoc test (Tukey's HSD) -- run only when the omnibus ANOVA rejects H0
if is_significant:
    tukey_result = pairwise_tukeyhsd(data['Sales'], data['Region_Code'], alpha=alpha)
    print(tukey_result)
print()
if is_significant:
    print("There is a significant difference in sales between diff regions.")
    print("Reject Null Hypothesis")
else:
    print("There is no significant difference in sales between diff regions.")
    print("Fail to reject Null Hypothesis")

ANOVA p-value: 0.0
    Multiple Comparison of Means - Tukey HSD, FWER=0.05     
group1 group2  meandiff  p-adj    lower      upper    reject
------------------------------------------------------------
    R1     R2 -5665.1479    0.0 -5915.8792 -5414.4166   True
    R1     R3 -3592.7111    0.0 -3857.9476 -3327.4746   True
    R1     R4 -5889.1059    0.0 -6205.3097  -5572.902   True
    R2     R3  2072.4368    0.0  1798.3575   2346.516   True
    R2     R4   -223.958 0.2841  -547.6151    99.6991  False
    R3     R4 -2296.3947    0.0 -2631.4143 -1961.3751   True
------------------------------------------------------------

There is a significant difference in sales between diff regions.
Reject Null Hypothesis


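- Mean sales in regions R2 and R4 are not significantly different (Tukey HSD adjusted p = 0.2841); every other pair of regions differs significantly.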

### Hypothesis Testing

**Null Hypothesis (H0):** Discounts do not affect sales; the mean sales are the same whether there is a discount or not.

**Alternative Hypothesis (H1):** Discounts affect sales; the mean sales are different when there is a discount compared to when there is no discount.

In [17]:
# Compare Sales between rows flagged Discount == 'No' and Discount == 'Yes'.
sales_no_discount = data.loc[data['Discount'] == 'No', 'Sales']
sales_yes_discount = data.loc[data['Discount'] == 'Yes', 'Sales']

# Two-sample t-test; equal_var=False selects Welch's variant, which does not
# assume the two groups share a variance.
t_stat, p_value = stats.ttest_ind(
    sales_no_discount,
    sales_yes_discount,
    equal_var=False,
)

print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:.4f}")

# Report the decision at the 5% significance level.
verdict = (
    "Reject the null hypothesis. There is a significant difference in sales between stores with and without discounts."
    if p_value < 0.05
    else "Fail to reject the null hypothesis. There is no significant difference in sales between stores with and without discounts."
)
print(verdict)

T-statistic: -149.36
P-value: 0.0000
Reject the null hypothesis. There is a significant difference in sales between stores with and without discounts.


### Hypothesis Testing

**Null Hypothesis (H0)**: Holidays do not affect sales; the mean sales are the same on holidays and non-holidays.

**Alternative Hypothesis (H1)**: Holidays affect sales; the mean sales are different on holidays compared to non-holidays.

In [21]:
# Compare Sales between rows with Holiday == 0 and rows with Holiday == 1.
sales_non_holiday = data.loc[data['Holiday'] == 0, 'Sales']
sales_holiday = data.loc[data['Holiday'] == 1, 'Sales']

# Two-sample t-test; equal_var=False selects Welch's variant, which does not
# assume the two groups share a variance.
t_stat, p_value = stats.ttest_ind(
    sales_non_holiday,
    sales_holiday,
    equal_var=False,
)

print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:.4f}")

# Group means give the direction of any difference the test detects.
mean_sales_non_holiday = sales_non_holiday.mean()
mean_sales_holiday = sales_holiday.mean()

print(f"Mean sales on non-holidays: {mean_sales_non_holiday:.2f}")
print(f"Mean sales on holidays: {mean_sales_holiday:.2f}")

# Report the decision at the 5% level, then the direction when significant.
significant = p_value < 0.05
if not significant:
    print("Fail to reject the null hypothesis. There is no significant difference in sales between holidays and non-holidays.")
else:
    print("Reject the null hypothesis. There is a significant difference in sales between holidays and non-holidays.")
    direction = (
        "Sales are significantly higher on holidays than on non-holidays."
        if mean_sales_holiday > mean_sales_non_holiday
        else "Sales are significantly higher on non-holidays than on holidays."
    )
    print(direction)

T-statistic: 67.20
P-value: 0.0000
Mean sales on non-holidays: 43089.53
Mean sales on holidays: 34995.15
Reject the null hypothesis. There is a significant difference in sales between holidays and non-holidays.
Sales are significantly higher on non-holidays than on holidays.
