# Hypothesis Testing

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the retail sales CSV directly from its GitHub raw URL (requires network
# access). The cells below read their columns from this `df` frame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Data-Navigators/Statistical_Concept_Excercise/main/data/Retail_sales_dataset.csv"
)

## Hypothesis Testing: Difference in average Total Amount between male and female customers

In [2]:
# Build one 'Total Amount' sample per gender using a single label-based lookup.
male_amounts = df.loc[df['Gender'] == 'Male', 'Total Amount']
female_amounts = df.loc[df['Gender'] == 'Female', 'Total Amount']

# Two-sample t-test comparing the two groups (called with SciPy's default options).
t_stat, p_value = stats.ttest_ind(male_amounts, female_amounts)

# Report the statistic, the p-value, and the decision at the 0.05 threshold,
# followed by one blank line to separate this section from the next.
print(
    "1. Hypothesis Test: Difference in average Total Amount between genders",
    f"T-statistic: {t_stat}",
    f"P-value: {p_value}",
    f"Conclusion: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis",
    sep="\n",
    end="\n\n",
)

1. Hypothesis Test: Difference in average Total Amount between genders
T-statistic: -0.03161341824319852
P-value: 0.9747866634918314
Conclusion: Fail to reject the null hypothesis



## Chi-square test: Relationship between Gender and Product Category

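The chi-square test of independence is a hypothesis-testing technique for categorical variables. Here the null hypothesis is that Gender and Product Category are independent, i.e. the mix of product categories purchased does not depend on the customer's gender.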

In [None]:
# Count how many rows fall into each (Gender, Product Category) combination.
contingency_table = pd.crosstab(df['Gender'], df['Product Category'])

# Chi-square test on the contingency table. The call also returns the degrees
# of freedom and the expected-frequency table; `expected` is not printed here.
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Contingency Table:\n{contingency_table}\n")
# The heading carries the "2." prefix so it matches the numbering used by the
# other test sections ("1. ..." and "3. ...") and the recorded cell output.
print("2. Chi-square Test: Relationship between Gender and Product Category")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
# The null hypothesis is rejected when the p-value is below 0.05.
print(f"Conclusion: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis")
print()

Contingency Table:
Product Category  Beauty  Clothing  Electronics
Gender                                         
Female               166       174          170
Male                 141       177          172

2. Chi-square Test: Relationship between Gender and Product Category
Chi-square statistic: 1.673837085800602
P-value: 0.43304287262068974
Degrees of freedom: 2
Conclusion: Fail to reject the null hypothesis



## T-test: Compare mean Age across two popular Product Categories

In [5]:
# Labels of the two product categories with the highest row counts.
top_categories = df['Product Category'].value_counts().nlargest(2).index

# Age sample for each of those two categories, selected with label-based lookups.
category1_ages = df.loc[df['Product Category'] == top_categories[0], 'Age']
category2_ages = df.loc[df['Product Category'] == top_categories[1], 'Age']

# Two-sample t-test on the two age samples (called with SciPy's default options).
t_stat, p_value = stats.ttest_ind(category1_ages, category2_ages)

# Report which categories were compared, the test result, and the decision
# at the 0.05 threshold.
print(
    "3. T-test: Compare mean Age across two popular Product Categories",
    f"Comparing {top_categories[0]} and {top_categories[1]}",
    f"T-statistic: {t_stat}",
    f"P-value: {p_value}",
    f"Conclusion: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis",
    sep="\n",
)

3. T-test: Compare mean Age across two popular Product Categories
Comparing Clothing and Electronics
T-statistic: 0.20529739367317656
P-value: 0.837400242059152
Conclusion: Fail to reject the null hypothesis
