In [2]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global RNG so every run draws the same synthetic sample.
np.random.seed(42)

num_rows = 250
eye_colors = ['Blue', 'Green', 'Brown', 'Hazel']
genders = ['Male', 'Female']
marital_statuses = ['Single', 'Married', 'Divorced']
age_groups = ['18-25', '26-35', '36-45', '46-55', '56+']
income_levels = ['Low', 'Medium', 'High']

# Column name -> allowed category labels. Insertion order matters: the columns
# are sampled in exactly this order, which fixes the sequence of RNG draws.
category_levels = {
    'Eye Color': eye_colors,
    'Gender': genders,
    'Marital Status': marital_statuses,
    'Age Group': age_groups,
    'Income Level': income_levels,
}

# Draw `num_rows` independent labels per column (uniform over each label list).
data = {
    column: np.random.choice(levels, num_rows)
    for column, levels in category_levels.items()
}
df = pd.DataFrame(data)

# Collapse identical rows into one row per distinct combination, with the
# number of occurrences stored in a 'Count' column.
combination_counts = df.value_counts()
df_counts = combination_counts.reset_index(name='Count')

# Persist the aggregated table (without the index) and echo it.
df_counts.to_csv('large_chi_square_dataset.csv', index=False)

print("Larger dataset with 5 columns and 250 rows saved as 'large_chi_square_dataset.csv':")
print(df_counts)


Larger dataset with 5 columns and 250 rows saved as 'large_chi_square_dataset.csv':
    Eye Color  Gender Marital Status Age Group Income Level  Count
0       Green  Female         Single       56+       Medium      4
1       Brown    Male        Married     46-55          Low      3
2       Green  Female        Married     36-45         High      3
3       Brown  Female         Single     26-35         High      3
4       Brown    Male       Divorced     46-55          Low      3
..        ...     ...            ...       ...          ...    ...
178     Brown    Male       Divorced     26-35         High      1
179     Brown    Male       Divorced     26-35       Medium      1
180     Brown    Male       Divorced     36-45         High      1
181     Brown    Male       Divorced     46-55       Medium      1
182     Hazel    Male         Single       56+       Medium      1

[183 rows x 6 columns]


In [9]:
import pandas as pd
from scipy.stats import chisquare, chi2_contingency
import itertools

# Load the aggregated counts table that the data-generation cell wrote to
# 'large_chi_square_dataset.csv'. Reading that same file (rather than a
# separately named copy) keeps the notebook reproducible from a fresh kernel.
df_counts = pd.read_csv('large_chi_square_dataset.csv')

print("Dataset Loaded:")
print(df_counts)

def chi_square_gof(column_name, data=None, alpha=0.05):
    """Chi-square goodness-of-fit test of one column against a uniform distribution.

    The observed frequency of each category is the sum of the 'Count' column
    over all rows sharing that category; the expected frequency is the grand
    total split equally across the categories.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to test.
    data : pandas.DataFrame, optional
        Table containing `column_name` and a 'Count' column. When omitted,
        the notebook-level `df_counts` frame is used, so existing
        one-argument calls behave as before.
    alpha : float, optional
        Significance level for the reject / fail-to-reject decision
        (default 0.05).

    Returns
    -------
    dict
        Keys 'Chi-Squared Statistic', 'p-value', 'Degrees of Freedom'
        (number of categories minus one) and 'Conclusion'.
    """
    # Fall back to the notebook-level table only when no frame is passed in.
    if data is None:
        data = df_counts

    observed_counts = data.groupby(column_name)['Count'].sum()
    total_counts = observed_counts.sum()
    num_categories = len(observed_counts)

    # Null hypothesis: every category is equally likely.
    expected_counts = [total_counts / num_categories] * num_categories
    chi2_stat, p_value = chisquare(observed_counts, expected_counts)

    # Named `dof` (not `df`) so it cannot be confused with a DataFrame.
    dof = num_categories - 1
    conclusion = "Reject the null hypothesis." if p_value < alpha else "Fail to reject the null hypothesis."
    return {
        'Chi-Squared Statistic': chi2_stat,
        'p-value': p_value,
        'Degrees of Freedom': dof,
        'Conclusion': conclusion
    }

# Categorical columns that each get their own goodness-of-fit test.
columns_to_test = ['Eye Color', 'Gender', 'Marital Status', 'Age Group', 'Income Level']

# Run every test first, keyed by column name, then report them all.
results_gof = {name: chi_square_gof(name) for name in columns_to_test}

# Each printed label is identical to the key it is read from in the result dict.
gof_report_fields = ('Chi-Squared Statistic', 'p-value', 'Degrees of Freedom', 'Conclusion')

for col, result in results_gof.items():
    print(f"\nChi-Square Goodness of Fit Test for {col}:")
    for field in gof_report_fields:
        print(f"{field}: {result[field]}")

def chi_square_independence(col1, col2, data=None, alpha=0.05):
    """Chi-square test of independence between two categorical columns.

    Builds a contingency table whose cells are the summed 'Count' values for
    each (col1, col2) combination; combinations that never occur are treated
    as zero. The table is then passed to `scipy.stats.chi2_contingency`.

    Parameters
    ----------
    col1, col2 : str
        Names of the two categorical columns to cross-tabulate.
    data : pandas.DataFrame, optional
        Table containing both columns and a 'Count' column. When omitted,
        the notebook-level `df_counts` frame is used, so existing
        two-argument calls behave as before.
    alpha : float, optional
        Significance level for the reject / fail-to-reject decision
        (default 0.05).

    Returns
    -------
    dict
        Keys 'Chi-Squared Statistic', 'p-value', 'Degrees of Freedom',
        'Conclusion' and 'Expected Frequencies' (the expected-count array
        returned by `chi2_contingency`).

    Notes
    -----
    `chi2_contingency` is called with its defaults, which per the SciPy
    documentation apply Yates' continuity correction when the table has one
    degree of freedom (i.e. 2x2).
    """
    # Fall back to the notebook-level table only when no frame is passed in.
    if data is None:
        data = df_counts

    # Sum counts per cell; absent combinations come back as NaN -> 0.
    contingency_table = pd.crosstab(
        data[col1], data[col2], values=data['Count'], aggfunc='sum'
    ).fillna(0)

    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
    conclusion = "Reject the null hypothesis." if p_value < alpha else "Fail to reject the null hypothesis."
    return {
        'Chi-Squared Statistic': chi2_stat,
        'p-value': p_value,
        'Degrees of Freedom': dof,
        'Conclusion': conclusion,
        'Expected Frequencies': expected
    }

# Every unordered pair of the tested columns, in the order itertools yields them.
column_pairs = list(itertools.combinations(columns_to_test, 2))

# Run all pairwise tests first, keyed by the (col1, col2) tuple, then report.
results_independence = {
    (first, second): chi_square_independence(first, second)
    for first, second in column_pairs
}

# Each printed label is identical to the key it is read from in the result dict.
independence_report_fields = ('Chi-Squared Statistic', 'p-value', 'Degrees of Freedom', 'Conclusion')

for (col1, col2), result in results_independence.items():
    print(f"\nChi-Square Test of Independence between {col1} and {col2}:")
    for field in independence_report_fields:
        print(f"{field}: {result[field]}")
    # The expected-count array goes on its own line(s) below its heading.
    print("Expected Frequencies:")
    print(result['Expected Frequencies'])


Dataset Loaded:
    Eye Color  Gender Marital Status Age Group Income Level  Count
0       Green  Female         Single       56+       Medium      4
1       Brown    Male        Married     46-55          Low      3
2       Green  Female        Married     36-45         High      3
3       Brown  Female         Single     26-35         High      3
4       Brown    Male       Divorced     46-55          Low      3
..        ...     ...            ...       ...          ...    ...
178     Brown    Male       Divorced     26-35         High      1
179     Brown    Male       Divorced     26-35       Medium      1
180     Brown    Male       Divorced     36-45         High      1
181     Brown    Male       Divorced     46-55       Medium      1
182     Hazel    Male         Single       56+       Medium      1

[183 rows x 6 columns]

Chi-Square Goodness of Fit Test for Eye Color:
Chi-Squared Statistic: 1.6800000000000002
p-value: 0.6413893691403016
Degrees of Freedom: 3
Conclusion: Fail