In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
from scipy.stats import chi2

In [2]:
# Right-tail critical value of a chi-squared curve:
# 12 degrees of freedom, area (alpha) of 0.025 to the right of the value.

df = 12          # degrees of freedom
alpha = 0.025    # area to the right of the critical value

# ppf takes the cumulative (left-hand) area, so the right-tail area is
# converted first: 1 - 0.025 = 0.975.
p = 1 - alpha

# Freeze the distribution at `df` and invert its CDF at p.
chi_squared_value = chi2(df).ppf(p)

print(
    f'degrees of freedom: {df}',
    f"X^2(area{alpha}) for {df} degrees of freedom is: {chi_squared_value:.4f}",
    sep="\n",
)


degrees of freedom: 12
X^2(area0.025) for 12 degrees of freedom is: 23.3367


In [17]:
# Critical value for a chi-squared curve with 10 degrees of freedom and a
# right-tail area (alpha) of 0.05.

# Degrees of freedom and right-tail area
df, alpha = 10, 0.05

# chi2.ppf works with the area to the LEFT of the value, so use the
# complement of alpha (1 - 0.05 = 0.95) as the cumulative probability.
p = 1 - alpha

# Critical value = inverse CDF evaluated at p
chi_squared_value = chi2.ppf(p, df=df)

print(f'degrees of freedom: {df}')
print(f"X^2(area{alpha}) for {df} degrees of freedom is: {chi_squared_value:.4f}")

degrees of freedom: 10
X^2(area0.05) for 10 degrees of freedom is: 18.3070


In [10]:
# Chi-squared goodness-of-fit test: compare observed frequencies against
# expected frequencies, check the usual expected-frequency assumptions, then
# report the test statistic, the critical value and the p-value.

# Observed data
observed = [3, 36, 170, 291]

# Expected data
expected = [6.0, 34.0, 147.5, 312.5]

# Significance level; the critical value is reported at (1 - alpha) confidence.
alpha = 0.05

# Input sanity checks: the statistic pairs categories one-to-one, and a
# goodness-of-fit comparison only makes sense when both sets of frequencies
# add up to the same total.
assert len(observed) == len(expected), "observed and expected must have the same number of categories"
assert math.isclose(sum(observed), sum(expected)), "observed and expected totals must match"

# Check assumptions
# Assumption 1: All expected frequencies are 1 or greater
if all(e >= 1 for e in expected):
    print("Assumption 1 passed: All expected frequencies are 1 or greater.")
else:
    print("Assumption 1 failed: Not all expected frequencies are 1 or greater.")

# Assumption 2: At most, 20% of expected frequencies are less than 5
less_than_five = sum(1 for e in expected if e < 5)
if less_than_five / len(expected) <= 0.2:
    print("Assumption 2 passed: At most 20% of expected frequencies are less than 5.\n")
else:
    print("Assumption 2 failed: More than 20% of expected frequencies are less than 5.\n")

# Compute the chi-squared statistic manually: sum of (O - E)^2 / E
chi_squared_stat = sum((o - e) ** 2 / e for o, e in zip(observed, expected))

# Degrees of freedom: number of categories minus one
degrees_of_freedom = len(observed) - 1

# Critical value: the point with area alpha to its right
critical_value = stats.chi2.ppf(1 - alpha, df=degrees_of_freedom)

# P-value: right-tail area beyond the statistic. The survival function is
# used instead of 1 - cdf, which loses precision when the tail is tiny.
p_value = stats.chi2.sf(chi_squared_stat, df=degrees_of_freedom)

# Output results (the confidence label is derived from alpha so the printed
# text always matches the level actually used)
print(f"Chi-squared Statistic: {chi_squared_stat:.3f}")
print(f"Critical Value ({1 - alpha:.0%} confidence): {critical_value:.3f}")
print(f"P-value: {p_value:.3f}")

Assumption 1 passed: All expected frequencies are 1 or greater.
Assumption 2 passed: At most 20% of expected frequencies are less than 5.

Chi-squared Statistic: 6.529
Critical Value (95% confidence): 7.815
P-value: 0.089


In [12]:
# Chi-squared goodness-of-fit test where the expected frequencies are not
# given directly: they are derived from the observed total and a
# hypothesised distribution (one proportion per category).

# Observed data
observed = [85, 215, 130, 70]

# distribution (hypothesised proportion of each category)
distribution = [0.2, 0.4, 0.3, 0.1]

# Significance level; the critical value is reported at (1 - alpha) confidence.
alpha = 0.05

# Input sanity checks: one proportion per observed category, and the
# proportions must describe a full distribution (sum to 1).
assert len(observed) == len(distribution), "observed and distribution must have the same number of categories"
assert math.isclose(sum(distribution), 1.0), "distribution proportions must sum to 1"

# Total number of observations
total_observed = sum(observed)

# degrees of freedom: number of categories minus one
df = len(observed) - 1

# Expected data: each proportion scaled to the observed total
expected = [total_observed * p for p in distribution]

# Check assumptions
# Assumption 1: All expected frequencies are 1 or greater
if all(e >= 1 for e in expected):
    print("Assumption 1 passed: All expected frequencies are 1 or greater.")
else:
    print("Assumption 1 failed: Not all expected frequencies are 1 or greater.")

# Assumption 2: At most, 20% of expected frequencies are less than 5
less_than_five = sum(1 for e in expected if e < 5)
if less_than_five / len(expected) <= 0.2:
    print("Assumption 2 passed: At most 20% of expected frequencies are less than 5.\n")
else:
    print("Assumption 2 failed: More than 20% of expected frequencies are less than 5.\n")

# Compute the chi-squared statistic manually: sum of (O - E)^2 / E
chi_squared_stat = sum((o - e) ** 2 / e for o, e in zip(observed, expected))

# Same quantity as `df`; kept under this name as well so both names stay
# defined, but computed only once so they cannot drift apart.
degrees_of_freedom = df

# Critical value: the point with area alpha to its right
critical_value = stats.chi2.ppf(1 - alpha, df=degrees_of_freedom)

# P-value: right-tail area beyond the statistic. The survival function is
# used instead of 1 - cdf, which loses precision when the tail is tiny.
p_value = stats.chi2.sf(chi_squared_stat, df=degrees_of_freedom)

# Output results (the confidence label is derived from alpha so the printed
# text always matches the level actually used)
print(f"degrees of freedom: {df}")
print(f"Chi-squared Statistic: {chi_squared_stat:.3f}")
print(f"Critical Value ({1 - alpha:.0%} confidence): {critical_value:.3f}")
print(f"P-value: {p_value:.3f}")

Assumption 1 passed: All expected frequencies are 1 or greater.
Assumption 2 passed: At most 20% of expected frequencies are less than 5.

degrees of freedom: 3
Chi-squared Statistic: 14.042
Critical Value (95% confidence): 7.815
P-value: 0.003


In [11]:
import pandas as pd
from scipy.stats import chi2_contingency, chi2

# Chi-squared test of independence on a two-way table of observed counts.

# Observed counts, keyed by column label; each list holds one value per row
# label given in the index below.
data = {
    "Abstain from drinking": [67, 411, 85, 27],
    "1-60 drinks per month": [213, 633, 51, 60],
    "Over 60 drinks per month": [74, 129, 7, 15],
}
contingency_table = pd.DataFrame(
    data,
    index=["Single", "Married", "Widowed", "Divorced"],
)

# Run the test; the result unpacks into the statistic, the p-value, the
# degrees of freedom and the array of expected frequencies.
chi2_stat, p, dof, expected = chi2_contingency(contingency_table)

# Label the expected frequencies like the observed table, round them for
# display, and place them to the right of the observed counts.
expected_df = pd.DataFrame(
    expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
).round(2)
contingency_with_expected = pd.concat(
    [contingency_table, expected_df.add_suffix(' (Expected)')],
    axis=1,
)

# Significance level (alpha) and the critical value with area alpha to its right
alpha = 0.05
critical_value = chi2.ppf(1 - alpha, dof)

# Assumption 1: All expected frequencies are 1 or greater
print(
    "Assumption 1 passed: All expected frequencies are 1 or greater."
    if (expected >= 1).all()
    else "Assumption 1 failed: Not all expected frequencies are 1 or greater."
)

# Assumption 2: At most, 20% of expected frequencies are less than 5
less_than_five = (expected < 5).sum()
total_frequencies = expected.size  # number of cells in the expected table
print(
    "Assumption 2 passed: At most 20% of expected frequencies are less than 5."
    if less_than_five / total_frequencies <= 0.2
    else "Assumption 2 failed: More than 20% of expected frequencies are less than 5."
)

# Observed and expected counts side by side
print("\nContingency Table with Expected Frequencies:")
print(contingency_with_expected)

# Test summary
print("\nChi-squared Test Statistic:", round(chi2_stat, 2))
print("Critical Value at alpha =", alpha, ":", round(critical_value, 2))
print("P-value:", round(p, 4))
print("Degrees of Freedom:", dof)

# Decision: reject H0 only when the statistic exceeds the critical value
print(
    "\nConclusion: Reject the null hypothesis (H0)."
    if chi2_stat > critical_value
    else "\nConclusion: Fail to reject the null hypothesis (H0)."
)


Assumption 1 passed: All expected frequencies are 1 or greater.
Assumption 2 passed: At most 20% of expected frequencies are less than 5.

Contingency Table with Expected Frequencies:
          Abstain from drinking  1-60 drinks per month  \
Single                       67                    213   
Married                     411                    633   
Widowed                      85                     51   
Divorced                     27                     60   

          Over 60 drinks per month  Abstain from drinking (Expected)  \
Single                          74                            117.87   
Married                        129                            390.56   
Widowed                          7                             47.61   
Divorced                        15                             33.96   

          1-60 drinks per month (Expected)  \
Single                              191.18   
Married                             633.50   
Widowed                   