In [2]:
import pandas as pd

df = pd.read_csv("movie_data.csv")

def manual_chi_square(df, col1, col2):   
    print(f"Chi-Square Test: {col1} vs {col2}")

    table = pd.crosstab(df[col1], df[col2])
    print("\nContingency Table (Observed):")
    print(table)

    observed = table.values

    row_total = observed.sum(axis=1)
    col_total = observed.sum(axis=0)
    grand_total = observed.sum()

    expected = []
    for r in row_total:
        row_exp = []
        for c in col_total:
            row_exp.append((r * c) / grand_total)
        expected.append(row_exp)

    chi_square = 0
    for i in range(len(observed)):
        for j in range(len(observed[0])):
            chi_square += ((observed[i][j] - expected[i][j]) ** 2) / expected[i][j]

    dfree = (len(row_total) - 1) * (len(col_total) - 1)

    print("Expected Table:")
    for row in expected:
        print(row)

    print(f"Chi-Square Value: {chi_square:.4f}")
    print(f"Degrees of Freedom: {dfree}")
    print("Compare chi square with critical value from Chi-square table.\n")

    return chi_square, dfree


# 1. Genre Preference Across Age Groups
manual_chi_square(df, "age_group", "genre")


# 2. Re-watch Behavior vs Liking
#     (re_watched vs liked)
manual_chi_square(df, "re_watched", "liked")


# 3. Sharing Behavior Across Countries
#     (shared vs country)
manual_chi_square(df, "country", "shared")

Chi-Square Test: age_group vs genre

Contingency Table (Observed):
genre      Action  Comedy  Drama  Horror  Romance  Sci-Fi  Thriller
age_group                                                          
18-25          23      32     34      28       30      28        18
26-35          32      25     29      30       33      21        36
36-45          37      25     26      27       23      27        30
46-60          27      30     24      36       33      26        36
60+            30      26     31      25       27      32        23
Expected Table:
[np.float64(28.757), np.float64(26.634), np.float64(27.792), np.float64(28.178), np.float64(28.178), np.float64(25.862), np.float64(27.599)]
[np.float64(30.694), np.float64(28.428), np.float64(29.664), np.float64(30.076), np.float64(30.076), np.float64(27.604), np.float64(29.458)]
[np.float64(29.055), np.float64(26.91), np.float64(28.08), np.float64(28.47), np.float64(28.47), np.float64(26.13), np.float64(27.885)]
[np.float64(31.588), np


Contingency Table (Observed):
liked       False  True 
re_watched              
False         637    214
True           98     51
Expected Table:
[np.float64(625.485), np.float64(225.515)]
[np.float64(109.515), np.float64(39.485)]
Chi-Square Value: 5.3688
Degrees of Freedom: 1
Compare chi square with critical value from Chi-square table.

Chi-Square Test: country vs shared

Contingency Table (Observed):
shared             False  True 
country                        
Afghanistan            5      0
Albania                6      0
Algeria                1      1
American Samoa         3      0
Andorra                3      0
...                  ...    ...
Wallis and Futuna      5      1
Western Sahara         3      0
Yemen                  8      0
Zambia                 4      0
Zimbabwe               1      1

[241 rows x 2 columns]
Expected Table:
[np.float64(4.485), np.float64(0.515)]
[np.float64(5.382), np.float64(0.618)]
[np.float64(1.794), np.float64(0.206)]
[np.float64(2.691),

(np.float64(229.10446338052893), 240)

In [5]:
import pandas as pd

data = pd.read_csv("movie_data.csv")
def chi_square(data, col1, col2, alpha=0.05):

    table = pd.crosstab(data[col1], data[col2])
    print("Observed Table:")
    print(table)

    observed = table.values

    row_total = observed.sum(axis=1)
    col_total = observed.sum(axis=0)
    grand_total = observed.sum()

    expected = []
    for r in row_total:
        row_exp = []
        for c in col_total:
            row_exp.append((r * c) / grand_total)
        expected.append(row_exp)

    print("Expected Table:")
    for row in expected:
        print([float(x) for x in row])

    chi_square_val = 0
    for i in range(len(observed)):
        for j in range(len(observed[0])):
            chi_square_val += ((observed[i][j] - expected[i][j]) ** 2) / expected[i][j]

    chi_square_val = float(chi_square_val)
    
    dfree = (len(row_total) - 1) * (len(col_total) - 1)

    print(f"Chi-Square Value = {chi_square_val:.4f}")
    print(f"Degrees of Freedom = {dfree}")
    
    
    if alpha == 0.10:
        critical = 2.71
    elif alpha == 0.05:
        critical = 3.84
    elif alpha == 0.01:
        critical = 6.63
    else:
        critical = 3.84 
        print(" Unknown alpha, using default (0.05) critical value")

    print(f"Critical Value (alpha={alpha}) = {critical}")


    print("Hypothesis Conclusion")

    if chi_square_val > critical:
        print("Reject H0")
        print(f"'{col1}' and '{col2}' are significantly associated.")
    else:
        print("Fail to Reject H0")
        print(f"No significant association between '{col1}' and '{col2}'.")


    return chi_square_val, dfree, critical

In [6]:
chi_square(data, "country", "shared")

Observed Table:
shared             False  True 
country                        
Afghanistan            5      0
Albania                6      0
Algeria                1      1
American Samoa         3      0
Andorra                3      0
...                  ...    ...
Wallis and Futuna      5      1
Western Sahara         3      0
Yemen                  8      0
Zambia                 4      0
Zimbabwe               1      1

[241 rows x 2 columns]
Expected Table:
[4.485, 0.515]
[5.382, 0.618]
[1.794, 0.206]
[2.691, 0.309]
[2.691, 0.309]
[1.794, 0.206]
[4.485, 0.515]
[0.897, 0.103]
[2.691, 0.309]
[2.691, 0.309]
[4.485, 0.515]
[8.073, 0.927]
[4.485, 0.515]
[7.176, 0.824]
[6.279, 0.721]
[4.485, 0.515]
[4.485, 0.515]
[6.279, 0.721]
[0.897, 0.103]
[3.588, 0.412]
[3.588, 0.412]
[1.794, 0.206]
[1.794, 0.206]
[5.382, 0.618]
[2.691, 0.309]
[8.073, 0.927]
[4.485, 0.515]
[3.588, 0.412]
[2.691, 0.309]
[2.691, 0.309]
[4.485, 0.515]
[2.691, 0.309]
[6.279, 0.721]
[7.176, 0.824]
[3.588, 0.412]
[3.

(229.10446338052893, 240, 3.84)