In [1]:
import pandas as pd
 
# Observed counts: one column per genre, one row per age group.
data = {
    "Action":  [23, 32, 37, 27, 30],
    "Comedy":  [32, 25, 25, 30, 26],
    "Drama":   [34, 29, 26, 24, 31],
    "Horror":  [28, 30, 27, 36, 25],
    "Romance": [30, 33, 23, 33, 27],
    "Sci-Fi":  [28, 21, 27, 26, 32],
    "Thriller":[18, 36, 30, 36, 23]
}

contingency = pd.DataFrame(
    data,
    index=["18-25", "26-35", "36-45", "46-60", "60+"]
)

# Print the label on its own line: passing the frame as a second argument to
# the same print() puts the label in front of the header row and shifts the
# column names out of alignment with the data rows.
print("Observed Table:")
print(contingency)


Observed Table:        Action  Comedy  Drama  Horror  Romance  Sci-Fi  Thriller
18-25      23      32     34      28       30      28        18
26-35      32      25     29      30       33      21        36
36-45      37      25     26      27       23      27        30
46-60      27      30     24      36       33      26        36
60+        30      26     31      25       27      32        23


In [2]:
# Marginal totals of the observed table.
row_totals = contingency.sum(axis=1)
col_totals = contingency.sum(axis=0)
grand_total = contingency.values.sum()

# Expected count under independence:
#     E[r, c] = row_total[r] * col_total[c] / grand_total
# Building the frame from a numeric outer product keeps every column float64.
# Filling an empty DataFrame cell by cell leaves the columns as object dtype,
# which is why the values were printed with inconsistent precision.
expected = pd.DataFrame(
    row_totals.to_numpy()[:, None] * col_totals.to_numpy()[None, :] / grand_total,
    index=contingency.index,
    columns=contingency.columns,
)

# Label on its own line so the column header stays aligned with the rows.
print("Expected Table:")
print(expected)


Expected Table:        Action  Comedy   Drama  Horror Romance  Sci-Fi Thriller
18-25  28.757  26.634  27.792  28.178  28.178  25.862   27.599
26-35  30.694  28.428  29.664  30.076  30.076  27.604   29.458
36-45  29.055   26.91   28.08   28.47   28.47   26.13   27.885
46-60  31.588  29.256  30.528  30.952  30.952  28.408   30.316
60+    28.906  26.772  27.936  28.324  28.324  25.996   27.742


In [3]:
import numpy as np

# Pearson chi-square statistic: sum over all cells of (O - E)^2 / E.
squared_residuals = (contingency - expected) ** 2
chi2_stat = (squared_residuals / expected).to_numpy().sum()
print("Chi-square Statistic:", chi2_stat)


Chi-square Statistic: 22.192815287081388


In [4]:
# Degrees of freedom of an r x c contingency table: (r - 1) * (c - 1).
rows = contingency.shape[0]
cols = contingency.shape[1]
dof = (rows - 1) * (cols - 1)
print("Degrees of Freedom:", dof)


Degrees of Freedom: 24


In [5]:
from scipy.stats import chi2

# Upper-tail probability of the chi-square distribution at the observed statistic.
p_value = chi2.sf(chi2_stat, df=dof)
print("p-value:", p_value)


p-value: 0.5677594189221105


In [7]:
# Recap the three test quantities, then state the decision at the 5% level.
for label, value in (
    ("Chi-square Statistic:", chi2_stat),
    ("Degrees of Freedom:", dof),
    ("p-value:", p_value),
):
    print(label, value)

verdict = (
    "Reject H0 → Age group and genre are related."
    if p_value < 0.05
    else "Fail to reject H0 → No evidence of relation."
)
print(verdict)


Chi-square Statistic: 22.192815287081388
Degrees of Freedom: 24
p-value: 0.5677594189221105
Fail to reject H0 → No evidence of relation.


In [8]:
import numpy as np
from math import sqrt
from scipy.stats import chi2   

# Worked example of the chi-square test on a 2x2 table of observed counts.
observed = np.array([[40, 20],
                     [10, 30]])

row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
grand_total = observed.sum()

# Expected counts under independence: outer product of the margins divided by
# the grand total (vectorized equivalent of the per-cell formula).
expected = np.outer(row_totals, col_totals) / grand_total

# Pearson statistic: sum of (O - E)^2 / E over every cell.
chi_square = ((observed - expected) ** 2 / expected).sum()

df = (observed.shape[0] - 1) * (observed.shape[1] - 1)

alpha = 0.05
critical_value = chi2.ppf(1 - alpha, df)

print("Observed Table:\n", observed)
print("Expected Table:\n", expected.round(2))
print("Chi-Square Value:", round(chi_square, 3))
print("Degrees of Freedom:", df)
# Build the label from alpha so it cannot drift from the level actually used.
print(f"Critical Value ({alpha}):", round(critical_value, 3))

if chi_square > critical_value:
    print("Reject Null Hypothesis (H0) ")
else:
    print("Fail to Reject Null Hypothesis (H0)")


Observed Table:
 [[40 20]
 [10 30]]
Expected Table:
 [[30. 30.]
 [20. 20.]]
Chi-Square Value: 16.667
Degrees of Freedom: 1
Critical Value (0.05): 3.841
Reject Null Hypothesis (H0) 


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import chi2

# Load movie_2.csv into a DataFrame; the chi-square helper below reads its columns.
data = pd.read_csv("movie_2.csv") 

def chi_square_test(data, row_var, col_var, alpha=0.05):
    """Print a Pearson chi-square test of independence for two columns of ``data``.

    The two columns are cross-tabulated, expected counts are derived from the
    table margins, and the statistic is compared with the chi-square critical
    value at significance level ``alpha``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains both columns.
    row_var : str
        Column whose categories form the rows of the contingency table.
    col_var : str
        Column whose categories form the columns of the contingency table.
    alpha : float, default 0.05
        Significance level used for the critical value.

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    ValueError
        If either variable has fewer than two categories, because the test
        has no degrees of freedom in that case.
    """
    # NOTE(review): this helper is declared in more than one cell of the
    # notebook; a single definition would be enough.
    contingency = pd.crosstab(data[row_var], data[col_var])
    observed = contingency.values

    # Guard on the shape rather than on df: an empty (0, 0) table would
    # otherwise yield (0 - 1) * (0 - 1) = 1 degree of freedom.
    if min(observed.shape) < 2:
        raise ValueError(
            f"Chi-square test needs at least a 2x2 table; "
            f"{row_var!r} x {col_var!r} gives shape {observed.shape}."
        )

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    grand_total = observed.sum()

    # Expected count under independence: row total * column total / grand total.
    expected = np.outer(row_totals, col_totals) / grand_total

    # Pearson statistic: sum of (O - E)^2 / E over every cell.
    chi_square = ((observed - expected) ** 2 / expected).sum()

    df = (observed.shape[0] - 1) * (observed.shape[1] - 1)
    critical_value = chi2.ppf(1 - alpha, df)

    # Common rule of thumb: the chi-square approximation is doubtful when
    # expected counts fall below 5.
    sparse_cells = int((expected < 5).sum())

    result = "Reject H₀ → Variables are dependent (significant relation)" \
             if chi_square > critical_value else \
             "Fail to Reject H₀ → Variables are independent (no relation)"

    print(f"Chi-Square Test between {row_var} and {col_var}")
    print("Observed Table:\n", contingency)
    print("Expected Table:\n", pd.DataFrame(expected,
                                            index=contingency.index,
                                            columns=contingency.columns).round(2))
    print("Chi-Square Value:", round(chi_square, 3))
    print("Degrees of Freedom:", df)
    # The label follows alpha instead of a hard-coded 0.05.
    print(f"Critical Value ({alpha}):", round(critical_value, 3))
    if sparse_cells:
        print(f"Warning: {sparse_cells} of {expected.size} expected counts are below 5; "
              "the chi-square approximation may be unreliable.")
    print("Conclusion:", result)

# Cross-tabulate age_group against genre and print the chi-square report.
chi_square_test(data, "age_group", "genre")


Chi-Square Test between age_group and genre
Observed Table:
 genre      Action  Comedy  Drama  Horror  Romance  Sci-Fi  Thriller
age_group                                                          
18-25          23      32     34      28       30      28        18
26-35          32      25     29      30       33      21        36
36-45          37      25     26      27       23      27        30
46-60          27      30     24      36       33      26        36
60+            30      26     31      25       27      32        23
Expected Table:
 genre      Action  Comedy  Drama  Horror  Romance  Sci-Fi  Thriller
age_group                                                          
18-25       28.76   26.63  27.79   28.18    28.18   25.86     27.60
26-35       30.69   28.43  29.66   30.08    30.08   27.60     29.46
36-45       29.06   26.91  28.08   28.47    28.47   26.13     27.88
46-60       31.59   29.26  30.53   30.95    30.95   28.41     30.32
60+         28.91   26.77  27.94   28.

In [10]:
import pandas as pd
import numpy as np
from scipy.stats import chi2

# Load movie_2.csv into a DataFrame (the same file is also read in other cells).
data = pd.read_csv("movie_2.csv") 

def chi_square_test(data, row_var, col_var, alpha=0.05):
    """Print a Pearson chi-square test of independence for two columns of ``data``.

    The two columns are cross-tabulated, expected counts are derived from the
    table margins, and the statistic is compared with the chi-square critical
    value at significance level ``alpha``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains both columns.
    row_var : str
        Column whose categories form the rows of the contingency table.
    col_var : str
        Column whose categories form the columns of the contingency table.
    alpha : float, default 0.05
        Significance level used for the critical value.

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    ValueError
        If either variable has fewer than two categories, because the test
        has no degrees of freedom in that case.
    """
    # NOTE(review): this helper is declared in more than one cell of the
    # notebook; a single definition would be enough.
    contingency = pd.crosstab(data[row_var], data[col_var])
    observed = contingency.values

    # Guard on the shape rather than on df: an empty (0, 0) table would
    # otherwise yield (0 - 1) * (0 - 1) = 1 degree of freedom.
    if min(observed.shape) < 2:
        raise ValueError(
            f"Chi-square test needs at least a 2x2 table; "
            f"{row_var!r} x {col_var!r} gives shape {observed.shape}."
        )

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    grand_total = observed.sum()

    # Expected count under independence: row total * column total / grand total.
    expected = np.outer(row_totals, col_totals) / grand_total

    # Pearson statistic: sum of (O - E)^2 / E over every cell.
    chi_square = ((observed - expected) ** 2 / expected).sum()

    df = (observed.shape[0] - 1) * (observed.shape[1] - 1)
    critical_value = chi2.ppf(1 - alpha, df)

    # Common rule of thumb: the chi-square approximation is doubtful when
    # expected counts fall below 5.
    sparse_cells = int((expected < 5).sum())

    result = "Reject H₀ → Variables are dependent (significant relation)" \
             if chi_square > critical_value else \
             "Fail to Reject H₀ → Variables are independent (no relation)"

    print(f"Chi-Square Test between {row_var} and {col_var}")
    print("Observed Table:\n", contingency)
    print("Expected Table:\n", pd.DataFrame(expected,
                                            index=contingency.index,
                                            columns=contingency.columns).round(2))
    print("Chi-Square Value:", round(chi_square, 3))
    print("Degrees of Freedom:", df)
    # The label follows alpha instead of a hard-coded 0.05.
    print(f"Critical Value ({alpha}):", round(critical_value, 3))
    if sparse_cells:
        print(f"Warning: {sparse_cells} of {expected.size} expected counts are below 5; "
              "the chi-square approximation may be unreliable.")
    print("Conclusion:", result)

# Cross-tabulate re_watched against liked and print the chi-square report.
chi_square_test(data, "re_watched", "liked")

Chi-Square Test between re_watched and liked
Observed Table:
 liked       False  True 
re_watched              
False         637    214
True           98     51
Expected Table:
 liked        False   True 
re_watched                
False       625.48  225.52
True        109.52   39.48
Chi-Square Value: 5.369
Degrees of Freedom: 1
Critical Value (0.05): 3.841
Conclusion: Reject H₀ → Variables are dependent (significant relation)


In [11]:
import pandas as pd
import numpy as np
from scipy.stats import chi2

# Load movie_2.csv into a DataFrame (the same file is also read in other cells).
data = pd.read_csv("movie_2.csv") 

def chi_square_test(data, row_var, col_var, alpha=0.05):
    """Print a Pearson chi-square test of independence for two columns of ``data``.

    The two columns are cross-tabulated, expected counts are derived from the
    table margins, and the statistic is compared with the chi-square critical
    value at significance level ``alpha``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains both columns.
    row_var : str
        Column whose categories form the rows of the contingency table.
    col_var : str
        Column whose categories form the columns of the contingency table.
    alpha : float, default 0.05
        Significance level used for the critical value.

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    ValueError
        If either variable has fewer than two categories, because the test
        has no degrees of freedom in that case.
    """
    # NOTE(review): this helper is declared in more than one cell of the
    # notebook; a single definition would be enough.
    contingency = pd.crosstab(data[row_var], data[col_var])
    observed = contingency.values

    # Guard on the shape rather than on df: an empty (0, 0) table would
    # otherwise yield (0 - 1) * (0 - 1) = 1 degree of freedom.
    if min(observed.shape) < 2:
        raise ValueError(
            f"Chi-square test needs at least a 2x2 table; "
            f"{row_var!r} x {col_var!r} gives shape {observed.shape}."
        )

    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    grand_total = observed.sum()

    # Expected count under independence: row total * column total / grand total.
    expected = np.outer(row_totals, col_totals) / grand_total

    # Pearson statistic: sum of (O - E)^2 / E over every cell.
    chi_square = ((observed - expected) ** 2 / expected).sum()

    df = (observed.shape[0] - 1) * (observed.shape[1] - 1)
    critical_value = chi2.ppf(1 - alpha, df)

    # Common rule of thumb: the chi-square approximation is doubtful when
    # expected counts fall below 5 (as happens with many sparse categories).
    sparse_cells = int((expected < 5).sum())

    result = "Reject H₀ → Variables are dependent (significant relation)" \
             if chi_square > critical_value else \
             "Fail to Reject H₀ → Variables are independent (no relation)"

    print(f"Chi-Square Test between {row_var} and {col_var}")
    print("Observed Table:\n", contingency)
    print("Expected Table:\n", pd.DataFrame(expected,
                                            index=contingency.index,
                                            columns=contingency.columns).round(2))
    print("Chi-Square Value:", round(chi_square, 3))
    print("Degrees of Freedom:", df)
    # The label follows alpha instead of a hard-coded 0.05.
    print(f"Critical Value ({alpha}):", round(critical_value, 3))
    if sparse_cells:
        print(f"Warning: {sparse_cells} of {expected.size} expected counts are below 5; "
              "the chi-square approximation may be unreliable.")
    print("Conclusion:", result)

# Cross-tabulate country against shared and print the chi-square report.
chi_square_test(data, "country", "shared")


Chi-Square Test between country and shared
Observed Table:
 shared             False  True 
country                        
Afghanistan            5      0
Albania                6      0
Algeria                1      1
American Samoa         3      0
Andorra                3      0
...                  ...    ...
Wallis and Futuna      5      1
Western Sahara         3      0
Yemen                  8      0
Zambia                 4      0
Zimbabwe               1      1

[241 rows x 2 columns]
Expected Table:
 shared             False  True 
country                        
Afghanistan         4.49   0.52
Albania             5.38   0.62
Algeria             1.79   0.21
American Samoa      2.69   0.31
Andorra             2.69   0.31
...                  ...    ...
Wallis and Futuna   5.38   0.62
Western Sahara      2.69   0.31
Yemen               7.18   0.82
Zambia              3.59   0.41
Zimbabwe            1.79   0.21

[241 rows x 2 columns]
Chi-Square Value: 229.104
Degrees of Freedo