# Question 1
- Determine whether the interval and the class label have some correlation or not? (where, significance value = 0.1)
- F → Attribute, K → Class
---
|SAMPLE| F| K|
|---|---|---|
|1 |1 |1|
|2 |3 |2|
|3 |7 |1|
|4 |8 |1|
|5 |9 |1|
|6 |11 |2|
|7 |23 |2|
|8 |37| 1|
|9| 39| 2|
|10| 45| 1|
|11 |46 |1|
|12 |59 |1|
---

In [8]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import math

In [23]:
def chi_merge(df, col, target, max_intervals):
    intervals = [[val] for val in sorted(df[col].unique())]
    
    while len(intervals) > max_intervals:
        chi2_values = []
        
        for i in range(len(intervals) - 1):
            combined = intervals[i] + intervals[i + 1]
            contingency_table = pd.crosstab(pd.cut(df[col], bins=[-np.inf] + combined + [np.inf]), df[target])
            chi2, p, dof, expected = chi2_contingency(contingency_table)
            chi2_values.append((chi2, i))
        
        chi2_values.sort()
        min_chi2, min_index = chi2_values[0]
        
        intervals[min_index] = intervals[min_index] + intervals[min_index + 1]
        del intervals[min_index + 1]
    
    bins = [-np.inf] + [max(interval) for interval in intervals] + [np.inf]
    return bins

In [21]:
data = {
    'F': [1, 3, 7, 8, 9, 11, 23, 37, 39, 45, 46, 59],
    'K': [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]
}

df = pd.DataFrame(data)

n = len(df)
sturges_bins = math.ceil(math.log2(n) + 1)
sqrt_bins = math.ceil(math.sqrt(n))

print(f"Number of bins (Sturges' formula): {sturges_bins}")
print(f"Number of bins (Square root choice): {sqrt_bins}")

max_intervals = sturges_bins  

bins_f = chi_merge(df, 'F', 'K', max_intervals)

df['F_binned'] = pd.cut(df['F'], bins=bins_f)

contingency_table_f = pd.crosstab(df['F_binned'], df['K'])
chi2_f, p_f, dof_f, expected_f = chi2_contingency(contingency_table_f)

alpha = 0.1

if p_f < alpha:
    print("There is a significant correlation between the interval of F and the class label (reject H0)")
else:
    print("There is no significant correlation between the interval of F and the class label (fail to reject H0)")

print(f"Chi2 Statistic for F: {chi2_f}")
print(f"P-value for F: {p_f}")
print(f"Degrees of Freedom for F: {dof_f}")
print(f"Expected Frequencies for F: \n{expected_f}")
print(f"Bins for F: {bins_f}")

df

Number of bins (Sturges' formula): 5
Number of bins (Square root choice): 4
There is no significant correlation between the interval of F and the class label (fail to reject H0)
Chi2 Statistic for F: 3.0000000000000004
P-value for F: 0.5578254003710746
Degrees of Freedom for F: 4
Expected Frequencies for F: 
[[0.66666667 0.33333333]
 [2.66666667 1.33333333]
 [0.66666667 0.33333333]
 [1.33333333 0.66666667]
 [2.66666667 1.33333333]]
Bins for F: [-inf, np.int64(1), np.int64(9), np.int64(11), np.int64(37), np.int64(59), inf]


Unnamed: 0,F,K,F_binned
0,1,1,"(-inf, 1.0]"
1,3,2,"(1.0, 9.0]"
2,7,1,"(1.0, 9.0]"
3,8,1,"(1.0, 9.0]"
4,9,1,"(1.0, 9.0]"
5,11,2,"(9.0, 11.0]"
6,23,2,"(11.0, 37.0]"
7,37,1,"(11.0, 37.0]"
8,39,2,"(37.0, 59.0]"
9,45,1,"(37.0, 59.0]"


# Question 2 
- Determine whether the interval and the class label have some correlation or not? (where,significance value = 0.1)
- X, Y → Attribute, C → Class
#### Data
|X |Y| C|
|---|---|---|
|1 |2 |A|
|5 |8 |B|
|9 |14| A|
|13 |20 |B|
|17 |26 |A|
|21 |32 |B|
|24 |36 |A|

In [24]:
data = {
    'X': [1, 5, 9, 13, 17, 21, 24],
    'Y': [2, 8, 14, 20, 26, 32, 36],
    'C': ['A', 'B', 'A', 'B', 'A', 'B', 'A']
}

df = pd.DataFrame(data)

n = len(df)
sturges_bins = math.ceil(math.log2(n) + 1)
sqrt_bins = math.ceil(math.sqrt(n))

print(f"Number of bins : {sturges_bins}")

max_intervals = sturges_bins

bins_x = chi_merge(df, 'X', 'C', max_intervals)
bins_y = chi_merge(df, 'Y', 'C', max_intervals)

df['X_binned'] = pd.cut(df['X'], bins=bins_x)
df['Y_binned'] = pd.cut(df['Y'], bins=bins_y)

contingency_table_x = pd.crosstab(df['X_binned'], df['C'])
chi2_x, p_x, dof_x, expected_x = chi2_contingency(contingency_table_x)

contingency_table_y = pd.crosstab(df['Y_binned'], df['C'])
chi2_y, p_y, dof_y, expected_y = chi2_contingency(contingency_table_y)

alpha = 0.1

if p_x < alpha:
    print("There is a significant correlation between the interval of X and the class label (reject H0)")
else:
    print("There is no significant correlation between the interval of X and the class label (fail to reject H0)")

print(f"Chi2 Statistic for X: {chi2_x}")
print(f"P-value for X: {p_x}")
print(f"Degrees of Freedom for X: {dof_x}")
print(f"Expected Frequencies for X: \n{expected_x}")
print(f"Bins for X: {bins_x}")

if p_y < alpha:
    print("There is a significant correlation between the interval of Y and the class label (reject H0)")
else:
    print("There is no significant correlation between the interval of Y and the class label (fail to reject H0)")

print(f"Chi2 Statistic for Y: {chi2_y}")
print(f"P-value for Y: {p_y}")
print(f"Degrees of Freedom for Y: {dof_y}")
print(f"Expected Frequencies for Y: \n{expected_y}")
print(f"Bins for Y: {bins_y}")

df

Number of bins : 4
There is no significant correlation between the interval of X and the class label (fail to reject H0)
Chi2 Statistic for X: 0.875
P-value for X: 0.831456304592444
Degrees of Freedom for X: 3
Expected Frequencies for X: 
[[0.57142857 0.42857143]
 [1.14285714 0.85714286]
 [1.14285714 0.85714286]
 [1.14285714 0.85714286]]
Bins for X: [-inf, np.int64(1), np.int64(9), np.int64(17), np.int64(24), inf]
There is no significant correlation between the interval of Y and the class label (fail to reject H0)
Chi2 Statistic for Y: 0.875
P-value for Y: 0.831456304592444
Degrees of Freedom for Y: 3
Expected Frequencies for Y: 
[[0.57142857 0.42857143]
 [1.14285714 0.85714286]
 [1.14285714 0.85714286]
 [1.14285714 0.85714286]]
Bins for Y: [-inf, np.int64(2), np.int64(14), np.int64(26), np.int64(36), inf]


Unnamed: 0,X,Y,C,X_binned,Y_binned
0,1,2,A,"(-inf, 1.0]","(-inf, 2.0]"
1,5,8,B,"(1.0, 9.0]","(2.0, 14.0]"
2,9,14,A,"(1.0, 9.0]","(2.0, 14.0]"
3,13,20,B,"(9.0, 17.0]","(14.0, 26.0]"
4,17,26,A,"(9.0, 17.0]","(14.0, 26.0]"
5,21,32,B,"(17.0, 24.0]","(26.0, 36.0]"
6,24,36,A,"(17.0, 24.0]","(26.0, 36.0]"
