## Different types of corelations

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, pointbiserialr, f_oneway, chi2_contingency
from sklearn.metrics import mutual_info_score


In [13]:
# 1) NUMERIC → NUMERIC  (Pearson Correlation)
df_num_num = pd.DataFrame({
    'age': np.random.randint(20, 60, 20),
    'bmi': np.random.uniform(18, 35, 20)
})

corr_pearson, _ = pearsonr(df_num_num['age'], df_num_num['bmi'])
print("Pearson (Numeric → Numeric):", round(corr_pearson, 3))


Pearson (Numeric → Numeric): -0.002


In [14]:
# 2) BINARY CATEGORICAL → NUMERIC  (Point-Biserial)
df_bin_num = pd.DataFrame({
    'smoker': np.random.choice([0, 1], 20),    # binary categorical
    'charges': np.random.randint(2000, 15000, 20)
})

corr_pbs, _ = pointbiserialr(df_bin_num['smoker'], df_bin_num['charges'])
print("Point-Biserial (Binary Categorical → Numeric):", round(corr_pbs, 3))



Point-Biserial (Binary Categorical → Numeric): -0.035


In [15]:
# 3) MULTI-CATEGORICAL → NUMERIC (ANOVA)
df_multi_num = pd.DataFrame({
    'region': np.random.choice(['northeast', 'southeast', 'northwest', 'southwest'], 20),
    'charges': np.random.randint(2000, 15000, 20)
})

groups = [df_multi_num[df_multi_num['region'] == r]['charges'] for r in df_multi_num['region'].unique()]
f_stat, p_val = f_oneway(*groups)
print("ANOVA (Multi-Categorical → Numeric): F =", round(f_stat, 3), ", p =", round(p_val, 3))



ANOVA (Multi-Categorical → Numeric): F = 0.318 , p = 0.812


In [16]:
# 4) NUMERIC → CATEGORICAL (Mutual Information)
df_num_cat = pd.DataFrame({
    'bmi': np.random.uniform(18, 35, 20),
    'risk': np.random.choice(['low', 'medium', 'high'], 20)
})

mi = mutual_info_score(df_num_cat['bmi'].round(1), df_num_cat['risk'])
print("Mutual Information (Numeric → Categorical):", round(mi, 3))



Mutual Information (Numeric → Categorical): 1.055




In [17]:
# 5) CATEGORICAL → CATEGORICAL (Cramer's V)
df_cat_cat = pd.DataFrame({
    'sex': np.random.choice(['male', 'female'], 20),
    'smoker': np.random.choice(['yes', 'no'], 20)
})

cont_table = pd.crosstab(df_cat_cat['sex'], df_cat_cat['smoker'])
chi2, _, _, _ = chi2_contingency(cont_table)
n = cont_table.sum().sum()
phi2 = chi2/n
r, k = cont_table.shape
cramers_v = np.sqrt(phi2 / min(r-1, k-1))
print("Cramer's V (Categorical → Categorical):", round(cramers_v, 3))



Cramer's V (Categorical → Categorical): 0.0


In [18]:
# 6) MIXED (num + cat) → CATEGORICAL (Mutual Information)
df_mixed_cat = pd.DataFrame({
    'age': np.random.randint(18, 60, 20),
    'sex': np.random.choice(['male', 'female'], 20),
    'risk': np.random.choice(['low', 'high'], 20)
})

# Convert categorical to codes for MI
df_mixed_cat_enc = df_mixed_cat.copy()
df_mixed_cat_enc['sex'] = df_mixed_cat['sex'].astype('category').cat.codes
df_mixed_cat_enc['risk'] = df_mixed_cat['risk'].astype('category').cat.codes

mi_age = mutual_info_score(df_mixed_cat_enc['age'], df_mixed_cat_enc['risk'])
mi_sex = mutual_info_score(df_mixed_cat_enc['sex'], df_mixed_cat_enc['risk'])

print("Mutual Information (Mixed → Categorical):")
print(" age → risk:", round(mi_age, 3))
print(" sex → risk:", round(mi_sex, 3))

Mutual Information (Mixed → Categorical):
 age → risk: 0.578
 sex → risk: 0.001


In [None]:
"""
Pearson (Numeric → Numeric): 0.12

Point-Biserial (Binary Categorical → Numeric): 0.55

ANOVA (F, p): 1.23  0.31

Mutual Information (Numeric → Categorical): 0.18

Cramer's V: 0.22

Mutual Information (Mixed → Categorical):
 age → risk: 0.10
 sex → risk: 0.05
"""

"\nPearson (Numeric → Numeric): 0.12\nPoint-Biserial (Binary Categorical → Numeric): 0.55\nANOVA (F, p): 1.23  0.31\nMutual Information (Numeric → Categorical): 0.18\nCramer's V: 0.22\nMutual Information (Mixed → Categorical):\n age → risk: 0.10\n sex → risk: 0.05\n"