## Different types of corelations

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, pointbiserialr, f_oneway, chi2_contingency
from sklearn.metrics import mutual_info_score


In [13]:
# 1) NUMERIC → NUMERIC  (Pearson Correlation)
# REMEMBER BOTH corr() and pearsonr() ARE BOTH DIFFERENT, DONT GET CONFUSED
df_num_num = pd.DataFrame({
    'age': np.random.randint(20, 60, 20),
    'bmi': np.random.uniform(18, 35, 20)
})

corr_pearson, _ = pearsonr(df_num_num['age'], df_num_num['bmi'])
print("Pearson (Numeric → Numeric):", round(corr_pearson, 3))


Pearson (Numeric → Numeric): 0.378


In [None]:
# suppose i have more than two columns columns in df_num_num

# Example numeric dataframe
df_num_num = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [2, 4, 6, 8, 10],
    'C': [5, 3, 2, 4, 1]
})

# Create an empty correlation matrix
corr_matrix = pd.DataFrame(index=df_num_num.columns, columns=df_num_num.columns)

# Compute pairwise correlations
for i in df_num_num.columns:
    for j in df_num_num.columns:
        corr, _ = pearsonr(df_num_num[i], df_num_num[j])
        corr_matrix.loc[i, j] = corr

corr_matrix

Unnamed: 0,A,B,C
A,1.0,1.0,-0.7
B,1.0,1.0,-0.7
C,-0.7,-0.7,1.0


In [4]:
# 2) BINARY CATEGORICAL → NUMERIC  (Point-Biserial)
df_bin_num = pd.DataFrame({
    'smoker': np.random.choice([0, 1], 20),    # binary categorical
    'charges': np.random.randint(2000, 15000, 20)
})

corr_pbs, _ = pointbiserialr(df_bin_num['smoker'], df_bin_num['charges'])
print("Point-Biserial (Binary Categorical → Numeric):", round(corr_pbs, 3))



Point-Biserial (Binary Categorical → Numeric): -0.349


In [5]:
# 3) MULTI-CATEGORICAL → NUMERIC (ANOVA)
df_multi_num = pd.DataFrame({
    'region': np.random.choice(['northeast', 'southeast', 'northwest', 'southwest'], 20),
    'charges': np.random.randint(2000, 15000, 20)
})

groups = [df_multi_num[df_multi_num['region'] == r]['charges'] for r in df_multi_num['region'].unique()]
f_stat, p_val = f_oneway(*groups)
print("ANOVA (Multi-Categorical → Numeric): F =", round(f_stat, 3), ", p =", round(p_val, 3))



ANOVA (Multi-Categorical → Numeric): F = 2.162 , p = 0.132


In [6]:
# 4) NUMERIC → CATEGORICAL (Mutual Information)
df_num_cat = pd.DataFrame({
    'bmi': np.random.uniform(18, 35, 20),
    'risk': np.random.choice(['low', 'medium', 'high'], 20)
})

mi = mutual_info_score(df_num_cat['bmi'].round(1), df_num_cat['risk'])
print("Mutual Information (Numeric → Categorical):", round(mi, 3))



Mutual Information (Numeric → Categorical): 1.081




In [7]:
# 5) CATEGORICAL → CATEGORICAL (Cramer's V)
df_cat_cat = pd.DataFrame({
    'sex': np.random.choice(['male', 'female'], 20),
    'smoker': np.random.choice(['yes', 'no'], 20)
})

cont_table = pd.crosstab(df_cat_cat['sex'], df_cat_cat['smoker'])
chi2, _, _, _ = chi2_contingency(cont_table)
n = cont_table.sum().sum()
phi2 = chi2/n
r, k = cont_table.shape
cramers_v = np.sqrt(phi2 / min(r-1, k-1))
print("Cramer's V (Categorical → Categorical):", round(cramers_v, 3))



Cramer's V (Categorical → Categorical): 0.201


In [8]:
# 6) MIXED (num + cat) → CATEGORICAL (Mutual Information)
df_mixed_cat = pd.DataFrame({
    'age': np.random.randint(18, 60, 20),
    'sex': np.random.choice(['male', 'female'], 20),
    'risk': np.random.choice(['low', 'high'], 20)
})

# Convert categorical to codes for MI
df_mixed_cat_enc = df_mixed_cat.copy()
df_mixed_cat_enc['sex'] = df_mixed_cat['sex'].astype('category').cat.codes
df_mixed_cat_enc['risk'] = df_mixed_cat['risk'].astype('category').cat.codes

mi_age = mutual_info_score(df_mixed_cat_enc['age'], df_mixed_cat_enc['risk'])
mi_sex = mutual_info_score(df_mixed_cat_enc['sex'], df_mixed_cat_enc['risk'])

print("Mutual Information (Mixed → Categorical):")
print(" age → risk:", round(mi_age, 3))
print(" sex → risk:", round(mi_sex, 3))

Mutual Information (Mixed → Categorical):
 age → risk: 0.624
 sex → risk: 0.046


In [9]:
"""
Pearson (Numeric → Numeric): 0.12

Point-Biserial (Binary Categorical → Numeric): 0.55

ANOVA (F, p): 1.23  0.31

Mutual Information (Numeric → Categorical): 0.18

Cramer's V: 0.22

Mutual Information (Mixed → Categorical):
 age → risk: 0.10
 sex → risk: 0.05
"""

"\nPearson (Numeric → Numeric): 0.12\n\nPoint-Biserial (Binary Categorical → Numeric): 0.55\n\nANOVA (F, p): 1.23  0.31\n\nMutual Information (Numeric → Categorical): 0.18\n\nCramer's V: 0.22\n\nMutual Information (Mixed → Categorical):\n age → risk: 0.10\n sex → risk: 0.05\n"

In [10]:
""" 
SIMPLE CODE FOR IMPUTATION

from sklearn.impute import SimpleImputer
import numpy as np

imp = SimpleImputer(strategy='mean')
df['Age'] = imp.fit_transform(df[['Age']])


OTHER WAY 
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())




"""

" \nSIMPLE CODE FOR IMPUTATION\n\nfrom sklearn.impute import SimpleImputer\nimport numpy as np\n\nimp = SimpleImputer(strategy='mean')\ndf['Age'] = imp.fit_transform(df[['Age']])\n\n\nOTHER WAY \ndf['column_name'] = df['column_name'].fillna(df['column_name'].mean())\n\n\n\n\n"