In [16]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import chi2_contingency
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Parse the CSV with polars, then convert the result to a pandas DataFrame
# in the same expression so `df` is only ever bound to the pandas object.
df = pl.read_csv('../../Dementia/JanBDRcount.csv').to_pandas()

In [27]:
# Replace every missing value in the frame with the sentinel 3 in one
# vectorised call.
#
# Why not `df[col].fillna(3, inplace=True)` per column: that is chained
# assignment through an inplace method. pandas 2.2 emits a FutureWarning for
# it, and with Copy-on-Write (the default from pandas 3.0) the intermediate
# `df[col]` always behaves as a copy, so `df` itself is never updated.
# A single DataFrame-level fill also avoids a Python loop over every column.
#
# NOTE(review): the fill covers all columns, so missing PHENOTYPE values
# also become 3 — confirm that is intended for the target.
df = df.fillna(3)

# Feature matrix: everything except the identifier / pedigree / sex columns
# and the target itself.
X = df.drop(columns=['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE'])
# Target vector.
y = df['PHENOTYPE']

In [10]:
# Fail fast if the fill step left any missing value in the features or target.
assert not X.isna().to_numpy().any(), "There are still missing values in X"
assert not y.isna().to_numpy().any(), "There are still missing values in y"

In [11]:
# Total number of missing cells across all feature columns.
remaining_missing = X.isna().sum().sum()
print(remaining_missing)

0


In [33]:
# Score every column of X against y with sklearn's chi2.
chi2_stat, p_values = chi2(X, y)

# Tabulate the per-feature p-values, smallest (most significant) first.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'P_Value': p_values})
    .sort_values('P_Value')
)
print(feature_importance)

              Feature       P_Value
263752   rs16957876_0  1.803884e-26
9343    rs138982442_A  3.303062e-26
40099    rs12614416_G  6.409356e-25
60484     rs1492865_G  5.260565e-24
203930   rs79640820_C  5.813961e-24
...               ...           ...
134766     rs629458_0  1.000000e+00
167945   rs12339683_0  1.000000e+00
55114   rs114658092_0  1.000000e+00
78829     rs4693608_G  1.000000e+00
150362       rs6768_0  1.000000e+00

[297678 rows x 2 columns]


In [32]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def chi_square_selector(X, y, alpha=0.05):
    """Select features whose chi-square test of independence against ``y`` is significant.

    Each column of ``X`` is cross-tabulated against the target and the
    resulting contingency table is passed to ``scipy.stats.chi2_contingency``.
    A column is selected when its p-value is less than or equal to ``alpha``.

    Parameters
    ----------
    X : pandas.DataFrame
        One column per feature. The distinct values of a column form the rows
        of that feature's contingency table.
    y : array-like
        Target labels, one per row of ``X``. Labels of any sortable type are
        mapped to integer codes with ``np.unique`` before tabulation.
    alpha : float, default 0.05
        Inclusive p-value threshold for selection.

    Returns
    -------
    selected_features : pandas.Index
        Column labels of ``X`` whose p-value is ``<= alpha``.
    chi2_stat : numpy.ndarray
        Chi-square statistic per column, in ``X.columns`` order.
    p_values : numpy.ndarray
        P-value per column, in ``X.columns`` order.

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` / ``pd.crosstab``, e.g. when the
        contingency table is empty.
    """
    # Encode the target once, outside the loop; only the integer codes are needed.
    _, y_codes = np.unique(y, return_inverse=True)

    statistics = []
    p_vals = []
    for feature in X.columns:
        table = pd.crosstab(X[feature], y_codes)
        # The local is deliberately not called `chi2`, so it cannot be confused
        # with sklearn's `chi2` imported at notebook level.
        statistic, p_value, _, _ = chi2_contingency(table)
        statistics.append(statistic)
        p_vals.append(p_value)

    # Explicit float dtype keeps the arrays numeric even when X has no columns.
    chi2_stat = np.asarray(statistics, dtype=float)
    p_values = np.asarray(p_vals, dtype=float)

    # Boolean mask over the column index; the comparison is inclusive.
    selected_features = X.columns[p_values <= alpha]

    return selected_features, chi2_stat, p_values

# Run the contingency-table selector over every column of X.
selected_features, chi2_stat, p_values = chi_square_selector(X, y)

# Tabulate the per-feature p-values, smallest (most significant) first.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'P_Value': p_values})
    .sort_values('P_Value')
)
print(feature_importance)


              Feature       P_Value
279960     rs157582_A  2.767274e-18
279955   rs34342646_A  1.291443e-16
279958   rs34404554_C  1.690866e-16
279975   rs12721046_A  5.233055e-16
279956   rs71352238_G  5.923158e-16
...               ...           ...
250712    rs2966176_0  1.000000e+00
43074    rs11677176_0  1.000000e+00
9165    rs150856593_0  1.000000e+00
6341    rs116672238_0  1.000000e+00
1015    rs145264193_0  1.000000e+00

[297678 rows x 2 columns]
