In [2]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
df = pl.read_csv('../../Dementia/JanBDRcount.csv')
df = df.to_pandas()

In [4]:
# df.info
for col in df.columns:
    df[col].fillna(3, inplace=True)

X = df.drop(columns=['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE'])
y = df['PHENOTYPE']


In [5]:
assert X.isnull().sum().sum() == 0, "There are still missing values in X"
assert y.isnull().sum().sum() == 0, "There are still missing values in y"

In [6]:
print(X.isnull().sum().sum())

0


In [7]:
X.head(5)

Unnamed: 0,rs3131972_A,rs11240777_A,rs4970383_A,rs4475691_A,rs13302982_A,rs28391282_A,rs2341354_A,rs9777703_G,rs1891910_A,rs142743151_A,...,rs6009945_C,rs9616810_A,rs9616812_A,rs9616816_A,rs77452243_A,rs2341010_A,rs739365_A,rs6010063_G,rs10451_A,rs2285395_A
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
#Hyperparameter tuning for better prediction of features
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV 1/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.704 total time=   2.7s
[CV 2/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.702 total time=   2.1s
[CV 3/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.702 total time=   2.1s
[CV 1/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.704 total time=   2.9s
[CV 2/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.702 total time=   2.9s
[CV 3/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.702 total time=   3.2s
[CV 1/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.704 total time=   3.3s
[CV 2/3] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.702 total time=   3.5s
[C

In [15]:
# Best parameters with random forest classifier
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the model with best parameters
best_rf = RandomForestClassifier(**best_params , random_state=42)
best_rf.fit(X_train, y_train)

# Important Features
feature_importances = best_rf.feature_importances_

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [11]:
# Visualization

features = X.columns


In [12]:
importances = pd.DataFrame({'feature': features, 'importance': feature_importances})
importances = importances.sort_values(by='importance', ascending=False)


In [13]:
importances.head(30)

Unnamed: 0,feature,importance
279957,rs157580_G,0.002557
279972,rs429358_C,0.002176
95749,rs4865857_G,0.002115
86968,rs1018139_A,0.002058
118910,rs9345409_A,0.001997
35197,rs144492151_A,0.001926
176338,rs79796707_A,0.001749
265368,rs1872087_A,0.001702
220616,rs1887828_C,0.001667
241460,rs8036080_A,0.001618


In [14]:
data = importances
x=data['importance']
y=data['feature']
plt.figure(figsize=(10, 6))
sns.barplot(x=x, y=y)
plt.title('Feature Importances')
plt.show()

KeyboardInterrupt: 