In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest

import seaborn as sns

In [8]:
df_test=pd.read_csv("test_data.csv")
X=pd.read_csv("training_data.csv")
y=pd.read_csv("training_data_targets.csv", header=None)

print(X.shape)
print(y.shape)

(1058, 48)
(1058, 1)


In [7]:
missing_values_per_column = X.isnull().sum()
print(missing_values_per_column)

age                           0
gendera                       0
BMI                         190
hypertensive                  0
atrialfibrillation            0
CHD with no MI                0
diabetes                      0
deficiencyanemias             0
depression                    0
Hyperlipemia                  0
Renal failure                 0
COPD                          0
heart rate                   12
Systolic blood pressure      14
Diastolic blood pressure     14
Respiratory rate             12
temperature                  18
SP O2                        12
Urine output                 34
hematocrit                    0
RBC                           0
MCH                           0
MCHC                          0
MCV                           0
RDW                           0
Leucocyte                     0
Platelets                     0
Neutrophils                 122
Basophils                   216
Lymphocyte                  122
PT                           18
INR     

In [15]:
mean_imputer = SimpleImputer(strategy='mean')
X_mean_imp = pd.DataFrame(
    data=mean_imputer.fit_transform(X),
    columns=X.columns    
)

X_mean_imp.shape[0] - X_mean_imp.count()

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X_mean_imp, y.values.ravel(),
    test_size=0.2, random_state=42
)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [20, 50, 80],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Best Model: {accuracy:.2f}')

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy of the Best Model: 0.91


In [26]:
print(classification_report(y_test, best_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       191
           1       0.75      0.14      0.24        21

    accuracy                           0.91       212
   macro avg       0.83      0.57      0.60       212
weighted avg       0.90      0.91      0.88       212

