In [21]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score,make_scorer
from sklearn.feature_selection import SelectKBest

import seaborn as sns

In [10]:
# df_test=pd.read_csv("test_data.csv")
X=pd.read_csv("training_data.csv")
y=pd.read_csv("training_data_targets.csv", header=None)

print(X.shape)
print(y.shape)

(1058, 48)
(1058, 1)


In [13]:
# Missing values
missing_values_per_column = X.isnull().sum()
print(missing_values_per_column)

age                           0
gendera                       0
BMI                         190
hypertensive                  0
atrialfibrillation            0
CHD with no MI                0
diabetes                      0
deficiencyanemias             0
depression                    0
Hyperlipemia                  0
Renal failure                 0
COPD                          0
heart rate                   12
Systolic blood pressure      14
Diastolic blood pressure     14
Respiratory rate             12
temperature                  18
SP O2                        12
Urine output                 34
hematocrit                    0
RBC                           0
MCH                           0
MCHC                          0
MCV                           0
RDW                           0
Leucocyte                     0
Platelets                     0
Neutrophils                 122
Basophils                   216
Lymphocyte                  122
PT                           18
INR     

In [15]:
mean_imputer = SimpleImputer(strategy='mean')
X_mean_imp = pd.DataFrame(
    data=mean_imputer.fit_transform(X),
    columns=X.columns    
)

print((X_mean_imp.shape[0] - X_mean_imp.count()).tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
median_imputer = SimpleImputer(strategy='median')
X_median_imp = pd.DataFrame(
    data=median_imputer.fit_transform(X),
    columns=X.columns    
)

print((X_median_imp.shape[0] - X_median_imp.count()).tolist())
   



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [18]:
mode_imputer = SimpleImputer(strategy='most_frequent')
X_mode_imp = pd.DataFrame(
    data=mode_imputer.fit_transform(X),
    columns=X.columns    
)

print((X_mode_imp.shape[0] - X_mode_imp.count()).tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [28]:
knn_imputer = KNNImputer(n_neighbors=3)
X_knn_imp = pd.DataFrame(
    data=knn_imputer.fit_transform(X),
    columns=X.columns    
)


In [29]:
scorer=make_scorer(f1_score,average="weighted")

X_train, X_test, y_train, y_test = train_test_split(
    X_knn_imp, y.values.ravel(),
    test_size=0.3, random_state=42
)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [20, 50, 80],
    'max_depth': [None, 10, 20,40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4,9]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring=scorer)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
f1 = f1_score(y_test, y_pred)
print(f'Accuracy of the Best Model: {f1:.2f}')

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}
Accuracy of the Best Model: 0.28


In [22]:
scorer=make_scorer(f1_score,average="weighted")

X_train, X_test, y_train, y_test = train_test_split(
    X_mean_imp, y.values.ravel(),
    test_size=0.2, random_state=42
)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [20, 50, 80],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring=scorer)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
f1 = f1_score(y_test, y_pred)
print(f'Accuracy of the Best Model: {f1:.2f}')

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 20}
Accuracy of the Best Model: 0.23


In [23]:

X_train, X_test, y_train, y_test = train_test_split(
    X_median_imp, y.values.ravel(),
    test_size=0.2, random_state=42
)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [20, 50, 80],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring=scorer)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
f1 = f1_score(y_test, y_pred)
print(f'Accuracy of the Best Model: {f1:.2f}')

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 20}
Accuracy of the Best Model: 0.22


In [24]:

X_train, X_test, y_train, y_test = train_test_split(
    X_mode_imp, y.values.ravel(),
    test_size=0.2, random_state=42
)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [20, 50, 80],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring=scorer)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
f1 = f1_score(y_test, y_pred)
print(f'Accuracy of the Best Model: {f1:.2f}')

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Accuracy of the Best Model: 0.25


In [None]:
#scaling


In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X_mode_imp, y.values.ravel(),
    test_size=0.2, random_state=42
)

# Define the Logistic Regression model
logreg = LogisticRegression(max_iter=5, random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Create a custom scoring function for F1 score
scorer = make_scorer(f1_score, average='weighted')

# Create the GridSearchCV object with Logistic Regression and F1 score as the scoring metric
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring=scorer)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model using F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Weighted F1 Score of the Best Model: {f1:.2f}')





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {'C': 0.001, 'penalty': 'l2'}
Weighted F1 Score of the Best Model: 0.85


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/iiserb/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/iiserb/.local/lib/pyt