In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported below.
warnings.filterwarnings('ignore')

# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn: preprocessing, model selection, metrics, pipelines and estimators.
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,log_loss,confusion_matrix,auc,roc_curve, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.svm import SVC

# Show up to 150 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',150)

In [None]:
# Load the supplementary "greeks" metadata table and display it.
greeks = pd.read_csv(
    '/kaggle/input/icr-identify-age-related-conditions/greeks.csv'
)
greeks

In [None]:
# Distinct values present in the Alpha column.
greeks['Alpha'].unique()

In [None]:
# Pie chart of how often each Alpha category occurs in the greeks table.
alpha_counts = greeks.Alpha.value_counts()  # computed once; supplies both wedge sizes and labels
palette_color = sns.color_palette('Paired')
plt.pie(alpha_counts, labels=alpha_counts.index, colors=palette_color, autopct='%.0f%%')
plt.title('Pie Chart of Age-Related Diseases')
plt.show()

In [None]:
# Load the training table and preview its first rows.
train = pd.read_csv(
    '/kaggle/input/icr-identify-age-related-conditions/train.csv'
)
train.head()

In [None]:
# Print a concise summary of the training frame (pandas DataFrame.info output).
train.info()

In [None]:
# Per-column missing-value summary: percentage of rows missing, absolute
# count and dtype, sorted by count (largest first).
null_counts = train.isna().sum()
null_table = pd.DataFrame({
    'Null Columns': round((null_counts / len(train)) * 100, 3),
    'Null Count': null_counts,
    'Dtype': train.dtypes,
}).sort_values('Null Count', ascending=False)
# Show only the columns that actually contain missing values.
null_table[null_table['Null Columns'] > 0]

In [None]:
# Plot histograms (with a KDE overlay) to visualise the distribution of
# twelve columns, train.columns[1:13], on a 4x3 grid.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(12, 8))
axes = axes.flatten()
for i, column in enumerate(train.columns[1:13]):
    # sns.distplot is deprecated in seaborn; histplot with stat='density'
    # draws the same density-scaled histogram plus KDE curve.
    sns.histplot(train[column], kde=True, stat='density', ax=axes[i])
plt.tight_layout()
plt.show()

# Data Preparation

In [None]:
# Column-level preparation: drop the 'Id' column and recode the 'EJ' column
# ('A' -> 1, 'B' -> 0); every other column is passed through unchanged.
col_transformer1 = ColumnTransformer(
    [
        ('drop_id', 'drop', 'Id'),
        (
            'mapping_col_EJ',
            FunctionTransformer(lambda x: x.replace({'A': 1, 'B': 0})),
            ['EJ'],
        ),
    ],
    remainder='passthrough',
)

# Fill missing values with the column median, then standardise each column.
impute_scale = Pipeline([
    ('using_simple_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Complete preprocessing chain: column handling followed by impute + scale.
main_pipe = Pipeline([
    ('columns transform', col_transformer1),
    ('imputing_and_scaling', impute_scale),
])

main_pipe

In [None]:
# Features are every column except the last; the target is the last column.
X_raw = train.iloc[:, :-1]
y = train.iloc[:, -1]

# Split BEFORE fitting the preprocessing pipeline so that the imputer medians
# and scaler statistics are learned from the training rows only. Fitting on
# all rows first would leak held-out information into the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, random_state=45)
X_train = main_pipe.fit_transform(X_train_raw)
X_test = main_pipe.transform(X_test_raw)

# Transformed version of the full feature set, kept under the name `X` for
# cells that work on all rows (e.g. cross-validation).
X = main_pipe.transform(X_raw)

# Model Preparation

* LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (newton-cg solver) on the training split and
# print the classification report for the held-out split.
logreg = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg_report = classification_report(y_test, y_pred)
print('classification report:\n ', logreg_report)

In [None]:
# Predicted class probabilities on the held-out split and the resulting log loss.
y_pred_prob = logreg.predict_proba(X_test)
logloss = log_loss(y_test, y_pred_prob)
# Model name spelled correctly in the printed label.
print(f"Log Loss of LOGISTIC REGRESSION: {logloss:.4f}")


In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out
# split, drawn with the class labels learned by the model.
confuse_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    display_labels=logreg.classes_,
    confusion_matrix=confuse_matrix,
)
disp.plot()
plt.show()

In [None]:
# 5-fold shuffled cross-validation of the logistic regression on all rows.
kfold = KFold(n_splits=5, shuffle=True, random_state=44)
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='neg_log_loss', cv=kfold)
# The scorer returns the negated log loss, so flip the sign of the mean.
mean_log_loss = -np.mean(scores)
print(mean_log_loss)

In [None]:
# ROC curve of the logistic regression, using the predicted probability of
# the second class (column 1) as the score.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.01])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc='lower right')
plt.show()

* GRADIENT BOOSTING CLASSIFIER

In [None]:
# Gradient boosting classifier with a small learning rate, evaluated on the
# held-out split with a classification report and log loss.
grad_boost = GradientBoostingClassifier(learning_rate=0.01)
grad_boost.fit(X_train,y_train)
y_pred_gradboost = grad_boost.predict(X_test)
y_pred_prob_grad = grad_boost.predict_proba(X_test)
print('Classification report of Gradient Boost Classifier:\n', classification_report(y_test,y_pred_gradboost))
# The label names the model actually fitted above (the original said "ADABOOST").
print('\n\n LogLoss of Gradient Boost Classifier: ', round(log_loss(y_test,y_pred_prob_grad),4))

* RANDOM FOREST CLASSIFIER

In [None]:
# Random forest with default hyper-parameters. The seed is fixed because
# forest_clf is the model used to predict on test.csv; without it every run
# would produce different trees and different probabilities.
forest_clf = RandomForestClassifier(random_state=45).fit(X_train,y_train)
y_pred_forest = forest_clf.predict(X_test)
y_pred_prob_forest = forest_clf.predict_proba(X_test)
print('Classification Report of Random Forest Classifier: \n', classification_report(y_test,y_pred_forest))
print('\n\n Log loss of Random Forest Classifier:  ', round(log_loss(y_test,y_pred_prob_forest),4))

In [None]:
# Display the hyper-parameters the fitted random forest is configured with.
forest_clf.get_params()

* SUPPORT VECTOR CLASSIFIER WITH GRIDSEARCHCV


In [None]:

# Hyper-parameter search space for the SVC, split per kernel so that
# parameters a kernel ignores are not swept: `degree` only affects 'poly'
# and `gamma` has no effect on 'linear'. This evaluates 144 candidates
# instead of 400 without dropping any distinct model.
c_values = [0.001, 0.01, 0.1, 1]
gamma_values = [0.01, 0.1, 1, 'scale', 'auto']
parameters = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['poly'], 'C': c_values, 'degree': [1, 2, 3, 4, 5], 'gamma': gamma_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
]
# probability=True is required so the selected estimator exposes predict_proba.
svc = SVC(probability=True, decision_function_shape='ovo')
# 5-fold grid search on the training split.
gscv = GridSearchCV(svc, param_grid=parameters, cv=5).fit(X_train, y_train)


In [None]:
# Pull the winning parameter set and the refitted model out of the search,
# then display the model.
best_params, best_estimator = gscv.best_params_, gscv.best_estimator_
best_estimator

In [None]:
# Evaluate the tuned SVC on the held-out split: class predictions for the
# report, class probabilities for the log loss.
y_pred_svc = best_estimator.predict(X_test)
y_pred_prob_svc = best_estimator.predict_proba(X_test)

svc_report = classification_report(y_test, y_pred_svc)
print('Classification Report for SVC: \n', svc_report)

svc_logloss = round(log_loss(y_test, y_pred_prob_svc), 4)
print('\n\n Logloss for this model: ', svc_logloss)

In [None]:
# Load the competition test set.
hut  = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
# Apply the already-fitted preprocessing pipeline (transform only, no refit).
hut2 = main_pipe.transform(hut)
# Class probabilities from the random forest; one column per class.
hut_pred = forest_clf.predict_proba(hut2)

In [None]:
# Assemble the submission table: the test Ids plus the predicted probability
# of each class, then display it.
sub = pd.DataFrame({
    'Id': hut['Id'],
    'class_0': hut_pred[:, 0],
    'class_1': hut_pred[:, 1],
})
sub

In [None]:
# Write the submission to the working directory without the DataFrame index column.
sub.to_csv('submission.csv', index=False)