In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence all warnings so cell outputs stay readable (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Show up to 40 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',40)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, OrdinalEncoder,RobustScaler,PowerTransformer,MinMaxScaler, LabelEncoder,label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score,confusion_matrix, roc_curve, auc,roc_auc_score
from itertools import cycle

In [None]:
# Load the competition train/test CSV files, using the `id` column as the index.
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv',index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv', index_col='id')

# Display the training frame as the cell output.
train

In [None]:
# Summarise the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Per-column missing-value summary (count, percentage, dtype), restricted to
# columns that actually contain nulls and sorted with the worst offenders first.
missing_counts = train.isna().sum()
null_df = pd.DataFrame(
    {
        'null count': missing_counts,
        'null pct': (missing_counts / len(train)) * 100,
        'dtype': train.dtypes,
    }
)
null_df = null_df.sort_values('null count', ascending=False)
null_df = null_df.loc[null_df['null count'] > 0]
null_df

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# Bar chart of how often each `rectal_exam_feces` category occurs.
feces_counts = train.rectal_exam_feces.value_counts()
sns.barplot(x=feces_counts.index, y=feces_counts)
plt.show()

In [None]:
# Split the training frame by dtype: float/int columns vs. object (text) columns.
# Despite the names, both variables hold DataFrames, not lists of column names.
num_cols = train.select_dtypes(include=['float', 'int'])
cat_cols = train.select_dtypes(include='object')
cat_cols

In [None]:
# Print the frequency table of every object-dtype column.
for column_name, column in cat_cols.items():
    print('Name of Columns: ', column_name)
    print('value counts: ', column.value_counts())
    print('\n')

In [None]:
# Side-by-side box plot and histogram (with KDE overlay) of `lesion_1`.
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(16, 5))
lesion_values = train.lesion_1
sns.boxplot(lesion_values, ax=box_ax)
sns.histplot(lesion_values, kde=True, ax=hist_ax)
fig.tight_layout()
plt.show()

In [None]:
# Display the numeric subset of the training frame.
num_cols

In [None]:
# Density histogram + KDE for six continuous measurements, laid out on a 3x2 grid
# (filled row by row, in the order listed below).
dist_columns = [
    'rectal_temp',
    'pulse',
    'respiratory_rate',
    'nasogastric_reflux_ph',
    'packed_cell_volume',
    'total_protein',
]
fig, ax = plt.subplots(3, 2, figsize=(18, 15))
for axis, column in zip(ax.ravel(), dist_columns):
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # (KDE extended by cut=3) is its documented replacement.
    sns.histplot(num_cols[column], bins=50, kde=True, stat='density',
                 kde_kws={'cut': 3}, ax=axis)
plt.show()

## Data preparation

In [None]:
# Category orderings handed to OrdinalEncoder inside dataframe_preprocess.
# Each column's list fixes the encoding: a value's position in its list is the
# integer code it is mapped to (first entry -> 0).
# The encoder is built without a handle_unknown setting, so presumably any value
# missing from these lists makes encoding fail -- keep the lists in sync with the data.
categorical_order = {
    'age' : ['young', 'adult'],
    'temp_of_extremities' : ['cold','cool','normal','warm'],
    'peripheral_pulse' : ['absent','reduced','normal','increased'],
    'mucous_membrane' : ['dark_cyanotic','bright_red','pale_cyanotic','pale_pink','bright_pink','normal_pink'],
    'capillary_refill_time' : ['more_3_sec','3','less_3_sec'],
    'pain' : ['alert','depressed','mild_pain','severe_pain','extreme_pain'],
    # NOTE(review): 'hypomotile' is placed after 'hypermotile' here -- confirm this is the intended order.
    'peristalsis' : ['absent','normal','hypermotile', 'hypomotile'],
    'abdominal_distention' : ['none','slight','moderate','severe'],
    'nasogastric_tube' : ['none','slight','significant'],
    'nasogastric_reflux' : ['none', 'less_1_liter', 'more_1_liter'],
    'rectal_exam_feces' :['absent','normal','decreased','increased'],
    'abdomen' : ['normal','other','firm','distend_small','distend_large'],
    'abdomo_appearance' : ['clear','cloudy','serosanguious']
}

def dataframe_preprocess(dataframe):
    """Convert a raw feature frame into a purely numeric array for modelling.

    The input is copied, so the caller's frame is not modified. Steps:

    1. Fill missing values in object-dtype columns with that column's mode.
    2. Fold stray category spellings into known categories and map the yes/no
       columns (``surgery``, ``surgical_lesion``, ``cp_data``) to 1/0.
    3. Ordinal-encode the columns listed in the module-level
       ``categorical_order`` using the orders given there.
    4. Robust-scale every column that was float/int on entry.

    NOTE(review): the modes, the encoder and the scaler are re-fitted on
    whatever frame is passed in, so calling this separately on the train and
    test frames scales each with its own statistics. Confirm this is intended.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column named in ``categorical_order`` and in the
        ``replacements`` mapping below.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the processed frame (same row and column order).
    """
    df = dataframe.copy()

    # Captured before any encoding so only the originally numeric columns get scaled.
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns

    # Impute missing categorical values with the most frequent value.
    for col in df.select_dtypes('object').columns:
        df[col] = df[col].fillna(df[col].mode()[0])

    replacements = {
        'pain' : {'slight' :'alert', 'moderate' : 'alert'},
        'peristalsis' : {'distend_small': 'absent'},
        'nasogastric_reflux' : {'slight': 'none'},
        'rectal_exam_feces' : {'serosanguious' : 'absent'},
        'surgery' : {'yes':1, 'no': 0},
        'surgical_lesion' : {'yes':1, 'no': 0},
        'cp_data' : {'yes':1, 'no': 0}
    }

    for col, replacement_dict in replacements.items():
        # infer_objects() makes the yes/no -> 1/0 columns integer dtype explicitly
        # rather than relying on replace() silently downcasting object results
        # (deprecated in pandas 2.2); string columns are left as they are.
        df[col] = df[col].replace(replacement_dict).infer_objects()

    # Ordinal encoding with the explicit category orders.
    ordinal_cols = list(categorical_order)
    ordered_encoder = OrdinalEncoder(
        categories=[categorical_order[col] for col in ordinal_cols]
    )
    df[ordinal_cols] = ordered_encoder.fit_transform(df[ordinal_cols])

    # RobustScaler centres and scales each column independently, so one call over
    # all numeric columns matches fitting a scaler per column.
    if len(numeric_columns) > 0:
        df[numeric_columns] = RobustScaler().fit_transform(df[numeric_columns])

    return np.array(df)

# Expose the preprocessing function through the scikit-learn Pipeline API.
preprocess_step = FunctionTransformer(dataframe_preprocess)
main_pipe = Pipeline(steps=[('preprocessor', preprocess_step)])

main_pipe

In [None]:
# Encoder for the target labels; it is fitted on the training target and reused
# to turn predicted class ids back into labels for the submission.
output_encoder = LabelEncoder()

In [None]:
# Separate the predictors from the target, which is the last column of `train`.
features = train.iloc[:, :-1]
target = train.iloc[:, -1]

# Preprocess the predictors and encode the target labels as integer class ids
# (the one-hot variant via to_categorical is deliberately not used).
X = main_pipe.fit_transform(features)
y = output_encoder.fit_transform(target)

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
inputDim = X_train.shape[1]

## Model preparation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from tensorflow_decision_forests.keras import RandomForestModel,GradientBoostedTreesModel

In [None]:
# Train a TensorFlow Decision Forests gradient-boosted trees model, constructed with no arguments.
gbtm_clf = GradientBoostedTreesModel()
gbtm_clf.fit(X_train,y_train)

In [None]:
# Per-class prediction scores for the hold-out split; argmax picks the predicted class id.
gbm_pred = gbtm_clf.predict(X_test)
gbm_pred_labels = np.argmax(gbm_pred, axis=1)

# Score the integer labels directly. One-hot encoding each side separately with
# to_categorical yields arrays of different widths whenever the highest class is
# never predicted (or is absent from y_test), which makes f1_score raise.
print('F1 Score of model prediction : ', f1_score(y_test, gbm_pred_labels, average='macro'))

In [None]:
# The number of classes is the width of the per-class prediction matrix.
n_classes = gbm_pred.shape[1]
class_ids = np.arange(n_classes)

# Pin the label set so the matrix is always n_classes x n_classes, even when a
# class is missing from both y_test and the predictions; the plotting code
# indexes cm over range(n_classes).
cm = confusion_matrix(y_test, np.argmax(gbm_pred, axis=1), labels=class_ids)

# Compute the one-vs-rest ROC curve and ROC AUC for each class
y_true_binary = label_binarize(y_test, classes=class_ids)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true_binary[:, i], gbm_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into a single ROC curve
fpr["micro"], tpr["micro"], _ = roc_curve(y_true_binary.ravel(), gbm_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Draw the confusion matrix as a heat map
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('True')
tick_marks = np.arange(n_classes)
plt.xticks(tick_marks, tick_marks, rotation=45)
plt.yticks(tick_marks, tick_marks)

# Write the count into every cell, using white text on the darker cells
text_threshold = cm.max() / 2
for i, j in np.ndindex(n_classes, n_classes):
    text_color = "white" if cm[i, j] > text_threshold else "black"
    plt.text(j, i, format(cm[i, j], 'd'), horizontalalignment="center", color=text_color)

# Plot one ROC curve per class, cycling through a fixed colour palette
plt.figure(figsize=(10, 6))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i in range(n_classes):
    color = next(colors)
    class_label = 'Class {0} (ROC AUC = {1:.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=class_label)

# Overlay the micro-average ROC curve as a dashed line
micro_label = 'Micro-average (ROC AUC = {0:.2f})'.format(roc_auc["micro"])
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', linestyle='--', lw=2, label=micro_label)

plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.02])
plt.legend(loc='lower right')
plt.show()


## Submission

In [None]:
def submit():
    """Predict outcomes for the test set and save them to ``submit.csv``.

    Uses the notebook-level ``test`` frame, ``gbtm_clf`` model and
    ``output_encoder``: the test features are preprocessed with
    ``dataframe_preprocess``, the highest-scoring class per row is decoded
    back to its original label, and the ``id``/``outcome`` table is written
    to ``submit.csv`` without the DataFrame index.

    Returns
    -------
    None
    """
    test_features = dataframe_preprocess(test)
    class_scores = gbtm_clf.predict(test_features)
    predicted_codes = np.argmax(class_scores, axis=1)

    submit_df = pd.DataFrame(
        {
            'id': test.index,
            'outcome': output_encoder.inverse_transform(predicted_codes),
        }
    )

    submit_df.to_csv('submit.csv', index=False)
    print('Work Done........Csv Saved')

In [None]:
# Generate the test-set predictions and write the submission CSV.
submit()