In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
# Load the raw dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/in-hospital-mortality-prediction/data01.csv'
df = pd.read_csv(DATA_PATH)

## 1.Data overview

In [None]:
# Summarise each column's dtype and non-null count to spot fields with gaps.
df.info()

In [None]:
# Bar chart of how many records fall into each value of the target column.
sns.countplot(x="outcome", data=df)

In [None]:
# Number of missing entries per column (isna is the canonical name of isnull).
df.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask: rows are records, columns are fields.
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='YlOrRd', cbar=False)

## 2.Data preprocessing

In [None]:
from pandas import DataFrame
from sklearn.impute import SimpleImputer

# The label itself must never be imputed: averaging a 0/1 outcome yields a
# fractional value that is not a real class. Rows with no recorded outcome are
# therefore dropped up front, and only the remaining columns are mean-imputed.
labelled = df.dropna(subset=['outcome']).reset_index(drop=True)
feature_cols = labelled.columns.drop('outcome')

# NOTE(review): the imputer is still fitted on all rows before the train/test
# split, so statistics from the evaluation rows leak into training; consider
# fitting it on the training partition only (e.g. inside a sklearn Pipeline).
si = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed = DataFrame(si.fit_transform(labelled[feature_cols]),
                    columns=feature_cols,
                    index=labelled.index)

# Re-attach the untouched label and restore the original column order.
data = imputed.assign(outcome=labelled['outcome']).reindex(columns=df.columns)

In [None]:
# Per-column missing count after imputation.
data.isna().sum()

In [None]:
# Frequency of each distinct value in the target column.
data.outcome.value_counts()

In [None]:
# Remove rows whose outcome lies strictly between 0 and 1.
is_fractional = (data.outcome > 0) & (data.outcome < 1)
data.drop(index=data.loc[is_fractional].index, inplace=True)

## 3.Correlation analysis

In [None]:
# Correlation of every column with the target, sorted from lowest to highest.
# The target's correlation with itself is always 1 and would only compress the
# scale of the chart, so it is left out.
target_corr = (
    data.corr()['outcome']
    .drop('outcome')
    .sort_values(ascending=True)
    .to_frame()
)

fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(target_corr.index, target_corr['outcome'], color="#FF9912")
ax.set_title('Correlation with outcome')
plt.show()

## 4.Exploratory Data Analysis

In [None]:
# Record counts per `hypertensive` value, split by outcome.
sns.countplot(x='hypertensive', hue='outcome', data=data)

In [None]:
# Age distribution, coloured by outcome.
sns.histplot(x='age', hue='outcome', palette='Paired_r', data=data)

## 5.Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix; `group` is excluded from the predictors
# together with the label itself.
y = data.outcome
X = data.drop(columns=['outcome', 'group'])

# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions, in line with the stratified grid
# search used for hyperparameter tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)

## 6.Model selection

In [None]:
from catboost import CatBoostClassifier

# No categorical feature columns are declared.
cat_feature = None

# Base estimator that the hyperparameter search is run on.
cb_settings = dict(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    cat_features=cat_feature,
    use_best_model=False,
)
cb = CatBoostClassifier(**cb_settings)

## 7.Hyperparameter optimization

In [None]:
# Candidate values explored for each hyperparameter.
params = {'depth': [4, 7, 10],
          'learning_rate': [0.03, 0.1, 0.15],
          'l2_leaf_reg': [1, 3, 5, 7, 9],
          'iterations': [300, 500]}

# Search on the training partition only. Passing the full X / y would let the
# held-out test rows influence hyperparameter selection, which makes the test
# metrics reported later optimistically biased.
grid_search_result = cb.grid_search(params,
                                    X=X_train,
                                    y=y_train,
                                    cv=3,
                                    partition_random_seed=0,
                                    calc_cv_statistics=True,
                                    search_by_train_test_split=True,
                                    shuffle=True,
                                    train_size=0.8,
                                    verbose=False,
                                    stratified=True,
                                    plot=True)

In [None]:
# Show the hyperparameter combination stored under "params" in the search result.
best_params = grid_search_result.get("params")
best_params

## 8.Model validation

In [None]:
# Train the final model with the hyperparameters selected by the grid search
# rather than hand-copied literals, so the model cannot drift out of sync with
# the search when the data or the parameter grid changes.
best_params = grid_search_result["params"]
clf = CatBoostClassifier(eval_metric='Accuracy',
                         loss_function='MultiClass',
                         cat_features=None,
                         **best_params)
clf.fit(X_train, y_train, silent=True)

# Class predictions for the held-out rows.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision/recall/F1 followed by the confusion matrix on the test split.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(report)
print('\n')
print(matrix)

## 9.Model Feature Importances

In [None]:
# Order features by ascending importance so the largest bar ends up at the top
# of the horizontal bar chart.
importances = clf.feature_importances_
idxs = np.argsort(importances)
positions = np.arange(len(idxs))
feature_labels = X.columns[idxs].tolist()

plt.figure(figsize=(10, 15))
plt.barh(positions, importances[idxs], align='center')
plt.yticks(positions, feature_labels)
plt.title('Feature Importances')
plt.xlabel('Model Feature Importances')
plt.show()