# HEALTH CARE ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
%matplotlib inline

import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from scipy import stats
from scipy.stats import norm, skew 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
import catboost
from catboost import CatBoostClassifier
import lightgbm as lgbm
from lightgbm import LGBMClassifier
optuna.logging.set_verbosity(optuna.logging.WARNING)
from lightgbm import *
pd.set_option("display.max_columns", None)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the thyroid dataset from its local CSV export and preview the frame.
data_path = '/Users/jaideepsai/Desktop/DATA-ANALYTICS/Data Source/extracted-data/Health Care/Thyroid_Diff.csv'
train = pd.read_csv(data_path)
display(train)

In [None]:
# Count plot for every categorical column, i.e. every column except the
# 'Recurred' target and 'Age'.
cat_cols = list(train.drop(columns=['Recurred', 'Age']).columns)

# Set up subplots.  The grid is sized from the number of columns so that
# zip() below can never silently drop a column for lack of an axis
# (15 columns still gives the original 5x3 grid at 15x32 inches).
n_cols = 3
n_rows = max(1, -(-len(cat_cols) // n_cols))  # ceiling division, at least 1
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 6.4 * n_rows), squeeze=False)
plt.subplots_adjust(hspace=0.5)
fig.suptitle('Bar plots of categorical variables', fontsize=18, y=0.95)

flat_axes = axes.flatten()
for col, ax in zip(cat_cols, flat_axes):
    sns.countplot(x=col, data=train, ax=ax)

    # Chart formatting.  tick_params rotates the labels without freezing the
    # tick label text the way set_xticklabels(get_xticklabels()) does.
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title(col.upper())
    ax.set_xlabel("")
    ax.set_ylabel("Count")

# Hide the axes left over in the last row when the grid is not full.
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)
plt.show()

# Exploratory Data Analysis (EDA)

In [None]:
# Missing-value report: per-column null counts, then a heatmap of the
# boolean null mask.
null_mask = train.isna()

print('train')
display(null_mask.sum())

plt.figure(figsize=(4, 2))
plt.title("Training Set")
sns.heatmap(null_mask)
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every categorical column (including the 'Recurred' target)
# in place.  A separate encoder is fitted per column and kept in `encoders`,
# so the integer codes can be mapped back to the original labels with
# encoders[column].inverse_transform(...); re-fitting one shared encoder
# discarded every mapping except the last.
# NOTE(review): 'Hx Radiothreapy' is presumably spelled this way in the CSV
# header -- confirm before "correcting" it.
categorical_columns = [
    'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function',
    'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk',
    'T', 'N', 'M', 'Stage', 'Response', 'Recurred',
]

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    train[column] = encoders[column].fit_transform(train[column])

# `le` stays bound to the encoder fitted last ('Recurred'), as before.
le = encoders['Recurred']
display(train)

In [None]:
# Histogram of the 'Recurred' target followed by its skewness and kurtosis.
sns.set_style("white")
sns.set_color_codes(palette='deep')

f, ax = plt.subplots(figsize=(1, 3))
target = train['Recurred']
sns.histplot(target, ax=ax)  # ax is the axes just created above
ax.xaxis.grid(False)
sns.despine(trim=True, left=True)
plt.show()

skewness = target.skew()
kurtosis = target.kurt()
print("Skewness: %f" % skewness)
print("Kurtosis: %f" % kurtosis)

In [None]:
# Feature importance according to CatBoost, fitted on every row of `train`.
X_data_feature = train.drop(columns=['Recurred'])
y_data_feature = train['Recurred']

# `model` remains a list of fitted estimators so further models can be
# added to the comparison without touching the loop below.
model = [CatBoostClassifier(logging_level='Silent')]
model = [estimator.fit(X_data_feature, y_data_feature) for estimator in model]

for estimator in model:
    # Label the output with the class name; slicing str(estimator) to a
    # hard-coded length printed a truncated object repr instead.
    print(type(estimator).__name__ + ': \n', estimator.feature_importances_)
    feat_importances = pd.Series(estimator.feature_importances_,
                                 index=X_data_feature.columns)
    feat_importances.nlargest(16).plot(kind='barh', color='green')
    # Keep the 0-60 range, but widen it when the largest bar would be clipped.
    plt.xlim(0, max(60, feat_importances.max() * 1.05))
    plt.show()

In [None]:
# Feature importance according to XGBoost, fitted on the same
# X_data_feature / y_data_feature used for the CatBoost ranking.
model = [XGBClassifier()]
model = [estimator.fit(X_data_feature, y_data_feature) for estimator in model]

for estimator in model:
    # Label the output with the class name; slicing str(estimator) to a
    # hard-coded length cut the name short ("XGBClassifie").
    print(type(estimator).__name__ + ': \n', estimator.feature_importances_)
    feat_importances = pd.Series(estimator.feature_importances_,
                                 index=X_data_feature.columns)
    feat_importances.nlargest(16).plot(kind='barh', color='royalblue')
    # Keep the 0-0.6 range, but widen it when the largest bar would be clipped.
    plt.xlim(0, max(0.6, feat_importances.max() * 1.05))
    plt.show()

In [None]:
# Pearson correlation between all columns of `train`, drawn as an annotated
# heatmap.
corr = train.corr(method='pearson')
fig, ax = plt.subplots(figsize=(15, 15))
# Let seaborn place one label per row/column at the cell centres.  The
# previous plt.xticks/plt.yticks calls put the ticks at integer positions,
# which are the cell edges, so every label sat half a cell off.
sns.heatmap(corr, cmap='RdBu', annot=True, fmt=".2f",
            xticklabels=True, yticklabels=True, ax=ax)
plt.show()

In [None]:
# Separate the 'Recurred' target (y) from the remaining predictor columns (X).
y = train['Recurred']
X = train.drop('Recurred', axis=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler

X_train = X
y_train = y

# Keep the fitted scaler under its own name: `MinMaxScaler = MinMaxScaler()`
# replaced the imported class with an instance.
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full feature matrix, before the
# train/eval split, so evaluation rows contribute to the min/max used for
# scaling.  Fit on the training rows only if a leakage-free estimate is needed.
# fit_transform returns a bare array, so columns and index are restored here
# to keep the feature names and the row alignment with y_train.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X.columns, index=X.index)
X_train

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the inputs and outputs share the names X_train / y_train, so
# running this cell twice splits the already-reduced training set again.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2019)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_eval: ", X_eval.shape)
print("Shape of y_train: ", y_train.shape)
# Label now ends in ": " like the three lines above.
print("Shape of y_eval: ", y_eval.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier
from sklearn.ensemble import  AdaBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Base learners for the voting ensemble.  Each one is seeded (with the same
# value the train/eval split uses) so that re-running the notebook yields the
# same fitted models and the same reported accuracy.
RANDOM_STATE = 2019
clf1 = BaggingClassifier(random_state=RANDOM_STATE)
clf2 = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf3 = ExtraTreesClassifier(random_state=RANDOM_STATE)
clf4 = RandomForestClassifier(random_state=RANDOM_STATE)
clf5 = XGBClassifier(random_state=RANDOM_STATE)
clf6 = AdaBoostClassifier(random_state=RANDOM_STATE)


In [None]:
# Hard-voting ensemble over the six base learners.
# VotingClassifier.fit clones and fits every estimator itself, so fitting
# clf1..clf6 beforehand trained each model twice; the ensemble is now the
# only place where training happens.
# NOTE(review): with an even number of voters a 3-3 tie is possible;
# scikit-learn documents that hard-voting ties go to the class that sorts
# first.
eclf = VotingClassifier(estimators=[('BC', clf1), ('DT', clf2), ('ETSC', clf3), ('RTC', clf4),
                                    ('XGB', clf5), ('ABC', clf6)], voting='hard')
Voting_model = eclf.fit(X_train, y_train)

# Rebind clf1..clf6 to the fitted clones (same order as `estimators`) so the
# individual models remain usable on their own after this cell.
clf1, clf2, clf3, clf4, clf5, clf6 = Voting_model.estimators_

y_pred_Voting = Voting_model.predict(X_eval)

In [None]:
# Accuracy and confusion matrix of the voting ensemble on the held-out rows.
Voting_acc = accuracy_score(y_eval, y_pred_Voting)
print("Voting accuracy is: {0:.3f}%".format(Voting_acc * 100))
cm = confusion_matrix(y_eval, y_pred_Voting)
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='.0f')
# The classes are the encoded 'Recurred' labels, not digits; confusion_matrix
# is called as (true, predicted), so rows are true and columns predicted.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()