## Import Modules

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

from yellowbrick.model_selection import LearningCurve, ValidationCurve
from scipy.stats import friedmanchisquare, wilcoxon

pd.set_option('display.max_columns', None)


## Adult Dataset 

In [2]:
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
adult = pd.read_csv('adult.data', names = adult_columns, header=None, skipinitialspace=True )

In [3]:
adult

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
adult = adult.replace("?", np.nan)
adult = adult.dropna()

In [5]:
A_categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
adult = pd.get_dummies(adult, columns=A_categorical_columns, drop_first=True)

In [6]:
adult['target'] = (adult['income'] == '>50K').astype(int)

In [7]:
adult.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male,native-country_Canada,native-country_China,native-country_Columbia,native-country_Cuba,native-country_Dominican-Republic,native-country_Ecuador,native-country_El-Salvador,native-country_England,native-country_France,native-country_Germany,native-country_Greece,native-country_Guatemala,native-country_Haiti,native-country_Holand-Netherlands,native-country_Honduras,native-country_Hong,native-country_Hungary,native-country_India,native-country_Iran,native-country_Ireland,native-country_Italy,native-country_Jamaica,native-country_Japan,native-country_Laos,native-country_Mexico,native-country_Nicaragua,native-country_Outlying-US(Guam-USVI-etc),native-country_Peru,native-country_Philippines,native-country_Poland,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,target
0,39,77516,13,2174,0,40,<=50K,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,0
1,50,83311,13,0,0,13,<=50K,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,0
2,38,215646,9,0,0,40,<=50K,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,0
3,53,234721,7,0,0,40,<=50K,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,0
4,28,338409,13,0,0,40,<=50K,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0


In [8]:
X1 = adult.drop(columns=['target'])
y1 = adult['target']

In [9]:
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X1, y1, test_size=0.2, random_state=1, stratify=y1)

In [10]:
A_numeric_columns = ['age', 'fnlwgt', 'education-num',
            'capital-gain', 'capital-loss', 'hours-per-week']


def Scalar(trd, ted, cols):
    scaler = StandardScaler()
    trd[cols] = scaler.fit_transform(trd[cols])
    ted[cols] = scaler.transform(ted[cols])
    return trd, ted, scaler

Scalar(X_train_A, X_test_A, A_numeric_columns)


(            age    fnlwgt  education-num  capital-gain  capital-loss  \
 8233  -0.106282 -0.817074      -0.438885      -0.14889     -0.218787   
 21701 -0.942161  1.911688       1.129402      -0.14889     -0.218787   
 22740 -1.018150  0.477821      -0.438885      -0.14889     -0.218787   
 24732  0.653608 -0.283862      -0.438885      -0.14889     -0.218787   
 30436  0.957564  0.589294       0.345258      -0.14889     -0.218787   
 ...         ...       ...            ...           ...           ...   
 1242  -0.182271  0.046992      -0.438885      -0.14889     -0.218787   
 10077  1.489487  0.340585      -2.007172      -0.14889     -0.218787   
 13572  0.577619  0.029342      -0.438885      -0.14889     -0.218787   
 22888 -1.246117  2.271861      -0.438885      -0.14889     -0.218787   
 6495  -0.562216  1.583146       1.129402      -0.14889     -0.218787   
 
        hours-per-week income  workclass_Local-gov  workclass_Private  \
 8233        -0.080382  <=50K                Fals

## Cover Type Dataset

In [11]:
cover_type_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area1',
    'Wilderness_Area2',
    'Wilderness_Area3',
    'Wilderness_Area4',
] + [f'Soil_Type{i}' for i in range(1, 41)] + [
    'Cover_Type'
]

cover_type = pd.read_csv('covtype.data.gz', header=None, names=cover_type_columns)

In [12]:
cover_type.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [13]:
cover_type['target'] = (cover_type['Cover_Type'] == 2).astype(int)
cover_type = cover_type.drop(columns='Cover_Type')
cover_type.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,target
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
X2 = cover_type.drop(columns = ['target'])
y2 = cover_type['target']

X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X2, y2, test_size=0.2, random_state=2, stratify=y2)

In [15]:
cv_numeric_columns = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
Scalar(X_train_cv, X_test_cv, cv_numeric_columns)

(        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
 267100  -0.303308  0.397053  1.320531                         -0.984850   
 330184  -0.585225  1.665899 -0.548017                         -1.069561   
 132748  -0.260485  1.487189 -0.014146                         -0.867195   
 509963   0.760126 -0.156950  0.252789                         -1.069561   
 72765   -1.241842 -0.675211  0.119322                         -0.984850   
 ...           ...       ...       ...                               ...   
 98514   -1.163333 -0.335661  0.519725                         -1.126035   
 82406   -0.192682 -0.058659 -1.749227                          0.502304   
 288117  -1.869910  0.155794  1.053596                          0.436418   
 45642   -0.049939 -0.621598 -0.548017                         -0.241273   
 524136  -0.292602 -0.818180  1.587467                         -0.843664   
 
         Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 267100   

## Letter Dataset

In [16]:
letter_columns = ['lettr', 'x-box', 'y-box', 'width','high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
letter = pd.read_csv('letter-recognition.data', header=None, names=letter_columns)

In [17]:
letter.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [18]:
letter['target'] = (letter['lettr'] >= 'N').astype(int)
letter = letter.drop(columns=['lettr'])

In [19]:
letter.head()

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,target
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8,1
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,0
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,0
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,1
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,0


In [20]:
X3 = letter.drop(columns = ['target'])
y3 = letter['target']

X_train_ltr, X_test_ltr, y_train_ltr, y_test_ltr = train_test_split(X3, y3, test_size=0.2, random_state=3, stratify=y3)

In [21]:
ltr_numeric_columns = X_train_ltr.columns
Scalar(X_train_ltr, X_test_ltr, ltr_numeric_columns)

(          x-box     y-box     width      high     onpix     x-bar     y-bar  \
 16444  0.508716  1.203263  1.427481  1.166594  2.507814 -0.447176 -0.211143   
 2231  -0.536076  0.294297 -0.062550 -0.162511 -0.687035  0.047078 -1.070649   
 18297 -0.013680 -1.523636 -0.062550 -0.605546  0.225779  0.047078  0.218610   
 4297  -0.536076 -0.311681 -0.559227 -0.605546 -0.230628  0.047078 -0.211143   
 16103 -0.536076  0.294297 -0.559227  0.280524 -0.687035 -0.447176 -0.211143   
 ...         ...       ...       ...       ...       ...       ...       ...   
 16144 -1.058472 -0.917658 -0.559227 -1.048581 -0.687035  1.035585 -0.211143   
 3924  -1.058472 -0.917658 -1.055904 -0.162511 -0.230628  0.541331 -0.211143   
 1708   0.508716  1.203263 -0.062550  1.166594 -0.230628  0.047078  0.648363   
 4594   1.031112 -0.008692  1.427481 -0.162511  0.225779  2.024092 -0.211143   
 15727 -0.536076  0.597285 -0.559227  0.723559 -1.143442  2.024092 -2.359908   
 
           x2bar     y2bar     xybar  

## House Votes Dataset

In [22]:
hv_columns = [
    'Class',
    'handicapped-infants',
    'water-project-cost-sharing',
    'adoption-of-the-budget-resolution',
    'physician-fee-freeze',
    'el-salvador-aid',
    'religious-groups-in-schools',
    'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras',
    'mx-missile',
    'immigration',
    'synfuels-corporation-cutback',
    'education-spending',
    'superfund-right-to-sue',
    'crime',
    'duty-free-exports',
    'export-administration-act-south-africa'
]

house_votes = pd.read_csv('house-votes-84.data', header=None, names=hv_columns)

In [23]:
house_votes.head()

Unnamed: 0,Class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [24]:
house_votes.eq('?').sum()

Class                                       0
handicapped-infants                        12
water-project-cost-sharing                 48
adoption-of-the-budget-resolution          11
physician-fee-freeze                       11
el-salvador-aid                            15
religious-groups-in-schools                11
anti-satellite-test-ban                    14
aid-to-nicaraguan-contras                  15
mx-missile                                 22
immigration                                 7
synfuels-corporation-cutback               21
education-spending                         31
superfund-right-to-sue                     25
crime                                      17
duty-free-exports                          28
export-administration-act-south-africa    104
dtype: int64

In [25]:
house_votes = house_votes.replace('?', 'unknown')

In [26]:
house_votes['target'] = (house_votes['Class'] == 'republican').astype(int)
house_votes = house_votes.drop(columns = ['Class'])

In [27]:
house_votes.head()

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,target
0,n,y,n,y,y,y,n,n,n,y,unknown,y,y,y,n,y,1
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,unknown,1
2,unknown,y,y,unknown,y,y,n,n,n,n,y,n,y,y,n,n,0
3,n,y,y,n,unknown,y,n,n,n,n,y,n,y,n,n,y,0
4,y,y,y,n,y,y,n,n,n,n,y,unknown,y,y,y,y,0


In [28]:
X4 = house_votes.drop(columns = ['target'])
y4 = house_votes['target']
hv_cat_cols = X4.columns
house_votes = pd.get_dummies(house_votes, columns=hv_cat_cols, drop_first=True)

In [29]:
X_train_hv, X_test_hv, y_Train_hv, y_Test_hv = train_test_split(X4, y4, test_size=0.2, random_state=4, stratify=y4 )

## Heart Disease Dataset

In [30]:
hd_columns = heart_columns = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'num'
]

heart_disease = pd.read_csv('processed.cleveland.data', header=None, names=hd_columns)

In [31]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [32]:
heart_disease['target'] = (heart_disease['num'] > 0).astype(int)
heart_disease = heart_disease.drop(columns = ['num'])
heart_disease.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [33]:
heart_disease = heart_disease[~heart_disease.isin(['?']).any(axis=1)]

In [34]:
X5 = heart_disease.drop(columns = ['target'])
y5 = heart_disease['target']

X_train_hd, X_test_hd, y_train_hd, y_test_hd = train_test_split(X5, y5, test_size=0.2, random_state=5, stratify=y5)    
heart_disease.eq('?').sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [35]:
hd_numeric_columns = X_train_hd.columns
Scalar(X_train_hd, X_test_hd, hd_numeric_columns)

(          age       sex        cp  trestbps      chol       fbs   restecg  \
 40   1.216698 -1.469694  0.888245  1.001410 -0.437480 -0.423207  1.032203   
 212 -1.471949  0.680414 -0.133668 -0.106334 -0.647441 -0.423207  1.032203   
 200 -0.463706 -1.469694  0.888245 -1.214078  0.116055 -0.423207  1.032203   
 204 -1.247895  0.680414  0.888245 -1.214078 -0.704703 -0.423207 -0.981230   
 250  0.320482  0.680414  0.888245 -1.214078 -0.895577 -0.423207 -0.981230   
 ..        ...       ...       ...       ...       ...       ...       ...   
 177  0.208455  0.680414  0.888245  0.004440 -1.220063 -0.423207  1.032203   
 36  -1.247895  0.680414  0.888245 -0.660206 -1.353675 -0.423207  1.032203   
 249  0.880617  0.680414 -1.155580 -0.217109 -0.761966  2.362908  1.032203   
 18  -0.687760 -1.469694 -0.133668 -0.106334  0.516890 -0.423207 -0.981230   
 189  1.664806  0.680414 -0.133668  0.447538  0.116055 -0.423207  1.032203   
 
       thalach     exang   oldpeak     slope        ca      th