## Import Modules

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

from yellowbrick.model_selection import LearningCurve, ValidationCurve
from scipy.stats import friedmanchisquare, wilcoxon

pd.set_option('display.max_columns', None)


## Adult Dataset 

In [2]:
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
adult = pd.read_csv('adult.data', names = adult_columns, header=None, skipinitialspace=True )

In [3]:
adult

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
adult = adult.replace("?", np.nan)
adult = adult.dropna()

In [5]:
A_categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
adult = pd.get_dummies(adult, columns=A_categorical_columns, drop_first=True)

In [6]:
adult['income'] = (adult['income'] == '>50K').astype(int)

In [None]:
X1 = adult.drop(columns=['income'])
y1 = adult['income']

In [None]:
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X1, y1, test_size=0.2, random_state=1, stratify=y1)

In [None]:
A_numeric_columns = ['age', 'fnlwgt', 'education-num',
            'capital-gain', 'capital-loss', 'hours-per-week']


def Scalar(trd, ted, cols):
    scaler = StandardScaler()
    trd[cols] = scaler.fit_transform(trd[cols])
    ted[cols] = scaler.transform(ted[cols])
    return trd, ted, scaler

Scalar(X_train_A, X_test_A, A_numeric_columns)


(            age    fnlwgt  education-num  capital-gain  capital-loss  \
 8233  -0.106282 -0.817074      -0.438885      -0.14889     -0.218787   
 21701 -0.942161  1.911688       1.129402      -0.14889     -0.218787   
 22740 -1.018150  0.477821      -0.438885      -0.14889     -0.218787   
 24732  0.653608 -0.283862      -0.438885      -0.14889     -0.218787   
 30436  0.957564  0.589294       0.345258      -0.14889     -0.218787   
 ...         ...       ...            ...           ...           ...   
 1242  -0.182271  0.046992      -0.438885      -0.14889     -0.218787   
 10077  1.489487  0.340585      -2.007172      -0.14889     -0.218787   
 13572  0.577619  0.029342      -0.438885      -0.14889     -0.218787   
 22888 -1.246117  2.271861      -0.438885      -0.14889     -0.218787   
 6495  -0.562216  1.583146       1.129402      -0.14889     -0.218787   
 
        hours-per-week  workclass_Local-gov  workclass_Private  \
 8233        -0.080382                False             

## Cover Type Dataset

In [11]:
cover_type_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    
    # Wilderness Area (4 binary columns)
    'Wilderness_Area1',
    'Wilderness_Area2',
    'Wilderness_Area3',
    'Wilderness_Area4',
    
    # Soil Type (40 binary columns)
] + [f'Soil_Type{i}' for i in range(1, 41)] + [

    # Target column
    'Cover_Type'
]

cover_type = pd.read_csv('covtype.data.gz', header=None, names=cover_type_columns)

In [12]:
cover_type.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [14]:
cover_type['target'] = (cover_type['Cover_Type'] == 2).astype(int)
cover_type.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,target
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,0
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,0
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,1
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,0


In [15]:
X2 = cover_type.drop(columns = ['target'])
y2 = cover_type['target']

X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X2, y2, test_size=0.2, random_state=2, stratify=y2)

In [21]:
cv_numeric_columns = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
Scalar(X_train_cv, X_test_cv, cv_numeric_columns)

(        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
 267100  -0.303308  0.397053  1.320531                         -0.984850   
 330184  -0.585225  1.665899 -0.548017                         -1.069561   
 132748  -0.260485  1.487189 -0.014146                         -0.867195   
 509963   0.760126 -0.156950  0.252789                         -1.069561   
 72765   -1.241842 -0.675211  0.119322                         -0.984850   
 ...           ...       ...       ...                               ...   
 98514   -1.163333 -0.335661  0.519725                         -1.126035   
 82406   -0.192682 -0.058659 -1.749227                          0.502304   
 288117  -1.869910  0.155794  1.053596                          0.436418   
 45642   -0.049939 -0.621598 -0.548017                         -0.241273   
 524136  -0.292602 -0.818180  1.587467                         -0.843664   
 
         Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 267100   

## Letter Dataset

In [22]:
letter = pd.read_csv('letter-recognition.data')

In [23]:
letter.head()

Unnamed: 0,T,2,8,3,5,1,8.1,13,0,6,6.1,10,8.2,0.1,8.3,0.2,8.4
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


## House Votes Dataset

In [10]:
house_votes = pd.read_csv('house-votes-84.data')

## Heart Disease Dataset

In [11]:
heart_disease = pd.read_csv('processed.cleveland.data')