In [61]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, KFold

## Import Dataset

In [92]:
data_path = '/Users/Asus/Desktop/Unity Game/python/Data_Analysis/Machine_Learning_Python/'

df = pd.read_csv(os.path.join(data_path, 'HW2.csv'))
df.head()

Unnamed: 0,S.No,plod,name,title,gender,culture,dateOfBirth,DateoFdeath,mother,father,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,1,0.946,Viserys II Targaryen,,1,,,,Rhaenyra Targaryen,Daemon Targaryen,...,0.0,,0,0,,11,1,1,0.605351,0
1,2,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,,,,...,,1.0,1,1,97.0,1,1,1,0.896321,1
2,3,0.507,Addison Hill,Ser,1,,,,,,...,,,0,1,,0,0,0,0.267559,1
3,4,0.924,Aemma Arryn,Queen,0,,82.0,105.0,,,...,,0.0,1,1,23.0,0,0,0,0.183946,0
4,5,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,,,,...,,1.0,1,1,29.0,0,0,0,0.043478,1


## Categorical dataset

In [93]:
df.describe(include='object')

Unnamed: 0,name,title,culture,mother,father,heir,house,spouse
count,1946,938,677,21,26,23,1519,276
unique,1946,262,64,17,20,22,347,254
top,Brea,Ser,Northmen,Alyssa Velaryon,Aegon I Targaryen,Jaehaerys Targaryen,Night's Watch,Walder Frey
freq,1,384,124,2,2,2,105,7


In [94]:
df.loc[:, df.dtypes == 'object'].isnull().sum() / len(df)*100

name        0.000000
title      51.798561
culture    65.210689
mother     98.920863
father     98.663926
heir       98.818088
house      21.942446
spouse     85.817061
dtype: float64

 - Drop feature with more than 50% null data.
 - Name feature does not bring impact to the study

##### - Factorize house data

In [95]:
df['house_val'] = pd.factorize(df.house)[0]
df[['house_val', 'house']].head(15)

Unnamed: 0,house_val,house
0,-1,
1,0,House Frey
2,1,House Swyft
3,2,House Arryn
4,3,House Santagar
5,-1,
6,4,House Targaryen
7,-1,
8,-1,
9,5,House Osgrey


## Numerical dataset

##### - taking only numerical features, checking null value

In [96]:
df = df.loc[:, df.dtypes != 'object']
df.isnull().sum() / len(df) *100

S.No                  0.000000
plod                  0.000000
gender                0.000000
dateOfBirth          77.749229
DateoFdeath          77.183967
book1                 0.000000
book2                 0.000000
book3                 0.000000
book4                 0.000000
book5                 0.000000
isAliveMother        98.920863
isAliveFather        98.663926
isAliveHeir          98.818088
isAliveSpouse        85.817061
isMarried             0.000000
isNoble               0.000000
age                  77.749229
numDeadRelations      0.000000
boolDeadRelations     0.000000
isPopular             0.000000
popularity            0.000000
isAlive               0.000000
house_val             0.000000
dtype: float64

## Seperating X and y data
##### - remove null contain columns

In [112]:
df = df.loc[:, df.notnull().all()]
y = df.isAlive
X = df.loc[:, [col for col in df.columns if col not in ['S.No', 'isAlive']]]
X.head()

Unnamed: 0,plod,gender,book1,book2,book3,book4,book5,isMarried,isNoble,numDeadRelations,boolDeadRelations,isPopular,popularity,house_val
0,0.946,1,0,0,0,0,0,0,0,11,1,1,0.605351,-1
1,0.613,1,1,1,1,1,1,1,1,1,1,1,0.896321,0
2,0.507,1,0,0,0,1,0,0,1,0,0,0,0.267559,1
3,0.924,0,0,0,0,0,0,1,1,0,0,0,0.183946,2
4,0.383,0,0,0,0,1,0,1,1,0,0,0,0.043478,3


## Cross Validation

In [130]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

## KFold

In [156]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier(n_neighbors=20)))

results = {}
for name, model in models:
    kfold = KFold(n_splits=10, random_state=None)
    cv_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results[name] = (cv_result.mean(), cv_result.std())
    
results

{'LR': (0.791170388751034, 0.035294796322086854),
 'KNN': (0.7879239040529364, 0.032588118710661786)}

## StratifiedKFold

In [154]:
results_Strat = {}
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=None)
    cv_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results_Strat[name] = (cv_result.mean(), cv_result.std())
    
results_Strat

{'LR': (0.7925020678246484, 0.033497610749336534),
 'KNN': (0.7860339123242349, 0.02204217313200516)}