<h2><strong>Classification Model Development</strong></h2>
<br>
<h3>~ <strong>Bindushree R P</strong></h3>

In [None]:
#importing all the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random as rand
import numpy as np

#importing gender guesser
import gender_guesser.detector as gender

#importing all the sklearn and stat model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf

#importing the sklearn packages for classification model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# loading the raw character data (first worksheet, first row used as header)
file = "../path_practice/__storage/GOT_character_predictions.xlsx"

df_got = pd.read_excel(io         = file,
                       header     = 0,
                       sheet_name = 0)

# setting random seeds: the stdlib generator and numpy's global generator,
# so that code drawing from either source is repeatable on a fresh kernel
rand.seed(a = 327)
np.random.seed(seed = 327)

# setting pandas print options so wide frames and long strings are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width',1000)
pd.set_option('display.max_colwidth',100)

# displaying the first 10 rows of the dataset
df_got.head(n=10)

In [None]:
# number of missing values in each column of the raw data
df_got.isna().sum()

In [None]:
#Retrieving the dataset information (columns, non-null counts and dtypes)
df_got.info()

In [None]:
# path of the data dictionary that describes each column of the dataset
file1 = "../path_practice/__storage/GOT_data_dictionary.xlsx" 

# reading the first worksheet, with the first row as header
df_got_data_dict = pd.read_excel(file1, sheet_name = 0, header = 0)

# showing the full data dictionary
df_got_data_dict

In the previous step, we loaded the data dictionary in order to identify features that are only known after the event horizon (that is, after it is known whether a character survived). Any feature that occurred or was collected after the event horizon would have to be dropped. However, we did not find any such features in our dataset: popularity looks like a candidate, but it is not necessarily based on the isAlive feature. Hence we do not drop any features from our dataset, and nothing is considered to have taken place after the event horizon.

In [None]:
# re-checking how many values are missing per column before imputing
df_got.isna().sum()

In [None]:
# frequency of every title, used to spot duplicated or near-duplicate values
df_got['title'].value_counts()

In [None]:
#user-defined functions


#function for missing value flagger

def mv_flagger(df):
    """
    Adds a 0/1 indicator column named 'm_<COLUMN_NAME>' for every column that
    contains at least one missing value.

    The DataFrame is modified in place; the same object is also returned.

    PARAMETERS
    ----------
    df : DataFrame to flag missing values in


    RETURNS
    -------
    The same DataFrame, with the missing value flag columns appended.
    """

    # snapshot of the column labels, so the flag columns added below are
    # not themselves visited
    for column in list(df.columns):

        null_mask = df[column].isnull()

        # only columns that actually have gaps get a flag
        if null_mask.any():
            df['m_' + column] = null_mask.astype(int)

    return df



# text_split_feature

def text_split_feature(col, df, sep=' ', new_col_name='number_of_names'):
    """
    Splits each value of a string column on `sep` and stores the number of
    resulting pieces in a new integer column. The DataFrame is modified in
    place; nothing is returned.

    PARAMETERS
    ----------
    col          : column to split (every value must be a string)
    df           : DataFrame where column is located
    sep          : string sequence to split by, default ' '
    new_col_name : name of new column holding the piece count, default
                   'number_of_names'

    RAISES
    ------
    AttributeError if a value in `col` is not a string (e.g. a missing value),
    because it has no .split() method.
    """

    # honour the caller's separator (it was previously ignored in favour of a
    # hard-coded ' ') and compute all counts in one pass over the column
    # instead of writing cell by cell
    piece_counts = df[col].map(lambda text: len(text.split(sep)))

    df[new_col_name] = piece_counts.astype('int64')

In [None]:
# flagging every column that has missing values; mv_flagger adds the flag
# columns to the frame it is given and hands the same frame back
df_got_translated = mv_flagger(df_got)

# listing the columns, including the newly added flags
df_got_translated.columns

In [None]:
# Pearson correlation is only defined for numeric data, so the text columns
# (name, title, culture, ...) are excluded explicitly; recent pandas versions
# raise an error instead of silently dropping them
df_got_numeric = df_got.select_dtypes(include = ['number', 'bool'])

df_got_corr = df_got_numeric.corr(method= "pearson").round(decimals=2)

# correlation of every numeric feature with the response variable
df_got_corr['isAlive'].sort_values(ascending = False)

In [None]:
# histogram of date of birth to check the distribution for skewness
dob_ax = sns.histplot(data     = df_got,
                      x        = 'dateOfBirth',
                      kde      = True,
                      binwidth = 1000)

# title and axis labels (plain-language labels instead of column names)
dob_ax.set_title(label    = "original date of birth distribution")
dob_ax.set_xlabel(xlabel  = "DOB")
dob_ax.set_ylabel(ylabel  = "Count")

# rendering the figure
plt.show()

In [None]:
# histogram of age to check the distribution for skewness
age_ax = sns.histplot(data     = df_got,
                      x        = 'age',
                      kde      = True,
                      binwidth = 1000)

# title and axis labels (plain-language labels instead of column names)
age_ax.set_title(label    = "original age distribution")
age_ax.set_xlabel(xlabel  = "age")
age_ax.set_ylabel(ylabel  = "Count")

# rendering the figure
plt.show()

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

From the above histograms, we can say that both <strong>DateOfBirth</strong> and <strong>Age</strong> are skewed, so their missing values should be imputed with the median. Another possibility is to create a new column with a nested condition that checks whether the value in either Age or DOB is greater than zero and, if not, records a missing value in the new column. This needs further investigation; for now I have imputed the missing values with the <strong>median</strong>. 

<strong>Imputing missing values</strong><br>

1. Age             - imputed with median<br>
2. Date of birth   - imputed with median<br>
3. Title           - imputed with unknown<br>
4. Culture         - imputed with unknown<br>
5. Mother          - imputed with unknown<br>
6. Father          - imputed with unknown<br>
7. Heir            - imputed with unknown<br>
8. House           - Used nested conditional to fill the top houses and imputed the rest with Unknown<br>
9. Spouse          - imputed with unknown<br>
10. isAliveMother  - imputed with 0<br>
11. isAliveFather  - imputed with 0<br>
12. isAliveHeir    - imputed with 0<br>
13. isAliveSpouse  - imputed with 0<br>

In [None]:
# median age, used as the fill value because the distribution is skewed
imputed_age = df_got_translated['age'].median()

# replacing the missing ages with the median
age_filled = df_got_translated['age'].fillna(value = imputed_age)
df_got_translated['age'] = age_filled


# checking results: number of ages still missing
df_got_translated['age'].isna().sum()

In [None]:
# median date of birth, used as the fill value because the distribution is skewed
imputed_dob = df_got_translated['dateOfBirth'].median()

# replacing the missing dates of birth with the median
dob_filled = df_got_translated['dateOfBirth'].fillna(value = imputed_dob)
df_got_translated['dateOfBirth'] = dob_filled


# checking results: number of dates of birth still missing
df_got_translated['dateOfBirth'].isna().sum()

In [None]:
# frequency of every culture, to spot spelling variants and duplicates that
# can be grouped under a single value
df_got_translated.loc[:, 'culture'].value_counts()

Analysing the cultures column to club the duplicates into one value:
1. Same names - Northmen
Northmen                   124
northmen                     9
2. Same category - Ironborn
Ironborn                   112
Ironmen                      5
ironborn                     1
3. same names = Free Folk
Free Folk                   51
Free folk                   11
free folk                    1
Wildling                     2
First Men                    3
Wildlings                    2
4. Same = Qarth
Qartheen                     6
Qarth                        1
5. Same = Braavosi
Braavosi                    42
Braavos                      1
6. same = Ghiscari
Ghiscari                    25
Ghiscaricari                 1
7. Same = Dornish
Dornish                     25
Dornishmen                  14
Dothraki                    23
Dorne                        2
8. Same category = Riverlands
Rivermen                    19
Riverlands                   2
9. Same category = Vale
Valemen                     19
Vale mountain clans         15
Vale                         1
10. Same = Meereen
Meereen                      1
Meereenese                   3
11. Same = Reach
Reach                       16
Reachmen                     1
The Reach                    1
12. same = Westerman
Westerman                    9
Westermen                    4
westermen                    2
Westerlands                  2
13. Same = Stormlander
Stormlands                   7
Stormlander                  1
14. Same = Lysene
Lysene                       4
Lyseni                       3
15. Same = Asshai
Asshai                       2
Asshai'i                     1
16. Same = Summer Islanders
Summer Islands               1
Summer Islander              1
Summer Isles                 5
17. Same = Andals
Andals                       1
Andal                        1
18. Same = Norvoshi
Norvos                       1
Norvoshi                     1
19. Same = Lhazareen
Lhazareen                    2
Lhazarene                    1

20. Others
    Northern mountain clans      5
    Crannogmen                   4
    Astapori                     4
    Pentoshi                     3
    Myrish                       3
    Sistermen                    2
    Qohor                        2
    Astapor                      1
    Rhoynar                      1
    Naathi                       1
    Ibbenese                     1

The below cultures will not be used in the part where we translate the culture column with one value for all duplicates.
21. Tyroshi                      7
22. Westeros                    12
23. Valyrian                    43

24. NAN = UNKNOWN

There are 34 different cultures in total, or 24 once grouped. We will group the cultures into the 24 categories above so that we can create dummy variables for them.

In [None]:
# canonical culture label -> every spelling / variant that should map to it
all_cultures = {
    'Northmen'        : ['Northmen', 'northmen'],
    'Ironborn'        : ['Ironborn', 'Ironmen', 'ironborn'],
    'Freefolk'        : ['Free Folk', 'Free folk', 'free folk',
                         'Wildling', 'First Men', 'Wildlings'],
    'Qarth'           : ['Qartheen', 'Qarth'],
    'Braavosi'        : ['Braavosi', 'Braavos'],
    'Ghiscari'        : ['Ghiscari', 'Ghiscaricari'],
    'Dornish'         : ['Dornish', 'Dornishmen', 'Dorne'],
    'Riverlands'      : ['Rivermen', 'Riverlands'],
    'Vale'            : ['Valemen', 'Vale mountain clans', 'Vale'],
    'Meereen'         : ['Meereen', 'Meereenese'],
    'Reach'           : ['Reach', 'Reachmen', 'The Reach'],
    'Westerman'       : ['Westerman', 'Westermen', 'westermen', 'Westerlands'],
    'Stormlander'     : ['Stormlands', 'Stormlander'],
    'Lysene'          : ['Lysene', 'Lyseni'],
    'Asshai'          : ['Asshai', "Asshai'i"],
    'SummerIslanders' : ['Summer Islands', 'Summer Islander', 'Summer Isles'],
    'Andals'          : ['Andals', 'Andal'],
    'Norvoshi'        : ['Norvos', 'Norvoshi'],
    'Lhazareen'       : ['Lhazareen', 'Lhazarene'],
    'Others'          : ['Northern mountain clans', 'Crannogmen', 'Astapori',
                         'Pentoshi', 'Myrish', 'Sistermen', 'Qohor', 'Astapor',
                         'Rhoynar', 'Naathi', 'Ibbenese'],
}

# inverting the grouping into a direct lookup: variant spelling -> canonical label
culture_lookup = {variant: label
                  for label, variants in all_cultures.items()
                  for variant in variants}

# translating the column in one pass; values without an entry in the lookup
# (including missing values) are kept exactly as they are
df_got_translated['culture'] = df_got_translated['culture'].map(
    lambda value: culture_lookup.get(value, value))

df_got_translated['culture'].value_counts()

In [None]:
# text columns whose missing values are replaced with the label 'Unknown'
unknown_fill_columns = ['title', 'mother', 'culture', 'father', 'heir', 'spouse']

for text_column in unknown_fill_columns:
    df_got_translated[text_column] = df_got_translated[text_column].fillna('Unknown')

# checking results: remaining missing values per column
df_got_translated.isna().sum()

In [None]:
# value used wherever a relative's alive-status is missing
fill = 0

# relative-status columns that receive the fill value
relative_status_columns = ['isAliveMother', 'isAliveFather',
                           'isAliveHeir', 'isAliveSpouse']

for status_column in relative_status_columns:
    df_got_translated[status_column] = df_got_translated[status_column].fillna(fill)

# checking results: remaining missing values per column
df_got_translated.isna().sum()

In [None]:
# counting the space-separated parts of every character name
text_split_feature('name',
                   df_got_translated,
                   sep          = ' ',
                   new_col_name = 'number_of_names')

# checking results: how many characters have 1, 2, 3, ... name parts
name_part_counts = df_got_translated['number_of_names'].value_counts(sort = False)
name_part_counts.sort_index()

In [None]:
# engineered feature: sum of the two continuous columns, age and date of birth
age_plus_dob = df_got_translated['age'].add(df_got_translated['dateOfBirth'])
df_got_translated['age_dob'] = age_plus_dob

df_got_translated.columns

In [None]:
# installing gender_guesser into the kernel's environment
# NOTE(review): the first cell already runs `import gender_guesser.detector`,
# so on a fresh environment this install is reached too late -- consider
# moving it to the top of the notebook and pinning a version.
%pip install gender_guesser

In [None]:
# STEP 1: splitting each character name and reversing the word order, so that
# the last word of the name ends up in the first position

# one reversed list of name parts per character
placeholder_lst = [full_name.split(sep = ' ')[::-1]
                   for full_name in df_got_translated['name']]


# one row per character, one column per name part (shorter names are padded)
df_got_name = pd.DataFrame(placeholder_lst)


# displaying the results
df_got_name.head(n=20)

In [None]:
# naming the reversed name parts; the first column holds the last word of
# each name
df_got_name.columns = ['lastname' , 'middlename', 'firstname', 'value1', 'value2', 'value3']


# adding the last name to the translated DataFrame; plain column assignment
# is used instead of pd.concat so that re-running this cell overwrites the
# column rather than appending a duplicate 'lastname' column
df_got_translated['lastname'] = df_got_name['lastname']


# printing value counts of the last names
df_got_translated.loc[: ,'lastname'].value_counts()

In [None]:
# family names searched for inside each character name, in priority order:
# the first one found decides the house
surname_to_house = [('Frey',      'House Frey'),
                    ('Stark',     'House Stark'),
                    ('Targaryen', 'House Targaryen'),
                    ('Lannister', 'House Lannister'),
                    ('Greyjoy',   'House Greyjoy'),
                    ('Tyrell',    'House Tyrell'),
                    ('Martell',   'House Martell')]

# names that contain none of the family names are labelled 'Unknown'
df_got_translated['house_new'] = [
    next((house for surname, house in surname_to_house if surname in full_name),
         'Unknown')
    for full_name in df_got_translated['name']
]

In [None]:
#Checking how many characters fall into each value of house_new
df_got_translated['house_new'].value_counts()

In [None]:
# wherever house is missing, take the value from house_new instead
house_fallback = df_got_translated['house_new']
df_got_translated['house'] = df_got_translated['house'].fillna(value = house_fallback)


In [None]:
# reducing title to two levels: 'title_unknown' where the title is the
# 'Unknown' placeholder, 'title_known' for everything else
title = ['title_unknown' if current_title == 'Unknown' else 'title_known'
         for current_title in df_got_translated['title']]

# turning the list into a single-column DataFrame
df_title = pd.DataFrame(title)
df_title.head(n=10)

In [None]:
# naming the single column of the title indicator frame
df_title.columns = ['titles']


# adding the indicator to the translated DataFrame; plain column assignment
# is used instead of pd.concat so that re-running this cell overwrites the
# column rather than appending a duplicate 'titles' column
df_got_translated['titles'] = df_title['titles']


# printing value counts of the title indicator
df_got_translated.loc[: ,'titles'].value_counts()

In [None]:
# reducing house to two levels: 'house_unknown' where the house is the
# 'Unknown' placeholder, 'house_known' for everything else
house = ['house_unknown' if current_house == 'Unknown' else 'house_known'
         for current_house in df_got_translated['house']]

# turning the house list into a single-column DataFrame
df_house = pd.DataFrame(house)

In [None]:
# naming the single column of the house indicator frame
df_house.columns = ['house_translated']


# adding the indicator to the translated DataFrame; plain column assignment
# is used instead of pd.concat so that re-running this cell overwrites the
# column rather than appending a duplicate 'house_translated' column
df_got_translated['house_translated'] = df_house['house_translated']


# printing value counts of the house indicator
df_got_translated.loc[: ,'house_translated'].value_counts()

In [None]:
# splitting each character name into its parts, in original word order, so
# that the first name ends up in the first position

# one list of name parts per character
placeholder_lst2 = [full_name.split(sep = ' ')
                    for full_name in df_got_translated['name']]


# one row per character, one column per name part (shorter names are padded)
df_got_firstname = pd.DataFrame(placeholder_lst2)


#displaying the results
df_got_firstname.head(n=50)

In [None]:
#naming the name parts; the first column holds the first word of each name
df_got_firstname.columns = ['first_name' , 'middlename', 'lastname', 'value1', 'value2', 'value3']


#adding the first name to the translated DataFrame; plain column assignment
#is used instead of pd.concat so that re-running this cell overwrites the
#column rather than appending a duplicate 'first_name' column
df_got_translated['first_name'] = df_got_firstname['first_name']


#listing the columns to confirm first_name was added
df_got_translated.columns


In [None]:
#Using gender guesser on firstname to guess the gender 

#building the detector once and reusing it for every name, instead of
#constructing a new Detector for each row
gender_detector = gender.Detector()


#one guess per first name; the guesses are inspected through the new column
#below rather than by printing one line per character
placeholder_lst3 = [gender_detector.get_gender(name)
                    for name in df_got_translated['first_name']]


#converting list into a series that shares the DataFrame's index, so each
#guess lands on the row it was computed from
df_got_translated['gender_guess'] = pd.Series(placeholder_lst3,
                                              index = df_got_translated.index)


#checking results
df_got_translated.head(n = 30)

In [None]:
#Using gender guesser output as a list to add as a column in the dataframe
gender_guesses_list = ['unknown',
'unknown',
'andy',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'male',
'mostly_male',
'mostly_male',
'mostly_male',
'mostly_male',
'mostly_male',
'mostly_male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'female',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'male',
'andy',
'andy',
'unknown',
'andy',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'male',
'male',
'unknown',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'mostly_male',
'male',
'mostly_male',
'mostly_male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'mostly_male',
'unknown',
'unknown',
'male',
'female',
'andy',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'mostly_female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'andy',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'male',
'female',
'female',
'female',
'female',
'female',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'mostly_female',
'female',
'unknown',
'mostly_female',
'unknown',
'female',
'unknown',
'female',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'andy',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'female',
'female',
'female',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_female',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'mostly_male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_male',
'female',
'male',
'male',
'male',
'female',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'male',
'male',
'male',
'female',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'andy',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'male',
'unknown',
'unknown',
'female',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'female',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'female',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'female',
'female',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_female',
'mostly_female',
'mostly_female',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'female',
'male',
'male',
'male',
'male',
'unknown',
'female',
'female',
'female',
'unknown',
'mostly_male',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'female',
'male',
'female',
'unknown',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'female',
'unknown',
'male',
'unknown',
'unknown',
'mostly_female',
'male',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'female',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'male',
'male',
'female',
'mostly_female',
'female',
'mostly_female',
'mostly_female',
'mostly_female',
'mostly_female',
'mostly_female',
'mostly_female',
'unknown',
'unknown',
'female',
'female',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'female',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'unknown',
'female',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'female',
'mostly_male',
'unknown',
'female',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'female',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'mostly_male',
'male',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'female',
'female',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'female',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'male',
'male',
'andy',
'male',
'male',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'male',
'male',
'male',
'male',
'male',
'male',
'mostly_male',
'mostly_male',
'mostly_male',
'mostly_male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'male',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'female',
'female',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'mostly_female',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'mostly_female',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'mostly_female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'female',
'unknown',
'female',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'andy',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'mostly_female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'male',
'unknown',
'male',
'male',
'unknown',
'male',
'unknown',
'male',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'male',
'male',
'male',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'mostly_male',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'mostly_female',
'unknown',
'unknown',
'unknown',
'female',
'male',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'female',
'male',
'mostly_male',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'female',
'unknown',
'unknown',
'unknown',
'male',
'unknown',
'unknown']

#using the above list to create a "Guessed_Gender" column in the df_got_translated Dataset
df_got_translated['Guessed_Gender'] = pd.Series(gender_guesses_list)

In [None]:
# building one indicator (dummy) frame per categorical column;
# the five names are kept so each frame can still be inspected on its own
(one_hot_titles,
 one_hot_name,
 one_hot_culture,
 one_hot_gender,
 one_hot_houses) = [pd.get_dummies(df_got_translated[column])
                    for column in ('titles',
                                   'number_of_names',
                                   'culture',
                                   'Guessed_Gender',
                                   'house_translated')]


# appending every indicator frame to the working dataset in a single join
# (rebinds df_got_translated, so this cell should only be run once per kernel)
dummy_frames      = [one_hot_titles, one_hot_name, one_hot_culture,
                     one_hot_gender, one_hot_houses]

df_got_translated = df_got_translated.join(other = dummy_frames)

In [None]:
# checking results: column names, dtypes and non-null counts after the join
df_got_translated.info()

In [None]:
# Renaming all the columns after getting dummies.
# The assignment below is POSITIONAL: the i-th label replaces the i-th existing
# column, and pandas raises if the list length differs from the column count.
# NOTE(review): the d_* labels presumably follow the order in which the dummy
# frames were joined (titles, number of names, culture, gender, house) and the
# order get_dummies produced within each group -- confirm against the .info()
# output above before changing any upstream step.
df_got_translated.columns = ['S.No','name','title','culture','dateOfBirth','mother','father','heir',
                             'house','spouse','book1_A_Game_Of_Thrones','book2_A_Clash_Of_Kings',
                             'book3_A_Storm_Of_Swords','book4_A_Feast_For_Crows','book5_A_Dance_with_Dragons',
                             'isAliveMother','isAliveFather','isAliveHeir','isAliveSpouse','isMarried','isNoble',
                             'age','numDeadRelations','popularity','isAlive','m_title','m_culture','m_dateOfBirth',
                             'm_mother','m_father','m_heir','m_house','m_spouse','m_isAliveMother','m_isAliveFather',
                             'm_isAliveHeir','m_isAliveSpouse','m_age','number_of_names','age_dob','lastname','house_new',
                             'titles','house_translated','first_name','Guessed_Gender','d_title_known','d_title_unknown',
                             'd_numberofname_1','d_numberofname_2','d_numberofname_3','d_numberofname_4','d_numberofname_5',
                             'd_numberofname_6','d_cul_Andals','d_cul_Asshai','d_cul_Braavosi','d_cul_Dornish','d_cul_Dothraki',
                             'd_cul_Freefolk','d_cul_Ghiscari','d_cul_Ironborn','d_cul_Lhazareen','d_cul_Lysene','d_cul_Meereen',
                             'd_cul_Northmen','d_cul_Norvoshi','d_cul_Others','d_cul_Qarth','d_cul_Reach','d_cul_Riverlands',
                             'd_cul_Stormlander','d_cul_SummerIslanders','d_cul_Tyroshi','d_cul_Unknown','d_cul_Vale','d_cul_Valyrian',
                             'd_cul_Westerman','d_cul_Westeros','d_gender_andy','d_gender_female','d_gender_male',
                             'd_gender_mostly_female','d_gender_mostly_male','d_gender_unknown','d_house_known','d_house_unknown']


# checking results: the full list of column labels after renaming
df_got_translated.columns

In [None]:
# the name-like / categorical source columns are removed now that the
# dummy (d_*) columns are part of the dataset
columns_to_drop = ['name','title','culture','mother','father',
                   'heir','house','spouse','house_new','lastname',
                   'titles','house_translated','first_name','Guessed_Gender']

df_got_translated = df_got_translated.drop(columns = columns_to_drop)


# checking the results
df_got_translated.info()

In [None]:
########################################
# optimal_neighbors
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.25,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
    Exhaustively compute training and testing scores for KNN across
    [1, max_neighbors]. Prints and returns the number of neighbors with the
    highest test score and (by default) shows a visualization of the results.

    PARAMETERS
    ----------
    x_data        : explanatory variable data
    y_data        : response variable
    standardize   : whether or not to standardize the x data, default True
    pct_test      : test size for training and validation from (0,1), default 0.25
    seed          : random seed to be used in algorithm, default 219
    response_type : type of neighbors algorithm to use, default 'reg'
        Use 'reg' for regression (KNeighborsRegressor)
        Use 'class' for classification (KNeighborsClassifier)
    max_neighbors : maximum number of neighbors in exhaustive search, default 20
    show_viz      : display or suppress k-neighbors visualization, default True

    RETURNS
    -------
    int : the smallest number of neighbors that reaches the maximum test score

    RAISES
    ------
    ValueError : if response_type is not 'reg' or 'class', or if
                 max_neighbors is smaller than 1

    NOTES
    -----
    The scores come from each model's .score() method. The split made here is
    NOT stratified and, when standardize is True, the scaler is fit on all of
    x_data before splitting.
    """

    # validating arguments before any work is done; previously an unknown
    # response_type only printed a message and then failed (or silently
    # re-scored a stale model) on the first call to clf.score()
    if response_type not in ('reg', 'class'):
        raise ValueError("Error: response_type must be 'reg' or 'class'")

    if max_neighbors < 1:
        raise ValueError("max_neighbors must be at least 1")


    if standardize:
        #optionally standardizing x_data
        # (the result is a new DataFrame with a fresh 0..n-1 index and
        #  integer column labels; rows stay in their original order)
        scaler = StandardScaler()
        scaler.fit(x_data)
        x_data = pd.DataFrame(scaler.transform(x_data))


    #train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    #choosing the estimator class once instead of on every iteration
    if response_type == 'reg':
        knn_class = KNeighborsRegressor
    else:
        knn_class = KNeighborsClassifier


    #creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []


    #setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        #building and fitting the model for this number of neighbors
        clf = knn_class(n_neighbors = n_neighbors)
        clf.fit(x_train, y_train)

        #recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))

        #recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    #optionally displaying visualization
    if show_viz:
        #plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()


    #returning optimal number of neighbors
    # (list.index returns the first maximum, i.e. the smallest such k;
    #  +1 converts the 0-based list position into a neighbor count)
    best_k = test_accuracy.index(max(test_accuracy)) + 1
    print(f"The optimal number of neighbors is: {best_k}")
    return best_k


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
    Creates a heatmap visualization of a confusion matrix.

    PARAMETERS
    ----------
    true_y : true values for the response variable
    pred_y : predicted values for the response variable
    labels : tick labels for the classes, default None
        Used ONLY as axis tick labels (they are not passed to
        confusion_matrix), so they must be listed in the same order as the
        matrix rows/columns, i.e. in ascending order of the class values.
        When None, seaborn's automatic tick labels are used.

    RETURNS
    -------
    None; the plot is rendered with plt.show()
    """

    #declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    #setting labels
    # None is not one of the tick-label values sns.heatmap documents
    # ("auto", bool, list-like, int), so the default is mapped to "auto"
    lbls = 'auto' if labels is None else labels


    #heatmap
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

In [None]:
# checking correlation with isAlive:
# pairwise Pearson correlations of all columns, rounded to two decimals
df_got_corr = df_got_translated.corr(method= "pearson").round(decimals=2)

# correlations of every column with the response, strongest positive first
df_got_corr['isAlive'].sort_values(ascending = False)

In [None]:
# class balance of the response variable, as proportions rounded to two decimals
df_got_translated.loc[: ,'isAlive'].value_counts(normalize = True).round(decimals = 2)

<strong>Preparing Explanatory Variables</strong> 

In [None]:
# x data: explanatory variables
# everything except the response ('isAlive') and the columns listed below
# NOTE(review): one dummy per encoded group (title, number of names, culture,
# gender, house) is excluded, presumably to act as the baseline level -- confirm
excluded_columns = ['isAlive',
                    'book5_A_Dance_with_Dragons',
                    'isAliveMother',
                    'isAliveFather',
                    'isAliveHeir',
                    'isAliveSpouse',
                    'isMarried',
                    'isNoble',
                    'd_title_unknown',
                    'd_numberofname_6',
                    'd_cul_Others',
                    'd_gender_andy',
                    'd_house_unknown']

got_data = df_got_translated.drop(columns = excluded_columns)


# y data: response variable
got_target = df_got_translated['isAlive']

In [None]:
#train-test split with stratification
# 10% of the rows are held out; stratifying on got_target keeps the isAlive
# class proportions the same in the training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
            got_data,
            got_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_target) # preserving balance


#merging training data for statsmodels
# (the formula API needs x and y as columns of a single DataFrame)
got_train = pd.concat([x_train, y_train], axis = 1)

In [None]:
# confirming that stratification preserved the class proportions of the
# response variable in both the training and the testing set
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")



<strong>Univariate Logistic Model</strong>

In [None]:
# instantiating a logistic regression model object
# (single explanatory variable, age_dob, fit on the training data only)
logistic_small = smf.logit(formula   = """isAlive ~ age_dob""",
                           data = got_train)


# FITTING the model object
results_logistic = logistic_small.fit()


# checking the results SUMMARY
results_logistic.summary2() # summary2() has AIC and BIC

In [None]:
# instantiating a logistic regression model object
# (candidate "full" model: book appearances, relations, popularity, age and
#  the gender / house dummies; d_gender_andy and d_gender_unknown are not
#  in the formula)
logistic_full = smf.logit(formula = """ isAlive ~ book1_A_Game_Of_Thrones + 
                                                 book2_A_Clash_Of_Kings + 
                                                 book3_A_Storm_Of_Swords + 
                                                 book4_A_Feast_For_Crows + 
                                                 numDeadRelations + 
                                                 popularity + 
                                                 age_dob + 
                                                 d_gender_female + 
                                                 d_gender_male + 
                                                 d_gender_mostly_female + 
                                                 d_gender_mostly_male + 
                                                 d_house_known """,
                                        data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

In [None]:
# instantiating a logistic regression model object
# (reduced formula: book1-book4, popularity and age_dob only)
# NOTE: this rebinds logistic_full and results_full, replacing the objects
# created in the previous cell
logistic_full = smf.logit(formula = """ isAlive ~ book1_A_Game_Of_Thrones + 
                                                  book2_A_Clash_Of_Kings + 
                                                  book3_A_Storm_Of_Swords + 
                                                  book4_A_Feast_For_Crows + 
                                                  popularity + 
                                                  age_dob """,
                                        data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

In [None]:
# instantiating a logistic regression model object with all significant features
# (same as the previous formula without book3_A_Storm_Of_Swords)
logit_sig = smf.logit(formula = """ isAlive ~ book1_A_Game_Of_Thrones + 
                                              book2_A_Clash_Of_Kings + 
                                              book4_A_Feast_For_Crows + 
                                              popularity + 
                                              age_dob""",
                                            data    = got_train)


# fitting the model object
# NOTE: logit_sig is rebound here from the unfitted model to its fitted
# results object, so the unfitted model is no longer reachable by name
logit_sig = logit_sig.fit()


# checking the results SUMMARY
logit_sig.summary2()

In [None]:
# explanatory variable sets taken from the statsmodels cells above

# creating a dictionary to store candidate models
# (keys are set names, values are lists of column names in df_got_translated)

candidate_dict = {

 # full model
 # NOTE(review): this list contains 'd_gender_andy', which is not part of
 # the statsmodels "full" formula above -- confirm which is intended
 'logit_full'   : ['book1_A_Game_Of_Thrones','book2_A_Clash_Of_Kings','book3_A_Storm_Of_Swords',
                   'book4_A_Feast_For_Crows','numDeadRelations','popularity','age_dob','d_gender_andy',
                   'd_gender_female','d_gender_male','d_gender_mostly_female','d_gender_mostly_male',
                   'd_house_known'],
 

 # significant variables only (set 1)
 # (same variables as the logit_sig formula)
 'logit_sig'    : ['book1_A_Game_Of_Thrones','book2_A_Clash_Of_Kings','book4_A_Feast_For_Crows',
                   'popularity','age_dob']

}

In [None]:
# printing candidate variable sets
# (the doubled backslashes are escapes that print as single backslashes
#  in the box drawn around the heading)
print(f"""
/--------------------------\\
|Explanatory Variable Sets |
\\--------------------------/

Full Model:
-----------
{candidate_dict['logit_full']}


Significant p-value Model:
--------------------------------
{candidate_dict['logit_sig']}

""")

In [None]:
# train/test split with the significant-variable set ('logit_sig')
# NOTE: this rebinds got_data, x_train, x_test, y_train and y_test, so every
# later cell works with these five explanatory variables only
got_data   =  df_got_translated.loc[ : , candidate_dict['logit_sig']]
got_target =  df_got_translated.loc[ : , 'isAlive']


# same split settings as before (10% test, stratified, seed 219)
x_train, x_test, y_train, y_test = train_test_split(
            got_data,
            got_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_target)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results once and saving them for future use
# built-in round() is used instead of the .round() method because it works
# whether score() returns a NumPy scalar or a plain Python float
logreg_train_score = round(logreg_fit.score(x_train, y_train), 4) # accuracy
logreg_test_score  = round(logreg_fit.score(x_test, y_test), 4)   # accuracy

# gap between training and testing accuracy
logreg_test_gap = round(abs(logreg_train_score - logreg_test_score), 4)


# displaying the results
print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)
print('LogReg Train-Test Gap   :', logreg_test_gap)

In [None]:
# creating a confusion matrix
# (rows are actual classes, columns are predicted classes, both in
#  ascending order of the class values)
print(confusion_matrix(y_true = y_test,
                       y_pred = logreg_pred))

In [None]:
# flattening the confusion matrix row by row; for a binary response the four
# cells come out as true negatives, false positives, false negatives and
# true positives
logreg_cm = confusion_matrix(y_true = y_test, y_pred = logreg_pred)

logreg_tn, logreg_fp, logreg_fn, logreg_tp = logreg_cm.ravel()


# reporting the four counts
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

In [None]:
# calling the visual_cm function
# the tick labels must follow the row/column order of the confusion matrix,
# which is ascending by class value: 0 first, then 1
# (assumes isAlive is coded 0 = not alive, 1 = alive, which matches the
#  TN/FP/FN/TP unpacking used in this notebook)
visual_cm(true_y = y_test,
          pred_y = logreg_pred,
          labels = ['Not Alive', 'Alive'])

In [None]:
# area under the roc curve (auc)
# NOTE: the score is computed from the hard class predictions (logreg_pred),
# not from predicted probabilities
# built-in round() works for both NumPy scalars and plain Python floats
logreg_auc_score = round(roc_auc_score(y_true  = y_test,
                                       y_score = logreg_pred), 4)


# displaying the AUC score (also saved above for future use)
print(logreg_auc_score)

In [None]:
# pairing each feature name with its coefficient (rounded to two decimals)
logreg_model_values = zip(df_got_translated[candidate_dict['logit_sig']].columns,
                          logreg_fit.coef_.ravel().round(decimals = 2))


# the list starts with the intercept and is followed by one
# (feature, coefficient) pair per explanatory variable
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals = 2))]
logreg_model_lst.extend(logreg_model_values)


# checking the results, one pair per line
print(*logreg_model_lst, sep = '\n')

<strong>Classification Trees (CART Models)</strong>

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart, one bar per column of `train`.

    PARAMETERS
    ----------
    model  : fitted CART model exposing a feature_importances_ attribute
    train  : explanatory variable training data the model was fit on
             (its columns provide the bar labels)
    export : whether or not to export as a .png image, default False
             (saved as 'Tree_Leaf_50_Feature_Importance.png' in the
             current working directory)
    """

    # declaring the number of features from the data that was passed in
    # (previously read from the global x_train, which gave wrong or failing
    #  plots whenever `train` was a different dataset)
    n_features = train.shape[1]

    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))

    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

    if export:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

In [None]:
# INSTANTIATING a classification tree object
# random_state is fixed, as for the other models in this notebook, so the
# fitted tree and its scores are repeatable from run to run
full_tree = DecisionTreeClassifier(random_state = 219)


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# SCORING the model once and saving the results for future use
# built-in round() works for both NumPy scalars and plain Python floats
full_tree_train_score = round(full_tree_fit.score(x_train, y_train), 4) # accuracy
full_tree_test_score  = round(full_tree_fit.score(x_test, y_test), 4)   # accuracy


# saving AUC (computed from the hard class predictions)
full_tree_auc_score   = round(roc_auc_score(y_true  = y_test,
                                            y_score = full_tree_pred), 4) # auc


# displaying the results
print('Full Tree Training ACCURACY:', full_tree_train_score)

print('Full Tree Testing ACCURACY :', full_tree_test_score)

print('Full Tree AUC Score:', full_tree_auc_score)

In [None]:
# flattening the confusion matrix row by row; for a binary response the four
# cells come out as true negatives, false positives, false negatives and
# true positives
full_tree_cm = confusion_matrix(y_true = y_test, y_pred = full_tree_pred)

full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp = full_tree_cm.ravel()


# reporting the four counts
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")

In [None]:
# setting figure size
#plt.figure(figsize=(150,50))


# developing a plotted tree
#plot_tree(decision_tree = full_tree_fit, 
#          feature_names = df_got_translated.columns,
#          filled        = True, 
#          rounded       = True, 
#          fontsize      = 14)


# rendering the plot
#plt.show()

In [None]:
# INSTANTIATING a classification tree object
# (pruned: at most 4 levels deep and at least 25 observations per leaf)
tree_pruned = DecisionTreeClassifier(max_depth        = 4,
                                     min_samples_leaf = 25,
                                     random_state     = 219)


# FITTING the training data
pruned_tree_fit = tree_pruned.fit(x_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree_fit.predict(x_test)


# SCORING the model once and saving the results for future use
# built-in round() works for both NumPy scalars and plain Python floats
pruned_tree_train_score = round(pruned_tree_fit.score(x_train, y_train), 4) # accuracy
pruned_tree_test_score  = round(pruned_tree_fit.score(x_test, y_test), 4)   # accuracy


# saving auc score (computed from the hard class predictions)
pruned_tree_auc_score   = round(roc_auc_score(y_true  = y_test,
                                              y_score = pruned_tree_pred), 4) # auc


# displaying the results
print('Training ACCURACY:', pruned_tree_train_score)
print('Testing  ACCURACY:', pruned_tree_test_score)
print('AUC Score        :', pruned_tree_auc_score)

In [None]:
# flattening the confusion matrix row by row; for a binary response the four
# cells come out as true negatives, false positives, false negatives and
# true positives
pruned_tree_cm = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred)

pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp = pruned_tree_cm.ravel()


# reporting the four counts
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

In [None]:
# setting figure size
plt.figure(figsize=(20, 10)) # adjusting to better fit the visual


# developing a plotted tree
# feature_names must list the columns the tree was fit on, in the same order;
# the tree was fit on x_train, so its columns are used (the columns of
# df_got_translated are a different, longer list and would mislabel the nodes)
plot_tree(decision_tree = pruned_tree_fit, # changing to pruned_tree_fit
          feature_names = list(x_train.columns),
          filled        = True, 
          rounded       = True, 
          fontsize      = 14)


# rendering the plot
plt.show()

In [None]:
# plotting feature importance for the pruned tree
# (x_train supplies the feature labels; export = False means no .png is saved)
plot_feature_importances(pruned_tree_fit,
                         train = x_train,
                         export = False)

In [None]:
# side-by-side comparison of the three models fitted so far
print(f"""
Model         AUC Score      TN, FP, FN, TP
-----         ---------      --------------
Logistic      {logreg_auc_score}         {logreg_tn, logreg_fp, logreg_fn, logreg_tp}
Full Tree     {full_tree_auc_score}         {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}
Pruned Tree   {pruned_tree_auc_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}
""")


# collecting the model results in a DataFrame: one row per model,
# one column per metric
model_performance = pd.DataFrame({

    'Model Name'        : ['Logistic', 'Full Tree', 'Pruned Tree'],

    'AUC Score'         : [logreg_auc_score,
                           full_tree_auc_score,
                           pruned_tree_auc_score],

    'Training Accuracy' : [logreg_train_score,
                           full_tree_train_score,
                           pruned_tree_train_score],

    'Testing Accuracy'  : [logreg_test_score,
                           full_tree_test_score,
                           pruned_tree_test_score],

    'Confusion Matrix'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                           (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                           (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp)]})


# sending model results to Excel (without the row index)
model_performance.to_excel('../path_practice/__results/classification_model_performance_got.xlsx',
                           index = False)

<strong>Classification modeling with KNN</strong>

In [None]:
# determining the optimal number of neighbors
opt_neighbors = optimal_neighbors(x_data        = got_data,
                                  y_data        = got_target,
                                  response_type = 'class')

In [None]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
# NOTE(review): the scaler is fit on all rows before the train-test split, so
# the testing rows influence the scaling -- confirm this is acceptable
scaler.fit(got_data)


# TRANSFORMING the data
x_scaled     = scaler.transform(got_data)


# converting to a DataFrame
x_scaled_df  = pd.DataFrame(x_scaled) 


# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            x_scaled_df,
            got_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = got_target)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = opt_neighbors)


# FITTING the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train_scaled)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)


# SCORING the results once and saving them
# built-in round() works for both NumPy scalars and plain Python floats
knn_train_score = round(knn_fit.score(x_train_scaled, y_train_scaled), 4)
knn_test_score  = round(knn_fit.score(x_test_scaled, y_test_scaled), 4)


# saving AUC score
# scored against y_test_scaled, the response values produced by THIS cell's
# split, rather than y_test left over from an earlier cell
knn_auc_score   = round(roc_auc_score(y_true  = y_test_scaled,
                                      y_score = knn_pred), 4)


# displaying the results
print('Training ACCURACY:', knn_train_score)
print('Testing  ACCURACY:', knn_test_score)
print('AUC Score        :', knn_auc_score)

In [None]:
# calling the visual_cm function
# the tick labels must follow the row/column order of the confusion matrix,
# which is ascending by class value: 0 first, then 1
# (assumes isAlive is coded 0 = not alive, 1 = alive, which matches the
#  TN/FP/FN/TP unpacking used in this notebook)
# y_test_scaled is the response from the same split that produced knn_pred
visual_cm(true_y = y_test_scaled,
          pred_y = knn_pred,
          labels = ['Not Alive', 'Alive'])

In [None]:
# flattening the confusion matrix row by row; for a binary response the four
# cells come out as true negatives, false positives, false negatives and
# true positives
# (the knn_tree_* names are kept as they are although this is a KNN model)
knn_cm = confusion_matrix(y_true = y_test, y_pred = knn_pred)

knn_tree_tn, knn_tree_fp, knn_tree_fn, knn_tree_tp = knn_cm.ravel()


# reporting the four counts
print(f"""
True Negatives : {knn_tree_tn}
False Positives: {knn_tree_fp}
False Negatives: {knn_tree_fn}
True Positives : {knn_tree_tp}
""")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer 

<strong>Logistic Regression with Hyperparameter tuning</strong>

In [None]:
# INSTANTIATING a logistic regression model with default values
# (these are the same settings as the earlier logreg model; they serve as the
#  untuned baseline for the hyperparameter search below)
lr_default = LogisticRegression(solver = 'lbfgs',
                                C = 1.0,
                                warm_start = False,
                                random_state = 219)
# warm_start = True would reuse the solution of the previous call to fit() as
# the starting point for the next fit; it is False by default, so every fit
# starts from scratch

In [None]:
# FITTING the training data
lr_default_fit = lr_default.fit(x_train, y_train)


# PREDICTING based on the testing set
lr_default_pred = lr_default_fit.predict(x_test)


# SCORING the results once and saving them for future use
# NOTE: these assignments overwrite logreg_train_score, logreg_test_score and
# logreg_auc_score saved by the earlier logistic regression cells
# built-in round() works for both NumPy scalars and plain Python floats
logreg_train_score = round(lr_default_fit.score(x_train, y_train), 4) # accuracy
logreg_test_score  = round(lr_default_fit.score(x_test, y_test), 4)   # accuracy


# saving AUC score (computed from the hard class predictions)
logreg_auc_score = round(roc_auc_score(y_true  = y_test,
                                       y_score = lr_default_pred), 4)


# displaying the results
print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)
print('AUC Score        :', logreg_auc_score)

<strong>Hyperparameter tuning with RandomizedSearchCV</strong>

In [None]:
########################################
# RandomizedSearchCV
########################################

#declaring a hyperparameter space
C_range          = np.arange(0.1, 5.0, 0.1)
warm_start_range = [True, False]
solver_range     = ['newton-cg', 'sag', 'lbfgs']


#creating a hyperparameter grid
param_grid = {'C'          : C_range,
              'warm_start' : warm_start_range,
              'solver'     : solver_range}


#instantiating the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219,
                              max_iter     = 1000) # increased for convergence


#scoring criteria (AUC)
# make_scorer scores the estimator's predict() output by default, which is
# what needs_threshold = False used to request; that keyword is deprecated in
# newer scikit-learn releases, so it is no longer passed
auc_scorer = make_scorer(roc_auc_score)


#RandomizedSearchCV object
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring             = auc_scorer) # scoring criteria (AUC)


#fitting to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(got_data, got_target)


# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", round(lr_tuned_cv.best_score_, 4))

In [None]:
#checking the results of RandomizedSearchCV
# cv_results_ is a dictionary of arrays (one entry per sampled combination);
# showing the ten best-ranked candidates as a DataFrame is far easier to read
# than the raw dictionary
pd.DataFrame(lr_tuned_cv.cv_results_).sort_values(by = 'rank_test_score').head(n = 10)

In [None]:
#checking the best estimator for the model
# (displays the estimator object that the randomized search selected as best)
lr_tuned_cv.best_estimator_

In [None]:
#building a model based on hyperparameter tuning results

#instantiating a logistic regression model with tuned values
# (hard-coded from the randomized search so this cell does not depend on
#  re-running the search)
lr_tuned = LogisticRegression(C            = 3.9000000000000004,
                              warm_start   = True,
                              solver       = 'newton-cg',
                              max_iter     = 1000,
                              random_state = 219)


#fitting the model on the TRAINING split only
# x_test is a subset of got_data, so fitting on the full dataset would let the
# model see the very rows it is scored on and inflate the testing metrics
lr_tuned.fit(x_train, y_train)


#predicting based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test)


#saving scoring data for future use
lr_tuned_train_score = lr_tuned.score(x_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(x_test, y_test).round(4)   # accuracy


#saving the AUC score (computed from class predictions, as elsewhere in this notebook)
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc


#scoring the results
print('Training ACCURACY:', lr_tuned_train_score)
print('Testing  ACCURACY:', lr_tuned_test_score)
print('AUC Score        :', lr_tuned_auc)

In [None]:
#flattening the tuned logistic regression confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(lr_tuned_tn,
 lr_tuned_fp,
 lr_tuned_fn,
 lr_tuned_tp) = confusion_matrix(y_true = y_test,
                                 y_pred = lr_tuned_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")

In [None]:
#loading model performance
model_performance = pd.read_excel('../path_practice/__results/classification_model_performance_got.xlsx')


#declaring model performance objects
lr_train_acc = lr_tuned.score(x_train, y_train).round(4)
lr_test_acc  = lr_tuned.score(x_test, y_test).round(4)
lr_auc       = roc_auc_score(y_true  = y_test,
                             y_score = lr_tuned_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'        : 'Tuned LR',
                           'Training Accuracy' : lr_train_acc,
                           'Testing Accuracy'  : lr_test_acc,
                           'AUC Score'         : lr_auc,
                           'Confusion Matrix'  : (lr_tuned_tn,
                                                  lr_tuned_fp,
                                                  lr_tuned_fn,
                                                  lr_tuned_tp)}])],
                           ignore_index = True)


#checking the results
model_performance

<strong>Hyperparameter tuning on classification trees</strong>

In [None]:
#declaring a hyperparameter space
criterion_range = ['gini', 'entropy']
splitter_range  = ['best', 'random']
depth_range     = np.arange(1, 25, 1)
leaf_range      = np.arange(1, 100, 1)


#creating a hyperparameter grid
param_grid = {'criterion'        : criterion_range,
              'splitter'         : splitter_range,
              'max_depth'        : depth_range,
              'min_samples_leaf' : leaf_range}


#instantiating the model object without hyperparameters
tuned_tree = DecisionTreeClassifier(random_state = 219)


#RandomizedSearchCV object
# `needs_threshold` is no longer passed to make_scorer: it was deprecated in
# scikit-learn 1.4 and later removed, and False was its default anyway
tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
                                   param_distributions   = param_grid,
                                   cv                    = 3,
                                   n_iter                = 1000,
                                   random_state          = 219,
                                   scoring               = make_scorer(roc_auc_score))


#fitting to the FULL DATASET (cross-validation does its own train/validation splits)
tuned_tree_cv.fit(got_data, got_target)

# printing the optimal parameters and best score
# best_score_ is the mean cross-validated score, hence the "CV AUC" label
print("Tuned Parameters  :", tuned_tree_cv.best_params_)
print("Tuned CV AUC      :", tuned_tree_cv.best_score_.round(4))

In [None]:
#building a model based on hyperparameter tuning results as above

#instantiating a classification tree with tuned values
tree_tuned = DecisionTreeClassifier(splitter         = 'best',
                                    min_samples_leaf = 4,
                                    max_depth        = 17,
                                    criterion        = 'entropy',
                                    random_state     = 219)


#fitting on the TRAINING split only
# x_test is a subset of got_data, so fitting on the full dataset would let the
# tree memorise the rows it is scored on and inflate the testing metrics
tree_tuned_fit = tree_tuned.fit(x_train, y_train)


#predicting based on the testing set
tree_tuned_pred = tree_tuned.predict(x_test)


#saving scoring data for future use
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


#saving the AUC score (computed from class predictions, as elsewhere in this notebook)
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                       y_score = tree_tuned_pred).round(4) # auc


#scoring the results
print('Training ACCURACY:', tree_tuned_train_score)
print('Testing  ACCURACY:', tree_tuned_test_score)
print('AUC Score        :', tree_tuned_auc)

In [None]:
#flattening the tuned tree confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(tuned_tree_tn,
 tuned_tree_fp,
 tuned_tree_fn,
 tuned_tree_tp) = confusion_matrix(y_true = y_test,
                                   y_pred = tree_tuned_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {tuned_tree_tn}
False Positives: {tuned_tree_fp}
False Negatives: {tuned_tree_fn}
True Positives : {tuned_tree_tp}
""")

In [None]:
#declaring model performance objects
tree_train_acc = tree_tuned.score(x_train, y_train).round(4)
tree_test_acc  = tree_tuned.score(x_test, y_test).round(4)
tree_auc       = roc_auc_score(y_true  = y_test,
                              y_score = tree_tuned_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'        : 'Tuned Tree',
                           'Training Accuracy' : tree_train_acc,
                           'Testing Accuracy'  : tree_test_acc,
                           'AUC Score'         : tree_auc,
                           'Confusion Matrix'  : (tuned_tree_tn,
                                                  tuned_tree_fp,
                                                  tuned_tree_fn,
                                                  tuned_tree_tp)}])],
                           ignore_index = True)


#checking the results
model_performance

In [None]:
#setting figure size
plt.figure(figsize=(40, 10))


#developing a plotted tree
# feature names are taken from got_data, the frame whose columns the tree was
# trained on, so every split is labelled with the variable it actually uses
# (df_got_translated also holds columns the model never saw, such as the target)
plot_tree(decision_tree = tree_tuned_fit, 
          feature_names = list(got_data.columns),
          filled        = True, 
          rounded       = True, 
          fontsize      = 14)


#rendering the plot
plt.show()

In [None]:
#persisting the model comparison table as an Excel workbook (row index omitted)
model_performance.to_excel(
    '../path_practice/__results/classification_model_performance_got.xlsx',
    index = False,
)

In [None]:
#new tools
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

In [None]:
#loading model performance
# NOTE(review): this re-reads the workbook that an earlier cell wrote with
# to_excel; after the Excel round trip the 'Confusion Matrix' tuples presumably
# come back as plain strings - confirm if they are used for anything but display
model_performance = pd.read_excel('../path_practice/__results/classification_model_performance_got.xlsx')

In [None]:
#checking the reloaded results
model_performance

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart, one bar per explanatory variable.
    
    PARAMETERS
    ----------
    model  : fitted model exposing a feature_importances_ attribute
    train  : explanatory variable training data; must provide .shape and
             .columns (the column names are used as the bar labels)
    export : whether or not to export as a .png image, default False
    
    RETURNS
    -------
    None; the chart is drawn on a new 12x9 figure and, when export is
    truthy, saved to '../path_practice/__results/Feature_Importance_got.png'
    """
    
    #declaring the number of features (one bar per column of train)
    n_features = train.shape[1]
    
    #setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    #drawing on the axes created above instead of the global pyplot state
    ax.barh(range(n_features), model.feature_importances_, align='center')
    ax.set_yticks(np.arange(n_features))
    ax.set_yticklabels(train.columns)
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Feature")
    
    #optionally saving the figure to the results folder
    if export:
        fig.savefig('../path_practice/__results/Feature_Importance_got.png')

In [None]:
#explanatory variables (the 'logit_full' candidate set) and the response variable
got_data   = df_got_translated.loc[:, candidate_dict['logit_full']]
got_target = df_got_translated.loc[:, 'isAlive']


#stratified 90/10 train/test split
x_train, x_test, y_train, y_test = train_test_split(got_data,
                                                    got_target,
                                                    test_size    = 0.10,
                                                    random_state = 219,
                                                    stratify     = got_target)

<strong>Random Forest</strong>

In [None]:
#instantiating the baseline random forest with explicitly stated hyperparameters
rf_default = RandomForestClassifier(
    n_estimators     = 100,
    criterion        = "gini",
    max_depth        = 4,
    min_samples_leaf = 1,
    bootstrap        = True,
    warm_start       = False,
    random_state     = 219,
)

In [None]:
#training the baseline forest on the training split
rf_default_fit = rf_default.fit(X = x_train, y = y_train)


#class predictions for the testing split
rf_default_fit_pred = rf_default_fit.predict(X = x_test)


#reporting accuracy on both splits
print('Training ACCURACY:', rf_default_fit.score(X = x_train, y = y_train).round(4))
print('Testing  ACCURACY:', rf_default_fit.score(X = x_test,  y = y_test).round(4))


#reporting the AUC of the test predictions (class predictions, not probabilities)
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = rf_default_fit_pred).round(4))

In [None]:
#bar chart of the baseline forest's feature importances (not exported to disk)
plot_feature_importances(model  = rf_default_fit,
                         train  = x_train,
                         export = False)

In [None]:
#flattening the baseline forest confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(rf_tn,
 rf_fp,
 rf_fn,
 rf_tp) = confusion_matrix(y_true = y_test,
                           y_pred = rf_default_fit_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")

In [None]:
#declaring model performance objects
rf_train_acc = rf_default_fit.score(x_train, y_train).round(4)
rf_test_acc  = rf_default_fit.score(x_test, y_test).round(4)
rf_auc       = roc_auc_score(y_true  = y_test,
                             y_score = rf_default_fit_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'         : 'Random Forest (Full)',
                           'Training Accuracy'  : rf_train_acc,
                           'Testing Accuracy'   : rf_test_acc,
                           'AUC Score'          : rf_auc,
                           'Confusion Matrix'   : (rf_tn,
                                                   rf_fp,
                                                   rf_fn,
                                                   rf_tp)}])],
                          ignore_index = True)


#checking the results
model_performance

In [None]:
#declaring a hyperparameter space
estimator_range  = np.arange(100, 1100, 250)
leaf_range       = np.arange(1, 31, 10)
criterion_range  = ['gini', 'entropy']
bootstrap_range  = [True, False]
warm_start_range = [True, False]


#creating a hyperparameter grid
param_grid = {'n_estimators'     : estimator_range,
              'min_samples_leaf' : leaf_range,
              'criterion'        : criterion_range,
              'bootstrap'        : bootstrap_range,
              'warm_start'       : warm_start_range}


#instantiating the model object without hyperparameters
forest_grid = RandomForestClassifier(random_state = 219)


#RandomizedSearchCV object
# - random_state is set, like in the other searches, so the sampling order is
#   reproducible between runs
# - the grid above only holds 4 * 3 * 2 * 2 * 2 = 96 combinations, so with
#   n_iter = 1000 the search is expected to evaluate every one of them
# - `needs_threshold` is no longer passed to make_scorer: it was deprecated in
#   scikit-learn 1.4 and later removed, and False was its default anyway
forest_cv = RandomizedSearchCV(estimator           = forest_grid,
                               param_distributions = param_grid,
                               cv                  = 3,
                               n_iter              = 1000,
                               random_state        = 219,
                               scoring             = make_scorer(roc_auc_score))


#fitting to the FULL DATASET (cross-validation does its own train/validation splits)
forest_cv.fit(got_data, got_target)


#predict step is not needed


#printing the optimal parameters and best score
# best_score_ is the mean cross-validated score, hence the "CV AUC" label
print("Tuned Parameters  :", forest_cv.best_params_)
print("Tuned CV AUC      :", forest_cv.best_score_.round(4))

In [None]:
#best estimator based on RandomizedSearchCV
# (displays the forest object that the randomized search selected as best)
forest_cv.best_estimator_

In [None]:
#building a model based on hyperparameter tuning results

#instantiating with the hyperparameters reported for the best estimator
forest_tuned = RandomForestClassifier(criterion='gini', 
                                      min_samples_leaf=1,
                                      n_estimators=350, 
                                      random_state=219, 
                                      warm_start=True)


#fitting on the TRAINING split only
# x_test is a subset of got_data, so fitting on the full dataset would let the
# forest memorise the rows it is scored on and inflate the testing metrics
forest_tuned_fit = forest_tuned.fit(x_train, y_train)


#predicting based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test)


#saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(x_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(x_test, y_test).round(4)   # accuracy


#saving the AUC score (computed from class predictions, as elsewhere in this notebook)
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc


#scoring the results
print('Forest Tuned Training ACCURACY:', forest_tuned_train_score)
print('Forest Tuned Testing  ACCURACY:', forest_tuned_test_score)
print('Forest Tuned AUC Score        :', forest_tuned_auc)

In [None]:
#bar chart of the tuned forest's feature importances (not exported to disk)
plot_feature_importances(model  = forest_tuned_fit,
                         train  = x_train,
                         export = False)

In [None]:
#flattening the tuned forest confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(tuned_rf_tn,
 tuned_rf_fp,
 tuned_rf_fn,
 tuned_rf_tp) = confusion_matrix(y_true = y_test,
                                 y_pred = forest_tuned_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")

In [None]:
#declaring model performance objects
tuned_rf_train_acc = forest_tuned_fit.score(x_train, y_train).round(4)
tuned_rf_test_acc  = forest_tuned_fit.score(x_test, y_test).round(4)
tuned_rf_auc       = roc_auc_score(y_true  = y_test,
                                   y_score = forest_tuned_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'         : 'Tuned Random Forest (Full)',
                           'Training Accuracy'  : tuned_rf_train_acc,
                           'Testing Accuracy'   : tuned_rf_test_acc,
                           'AUC Score'          : tuned_rf_auc,
                           'Confusion Matrix'   : (tuned_rf_tn,
                                                   tuned_rf_fp,
                                                   tuned_rf_fn,
                                                   tuned_rf_tp)}])],
                          ignore_index = True)


#checking the results
model_performance

<strong>Gradient Boost Machines (GBM)</strong>

In [None]:
#instantiating a baseline GBM with explicitly stated hyperparameters
# `loss` is intentionally left at the library default: the 'deviance' spelling
# was deprecated in scikit-learn 1.1 and later removed, while the default
# selects the same loss in every version
full_gbm_default = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 3,
                                              warm_start    = False,
                                              random_state  = 219)


#fit step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(x_train, y_train)


#predicting based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(x_test)


#scoring the results
print('Training ACCURACY:', full_gbm_default_fit.score(x_train, y_train).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4))

In [None]:
#flattening the baseline GBM confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(gbm_default_tn,
 gbm_default_fp,
 gbm_default_fn,
 gbm_default_tp) = confusion_matrix(y_true = y_test,
                                    y_pred = full_gbm_default_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {gbm_default_tn}
False Positives: {gbm_default_fp}
False Negatives: {gbm_default_fn}
True Positives : {gbm_default_tp}
""")

In [None]:
#scoring the model
gbm_train_acc = full_gbm_default_fit.score(x_train, y_train).round(4)
gbm_test_acc  = full_gbm_default_fit.score(x_test, y_test).round(4)
gbm_auc       = roc_auc_score(y_true  = y_test,
                              y_score = full_gbm_default_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'       : 'GBM (Full)',
                          'Training Accuracy' : gbm_train_acc,
                          'Testing Accuracy'  : gbm_test_acc,
                          'AUC Score'         : gbm_auc,
                          'Confusion Matrix'  : (gbm_default_tn,
                                                 gbm_default_fp,
                                                 gbm_default_fn,
                                                 gbm_default_tp)}])],
                          ignore_index = True)


#checking the results
model_performance

The code below is frozen and commented, as it takes a long time to run.

In [None]:
#declaring a hyperparameter space
learn_range        = np.arange(0.1,2.2,0.5)
estimator_range    = np.arange(100, 501, 25)
depth_range        = np.arange(2, 11, 2)
warm_start_range = [True, False]


#creating a hyperparameter grid
param_grid = {'learning_rate' : learn_range,
              'max_depth'     : depth_range,
              'n_estimators'  : estimator_range,
              'warm_start'    : warm_start_range}

#instantiating the model object without hyperparameters
full_gbm_grid = GradientBoostingClassifier(random_state = 219)


#RandomizedSearchCV object
# `needs_threshold` is no longer passed to make_scorer: it was deprecated in
# scikit-learn 1.4 and later removed, and False was its default anyway
full_gbm_cv = RandomizedSearchCV(estimator           = full_gbm_grid,
                                 param_distributions = param_grid,
                                 cv                  = 3,
                                 n_iter              = 500,
                                 random_state        = 219,
                                 scoring             = make_scorer(roc_auc_score))


#fitting to the FULL DATASET (cross-validation does its own train/validation splits)
full_gbm_cv.fit(got_data, got_target)


#predict step is not needed


#printing the optimal parameters and best score
# best_score_ is the mean cross-validated score, hence the "CV AUC" label
print("Tuned Parameters  :", full_gbm_cv.best_params_)
print("Tuned CV AUC      :", full_gbm_cv.best_score_.round(4))

Tuned Parameters  : {'warm_start': False, 'n_estimators': 375, 'max_depth': 2, 'learning_rate': 1.1}
Tuned Training AUC: 0.7551

In [None]:
#instantiating with the tuned hyperparameters recorded from the randomized search
gbm_tuned = GradientBoostingClassifier(learning_rate = 1.1,
                                       max_depth     = 2,
                                       n_estimators  = 375,
                                       warm_start    = False,
                                       random_state  = 219)


#fitting on the TRAINING split only
# x_test is a subset of got_data, so fitting on the full dataset would let the
# model see the very rows it is scored on and inflate the testing metrics
gbm_tuned_fit = gbm_tuned.fit(x_train, y_train)


#predicting based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


#scoring the results
print('Training ACCURACY:', gbm_tuned_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))

In [None]:
#flattening the tuned GBM confusion matrix into four counts
#(ravel() order for a binary target: tn, fp, fn, tp)
(gbm_tuned_tn,
 gbm_tuned_fp,
 gbm_tuned_fn,
 gbm_tuned_tp) = confusion_matrix(y_true = y_test,
                                  y_pred = gbm_tuned_pred).ravel()


#reporting the four counts
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")

In [None]:
#declaring model performance objects
# (these names are reused by the final summary sentence at the end of the notebook)
gbm_train_acc = gbm_tuned_fit.score(x_train, y_train).round(4)
gbm_test_acc  = gbm_tuned_fit.score(x_test, y_test).round(4)
gbm_auc       = roc_auc_score(y_true  = y_test,
                              y_score = gbm_tuned_pred).round(4)


#appending to model_performance
# DataFrame.append was removed in pandas 2.0; the new row is wrapped in a
# one-row DataFrame and stacked underneath with pd.concat instead
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([
                          {'Model Name'        : '**Final Model - Tuned GBM**',
                          'Training Accuracy'  : gbm_train_acc,
                          'Testing Accuracy'   : gbm_test_acc,
                          'AUC Score'          : gbm_auc,
                          'Confusion Matrix'   : (gbm_tuned_tn,
                                                  gbm_tuned_fp,
                                                  gbm_tuned_fn,
                                                  gbm_tuned_tp)}])],
                          ignore_index = True)


#checking the results
model_performance

In [None]:
#persisting the final model comparison table as an Excel workbook (row index omitted)
model_performance.to_excel(
    '../path_practice/__results/classification_model_performance_got.xlsx',
    index = False,
)

In [None]:
#comparing results

#building the comparison table straight from model_performance, one line per
#row, so the numbering and labels always match the DataFrame instead of
#relying on hard-coded row positions
comparison_lines = [
    "",
    f"{'Model':<34}{'AUC':<12}{'Training':<12}{'Testing':<12}Confusion Matrix",
    f"{'':<34}{'Score':<12}{'Accuracy':<12}{'Accuracy':<12}(TN, FP, FN, TP)",
    f"{'-' * 32:<34}{'-' * 10:<12}{'-' * 10:<12}{'-' * 10:<12}{'-' * 20}",
]

for position, (_, row) in enumerate(model_performance.iterrows(), start = 1):
    model_label = f"{position}. {row['Model Name']}"
    comparison_lines.append(
        f"{model_label:<34}"
        f"{row['AUC Score']:<12.4f}"
        f"{row['Training Accuracy']:<12.4f}"
        f"{row['Testing Accuracy']:<12.4f}"
        f"{row['Confusion Matrix']}")

comparison_lines.append("")

print("\n".join(comparison_lines))

#train-test gap of the final model, computed from the saved accuracies so the
#sentence below cannot drift out of sync with the scores
gbm_gap = round(abs(gbm_train_acc - gbm_test_acc), 4)

print(f"""Tuned GBM is my final model with a test train gap of {gbm_gap} and an AUC Score of {gbm_auc}.""")