In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look for every figure in this notebook: white background with grid lines.
sns.set_style("whitegrid")
# "notebook" context selects the scaling of plot elements; "poster" (below) is the larger alternative.
sns.set_context("notebook")
#sns.set_context("poster")


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

from sklearn import preprocessing


<h1>Basic Classification Algorithms</h1>

Here we review six of the most well-known classification algorithms. 

Two linear:

<ul>
    <li>Logistic Regression.</li>
    <li>Linear Discriminant Analysis.</li>
</ul>
and four non-linear:
<ul>
    <li>k-nn - k-Nearest Neighbors.</li>
    <li>Naive Bayes.</li>
    <li>CART - Classification and Regression Trees.</li>
    <li>SVM - Support Vector Machines.</li>
</ul>

Then we will address the simple and common question: <b><i>Which algorithms should I use on this dataset?</i></b>

In all cases we will use a dataset that we are familiar with, the Pima Indians dataset, with a 10-fold cross-validation. 



<img src="Pima_indians_cowboy_1889.jpg">

In this exercise we will use one of the traditional Machine Learning datasets, the Pima Indians diabetes dataset.

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

Content
The dataset consists of several medical predictor variables and one target variable, <b>Outcome</b>. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
<blockquote>
        <ul style="list-style-type:square;">
            <li>Pregnancies</li> 
            <li>Glucose</li>
            <li>BloodPressure</li>
            <li>SkinThickness</li>
            <li>Insulin</li>
            <li>BMI</li>
            <li>DiabetesPedigreeFunction (scores the likelihood of diabetes based on family history)</li>
            <li>Age</li>
            <li>Outcome</li>
        </ul>
</blockquote>

In [None]:
# Load the Pima Indians dataset, split it into features (X) and target (y),
# and prepare the scaled variants plus the result containers used by later cells.

from numpy import set_printoptions
# Three decimals, no scientific notation, for every array shown below.
set_printoptions(precision=3, suppress=True)

filename="pima-indians-diabetes.data.csv"
names=[
    "pregnancies", "glucose", "pressure", "skin",
    "insulin", "bmi", "pedi", "age", "outcome",
]
p_indians=pd.read_csv(filename, names=names)
p_indians.head()

# The first eight columns are the predictors; the ninth is the outcome label.
array=p_indians.values
X=array[:, :8]
y=array[:, 8]
X
pd.DataFrame(X).head()

# Two rescaled copies of the predictors: standardized and min-max scaled.
# NOTE(review): both scalers are fitted on the full dataset before cross-validation,
# so the validation folds influence the scaling - confirm this is acceptable here.
std_scaler=preprocessing.StandardScaler()
X_std=std_scaler.fit_transform(X)

minmax_scaler=preprocessing.MinMaxScaler()
X_minmax=minmax_scaler.fit_transform(X)

# Empty DataFrames that the algorithm cells fill and concatenate for plotting.
resall, res_w1, res_w2, res_w3 = (pd.DataFrame() for _ in range(4))

# Logistic Regression

It is probably the best known and the oldest. We are also pretty familiar with it !

Logistic regression assumes a Gaussian distribution for the numeric input variables and can solve binary and multi-class classification problems. 

We will use the <b>LogisticRegression</b> class.

In [None]:
# Logistic Regression: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.linear_model import LogisticRegression

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LogisticRegression(solver="liblinear")

results=cross_val_score(model, X, y, cv=kfold)

print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Logistic Regression (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Logistic Regression ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# if the range of variables is large scaling doesn't matter in a log regression
# but if you are not sure if they are (or you don't want to check ... ) just try !

# One row per fold, tagged with the variant name. The frames are rebuilt from scratch
# so this cell does not depend on what a previous cell left in res_w1..res_w3.
res_w1=pd.DataFrame({"Res": results, "Type": "log"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "log -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "log 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)


# LDA - Linear Discriminant Analysis

Linear Discriminant Analysis or discriminant analysis is a generalization of Fisher's linear discriminant, originally developed by Ronald Fisher in 1936. Although it is different from ANOVA (Analysis of variance), they are closely related. 

LDA also assumes a Gaussian distribution of the numerical input variables and can be used for binary or multi-class classification. 

We will use the <b>LinearDiscriminantAnalysis</b> class.


In [None]:
# LDA - Linear Discriminant Analysis: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LinearDiscriminantAnalysis()

results=cross_val_score(model, X, y, cv=kfold)

print(f'LDA Linear Discriminant Analysis - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'LDA (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'LDA ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "LDA"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "LDA -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "LDA 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)


# The accuracies come out the same for all three variants: LDA starts from an
# eigenvalue/eigenvector decomposition, so any linear rescaling gives the same result.


# k-nn k-Nearest Neighbors

k-Nearest Neighbors is a non-linear machine learning algorithm that uses a distance metric to find the k most similar elements, taking the mean outcome of those neighbors as the prediction.

One interesting advantage of this algorithm is that we can choose a different metric for calculating the distance. The default metric is Minkowski, which is equivalent to the Euclidean distance with p=2. It can easily be changed to the Manhattan distance with p=1. 

To construct a k-NN model you must use the <b>KNeighborsClassifier</b> class.

In [None]:
# KNN Classification: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.neighbors import KNeighborsClassifier

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=KNeighborsClassifier()

results=cross_val_score(model, X, y, cv=kfold)

print(f'KNN - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'KNN (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'KNN ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# scaling in knn is necessary ...

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "KNN"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "KNN -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "KNN 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

# Naive Bayes

In Naive Bayes class labels are represented by a vector of features and each feature is considered independent of the others (the naive part of the name comes from this assumption). Probabilities are calculated following the bayesian approach. 

In spite of its oversimplified assumptions, the algorithm works quite well in complex, real-world situations. The algorithm is particularly useful with small samples of data. 

For Naive Bayes we will use the <b>GaussianNB</b> class. 

In [None]:
# Naive Bayes: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.naive_bayes import GaussianNB

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=GaussianNB()

results=cross_val_score(model, X, y, cv=kfold)

print(f'Naive Bayes - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Naive Bayes (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Naive Bayes ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "NB"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "NB -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "NB 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

# CART - Classification and Regression Trees

CART builds a binary tree from the data, where the splits are chosen greedily by evaluating all the attributes in order to minimize a cost function (typically the Gini index or entropy).

They are the base for random forests and more sophisticated algorithms. 

For CART we will use the <b>DecisionTreeClassifier</b> class.


In [None]:
# Decision Trees: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.tree import DecisionTreeClassifier

# One seed for both the fold assignment and the tree itself.
seed=7

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=seed)

model=DecisionTreeClassifier(class_weight="balanced", random_state=seed)


results=cross_val_score(model, X, y, cv=kfold)

print(f'Decision Tree - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Decision Tree (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Decision Tree ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "DT"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "DT -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "DT 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

In [None]:
# Displaying a tree
#    you need to install graphviz (both the Python package and the Graphviz binaries)
# ! pip install graphviz

from IPython.display import HTML

from sklearn import tree
from graphviz import Source
from IPython.display import SVG, display
from ipywidgets import interactive


# Shrink rendered SVGs so the full tree fits in the output area.
# display() is called explicitly so the style is injected regardless of the
# notebook's ast_node_interactivity setting.
style = "<style>svg{width:70% !important;height:70% !important;}</style>"
display(HTML(style))

# Fit an unconstrained tree on the whole dataset, purely for visualisation.
model=DecisionTreeClassifier(class_weight="balanced", random_state=seed)
model.fit(X,y)

# Plain lists are passed for the names: some scikit-learn releases reject a
# pandas Index for feature_names.
graph=Source(tree.export_graphviz(model,
        out_file=None,
        feature_names=list(p_indians.columns[0:-1]),
        class_names=['No Diabetes','Diabetes'],
        filled=True,
        rounded=True))

# Render the DOT source to SVG (requires the Graphviz executables on PATH).
display(SVG(graph.pipe(format="svg")))


In [None]:
import os

# Location of the Graphviz executables. The default is the original author's local
# Anaconda install; set the GRAPHVIZ_BIN environment variable to point elsewhere.
graphviz_bin = os.environ.get(
    "GRAPHVIZ_BIN",
    'C:/Users/GEORGINA/Anaconda3/envs/keras/Library/bin/graphviz/',
)

# Only extend PATH when the directory really exists on this machine, and only once,
# so re-running the cell does not keep appending the same entry.
if os.path.isdir(graphviz_bin) and graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin

In [None]:
# Displaying a tree
#    you need to install graphviz
# ! pip install graphviz

from IPython.display import HTML # renders raw HTML inside the notebook

from sklearn import tree
from graphviz import Source
from IPython.display import SVG, display
from ipywidgets import interactive

seed=7

def plot_tree(crit, split, depth, min_split, min_leaf=1):
    """Fit a decision tree on the notebook-level X, y and display it as an SVG.

    Parameters
    ----------
    crit : str
        Split-quality criterion passed to DecisionTreeClassifier ("gini" or "entropy").
    split : str
        Splitter strategy ("best" or "random").
    depth : int or None
        Maximum tree depth; None leaves the depth unlimited.
    min_split : int
        Minimum number of samples a node must hold before it may be split.
    min_leaf : int, default 1
        Minimum number of samples required in each leaf.

    Returns
    -------
    DecisionTreeClassifier
        The fitted tree.
    """
    indians_tree=DecisionTreeClassifier(random_state=seed,
                criterion=crit,
                splitter=split,
                max_depth=depth,
                min_samples_split=min_split,
                min_samples_leaf=min_leaf)
    indians_tree.fit(X,y)

    # The outcome is binary, so exactly two class names are supplied. Names are
    # passed as plain lists because some scikit-learn releases reject a pandas Index.
    graph=Source(tree.export_graphviz(indians_tree,
            out_file=None,
            feature_names=list(p_indians.columns[0:-1]),
            class_names=["0","1"],
            filled=True,
            rounded=True))
    display(SVG(graph.pipe(format="svg"))) # the equivalent of 'show'

    return indians_tree

# One widget per hyper-parameter: dropdowns for the lists, sliders for the (min, max) ranges.
inter=interactive(plot_tree,
        crit=["gini","entropy"],
        split=["best","random"],
        depth=[None,1,2,3,4],
        min_split=(2,100),
        min_leaf=(1,200))

display(inter)

# Gini and entropy are both used to measure inequality: Gini maximizes inequality;
# entropy is the parameter that creates the largest distortion/variance
# (the original note is cut off at this point).

# `interactive` lets me pick among the different options.

# Support Vector Machines

Support vector machines seek the line that best separates two classes. The data instances closest to this line, which determine how well the classes are separated, are called support vectors. 

Support Vector Machines have the advantage that you can change the kernel function to use. Radial basis function is used by default, a pretty powerful one. 

You can construct a SVM model with the <b>SVC</b> class.

In [None]:
# SVM - Support Vector Machines: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.svm import SVC

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=SVC(gamma="scale")

results=cross_val_score(model, X, y, cv=kfold)

print(f'Support Vector Machines - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'SVM (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'SVM ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# the importance of scaling depends on the kernel used

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "SVM"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "SVM -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "SVM 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

# Algorithm Comparison

In [None]:
# Now let's compare them all: one box per algorithm/scaling variant, with the
# individual per-fold accuracies drawn on top.

fig, ax = plt.subplots(figsize=(15,9))

# Results are assigned so the cell does not echo Axes/Text reprs into the output.
ax = sns.boxplot(data=resall, x="Type", y="Res", ax=ax)

ax = sns.swarmplot(data=resall, x="Type", y="Res", color="royalblue", ax=ax)

_ = ax.set(title="10-fold cross-validation accuracy by algorithm and scaling",
           xlabel="Algorithm / scaling",
           ylabel="Accuracy")
# Many categories share the x axis; rotate the labels so they stay readable.
ax.tick_params(axis="x", rotation=45)

plt.show()

<b><font color="red" size=6>Mission 1</font>

a) Do the same with the Times Ranking predicting to be among the 10 best Business Schools.<br><br>
b) Try the Titanic dataset (you'll find all the info that you need in Kaggle). 
<br><br>
</b>

In [None]:
# A) Do the same with the Times Ranking predicting to be among the 10 best Business Schools.

# Import the dataset and coerce its text columns to numbers.
times = pd.read_csv('timesData.csv')

# world_rank: drop the leading "=" that marks a tie and keep the lower bound of
# ranges such as "201-225". The previous regex '=.' also consumed the first digit
# after "=", so a value such as "=44" turned into ".4" and was later counted as top 10.
rank_text = times['world_rank'].astype(str).str.lstrip('=')
times['world_rank'] = pd.to_numeric(rank_text.str.split('-').str.get(0), errors='coerce')

# Columns that are numeric apart from placeholder strings: unparseable entries become NaN.
for col in ('international', 'income'):
    times[col] = pd.to_numeric(times[col], errors='coerce')

# Strip every non-digit character before converting. The result is assigned back to the
# frame: a chained `times.col.replace(..., inplace=True)` does not reliably modify the
# frame in recent pandas, and r'\D' avoids the invalid-escape warning of '\D'.
for col in ('num_students', 'international_students'):
    digits = times[col].astype(str).str.replace(r'\D', '', regex=True)
    times[col] = pd.to_numeric(digits, errors='coerce')
times['international_students'] = times['international_students'] / 100

# female_male_ratio is split on ':' into separate numeric columns so it can be used.
ratio_parts = times['female_male_ratio'].str.split(':')
times['females'] = pd.to_numeric(ratio_parts.str.get(0), errors='coerce')
times['males'] = pd.to_numeric(ratio_parts.str.get(1), errors='coerce')
# A zero in `females` would give an infinite ratio, which the scalers cannot handle;
# turn it into NaN so the row is removed by the dropna below.
times['ratio_male_to_female'] = (times['males'] / times['females']).replace([np.inf, -np.inf], np.nan)

# total_score is dropped, then every row with a missing value is removed
# (this includes rows whose world_rank could not be parsed).
times = times.drop(columns='total_score').dropna().copy()
times.head()

# Select variables and transform to numpy arrays
feature_cols = list(times.columns[3:11]) + ['ratio_male_to_female']
X = times[feature_cols].values
times['Top10'] = times['world_rank']<11 # TOP 10
y = times['Top10'].values # watch upper/lower case in the column name (a mismatch caused an error before)

# An alternative way of defining y would be sklearn's Binarizer(threshold=10) applied to
# world_rank, followed by inverting the 0/1 result so that 1 means "top 10".


# Now we standardize our data (same variable names as above, since the question asks to do the 'same')

std_scaler=preprocessing.StandardScaler()
X_std=std_scaler.fit_transform(X)

minmax_scaler=preprocessing.MinMaxScaler()
X_minmax=minmax_scaler.fit_transform(X)

# Create the DataFrames for plotting (right now empty, but will be filled later)
resall=pd.DataFrame()
res_w1=pd.DataFrame()
res_w2=pd.DataFrame()
res_w3=pd.DataFrame()

# And now I proceed with the different algorithms, applying each one to the three different (scaled)'version' of my variables.

print()
print("---------------------------------------- LOGISTIC REGRESSION ------------------------------------------------")

# Logistic Regression: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LogisticRegression(solver="liblinear")

# Accuracy for X
results=cross_val_score(model, X, y, cv=kfold)
print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

# Accuracy for X_std
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'Logistic Regression (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

# Accuracy for X_minmax
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'Logistic Regression ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "log"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "log -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "log 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- LDA ------------------------------------------------")

# LDA - Linear Discriminant Analysis: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports (no need for KFold and so on because they are above)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LinearDiscriminantAnalysis()

results=cross_val_score(model, X, y, cv=kfold)

print(f'LDA Linear Discriminant Analysis - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'LDA (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'LDA ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "LDA"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "LDA -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "LDA 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- KNN ------------------------------------------------")
# KNN Classification: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports (no need for KFold and so on because I have imported them above in this same cell)
from sklearn.neighbors import KNeighborsClassifier

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)
model=KNeighborsClassifier()


results=cross_val_score(model, X, y, cv=kfold)

print(f'KNN - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'KNN (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'KNN ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# scaling in knn is necessary ...

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "KNN"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "KNN -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "KNN 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- NAIVE BAYES ------------------------------------------------")
# Naive Bayes: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.naive_bayes import GaussianNB

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=GaussianNB()

results=cross_val_score(model, X, y, cv=kfold)

print(f'Naive Bayes - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Naive Bayes (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Naive Bayes ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "NB"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "NB -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "NB 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- DECISION TREE ------------------------------------------------")

# Decision Trees: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.tree import DecisionTreeClassifier

# One seed for both the fold assignment and the tree itself.
seed=7

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=seed)

model=DecisionTreeClassifier(class_weight="balanced", random_state=seed)


results=cross_val_score(model, X, y, cv=kfold)

print(f'Decision Tree - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Decision Tree (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Decision Tree ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "DT"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "DT -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "DT 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)


# Displaying the interactive tree

# Imports
from IPython.display import HTML 
from sklearn import tree
from graphviz import Source
from IPython.display import SVG, display
from ipywidgets import interactive

seed=7

def plot_tree(crit, split, depth, min_split, min_leaf=1): # better placed in its own cell next time
    """Fit a decision tree on the notebook-level Times X, y and display it as an SVG.

    Parameters
    ----------
    crit : str
        Split-quality criterion passed to DecisionTreeClassifier ("gini" or "entropy").
    split : str
        Splitter strategy ("best" or "random").
    depth : int or None
        Maximum tree depth; None leaves the depth unlimited.
    min_split : int
        Minimum number of samples a node must hold before it may be split.
    min_leaf : int, default 1
        Minimum number of samples required in each leaf.

    Returns
    -------
    DecisionTreeClassifier
        The fitted tree.
    """
    times_tree=DecisionTreeClassifier(random_state=seed,
                criterion=crit, 
                splitter=split,
                max_depth=depth,
                min_samples_split=min_split,  
                min_samples_leaf=min_leaf)
    times_tree.fit(X,y)

    # X was built from columns 3..10 of `times` plus 'ratio_male_to_female', so the
    # labels are assembled the same way. The previous slice columns[3:-5] ended on
    # 'female_male_ratio', which is not the feature the model was given.
    # (Ideally the DataFrame would have been tidied up front.)
    feature_names=list(times.columns[3:11]) + ["ratio_male_to_female"]

    graph=Source(tree.export_graphviz(times_tree,
            out_file=None,
            feature_names=feature_names,
            class_names=["0","1"], # I only have two classes: top10 or not top10
            filled=True,
            rounded=True))
    display(SVG(graph.pipe(format="svg"))) 
    
    return times_tree

# One widget per hyper-parameter: dropdowns for the lists, sliders for the (min, max) ranges.
inter=interactive(plot_tree,
        crit=["gini","entropy"],
        split=["best","random"],
        depth=[None,1,2,3,4],
        min_split=(2,100),
        min_leaf=(1,200))

display(inter)

print()
print("---------------------------------------- SVM ------------------------------------------------")

# SVM - Support Vector Machines: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.svm import SVC

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=SVC(gamma="scale")

results=cross_val_score(model, X, y, cv=kfold)

print(f'Support Vector Machines - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'SVM (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'SVM ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# the importance of scaling depends on the kernel used

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "SVM"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "SVM -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "SVM 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------- ALGORITHM COMPARISON ------------------------------------------------")

# Now let's compare them all: one box per algorithm/scaling variant, with the
# individual per-fold accuracies drawn on top.

fig, ax = plt.subplots(figsize=(15,9))

# Results are assigned so the cell does not echo Axes/Text reprs into the output.
ax = sns.boxplot(data=resall, x="Type", y="Res", ax=ax)

ax = sns.swarmplot(data=resall, x="Type", y="Res", color="royalblue", ax=ax)

_ = ax.set(title="Times ranking (top 10): 10-fold cross-validation accuracy by algorithm and scaling",
           xlabel="Algorithm / scaling",
           ylabel="Accuracy")
# Many categories share the x axis; rotate the labels so they stay readable.
ax.tick_params(axis="x", rotation=45)

plt.show()



# INTERPRETATION: (in next cell:)


Based on the box plots above, I would discard the algorithms that present a low accuracy (the line in the middle of the box plot) and a high variance (the box representing the middle 50% of the accuracy results for each of the algorithms).
Therefore, I would totally discard in this case Naive Bayes first.
I would select the KNN without scaling the features because not only the accuracy is high, but also there is low variance and most of the dots (the results of the accuracy) are concentrated at the top as well.
As a second option, the SVM (having scaled the features with minmax or with the standardizer).
And as a third valid option, the logistic regression with the minmax features.

KNN y SVM funcionan con distancias euclideanas, por lo que si no haces estandarización funcionan fatal.
LDA ya hace estandarización automáticamente, por lo que de las 3 formas sale igual.
Naive Bayes funciona por distribuciones, no por distancias.

In [None]:
# B) Try the Titanic dataset (you'll find all the info that you need in Kaggle). 

# IMPORTS
    # pandas and numpy are already imported in the first cell of this notebook.

# IMPORT DATA SET
data = pd.read_csv('titanic.csv')
    # 1) Look at the raw data first, to decide which columns are not needed.
data.head()
data.describe()
    # 'Name' is not used as an input variable, and 'Sex' is recoded from strings to 0/1
    # (dummy variables would have worked as well).
data = data.drop(columns='Name').assign(
    Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1})
)
    # 2) Check that the data types are the expected ones.
data.info() # the rest of the steps rely on every column being numeric and non-null here
    # 3) Missing values: nothing to fill or drop according to data.info()
    # 4) Categorical values: none left, everything is numerical now
    # 5) Categorical values to 0 and 1: already done for the 'Sex' column
    # 6) Nothing has changed since the last .info(), so no further inspection is needed
data.head() # to see final version
# SELECT INPUT AND TARGET/OUTPUT VARIABLES (and transform to numpy array)
X = data.drop(columns='Survived').values
y = data['Survived'].values
print(X.shape, y.shape) # sanity check on the shapes, to avoid the errors hit previously

    # Now we standardize our data (so we can replicate the comparison of algorithms and with/without scaling)

std_scaler=preprocessing.StandardScaler()
X_std=std_scaler.fit_transform(X)

minmax_scaler=preprocessing.MinMaxScaler()
X_minmax=minmax_scaler.fit_transform(X)

    # Create the empty DataFrames for plotting later
resall, res_w1, res_w2, res_w3 = (pd.DataFrame() for _ in range(4))

# ALGORITHMS 
    # And now I proceed with the different algorithms, applying each one to the three different (scaled)'version' of my variables.
    # Copy-pasted this part of the code from this same Mission, part 1. Just made some adjustments in the code.

print()
print("---------------------------------------- LOGISTIC REGRESSION ------------------------------------------------")

# Logistic Regression: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LogisticRegression(solver="liblinear")

# Accuracy for X
results=cross_val_score(model, X, y, cv=kfold)
print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

# Accuracy for X_std
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'Logistic Regression (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

# Accuracy for X_minmax
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'Logistic Regression ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "log"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "log -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "log 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- LDA ------------------------------------------------")

# LDA - Linear Discriminant Analysis: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports (no need for KFold and so on because they are above)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=LinearDiscriminantAnalysis()

results=cross_val_score(model, X, y, cv=kfold)

print(f'LDA Linear Discriminant Analysis - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'LDA (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'LDA ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "LDA"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "LDA -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "LDA 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- KNN ------------------------------------------------")
# KNN Classification: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

# Imports (no need for KFold and so on because I have imported them above in this same cell)
from sklearn.neighbors import KNeighborsClassifier

# KFold. shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)
model=KNeighborsClassifier()


results=cross_val_score(model, X, y, cv=kfold)

print(f'KNN - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'KNN (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'KNN ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# scaling in knn is necessary ...

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "KNN"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "KNN -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "KNN 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- NAIVE BAYES ------------------------------------------------")
# Naive Bayes: 10-fold cross-validated accuracy on the raw,
# standardized and min-max scaled features.

from sklearn.naive_bayes import GaussianNB

# shuffle=True is needed for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is passed with shuffle=False.
kfold=KFold(n_splits=10, shuffle=True, random_state=7)

model=GaussianNB()

results=cross_val_score(model, X, y, cv=kfold)

print(f'Naive Bayes - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Naive Bayes (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Naive Bayes ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# One row per fold, tagged with the variant name; rebuilt from scratch on every run.
res_w1=pd.DataFrame({"Res": results, "Type": "NB"})
res_w2=pd.DataFrame({"Res": results_scl, "Type": "NB -1..1"})
res_w3=pd.DataFrame({"Res": results_minmax, "Type": "NB 0..1"})

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------- DECISION TREE ------------------------------------------------")

# Decision Trees

from sklearn.tree import DecisionTreeClassifier

# Seed for the estimator only; the folds below are not shuffled.
seed=7

# 10-fold cross-validation with consecutive (unshuffled) folds.
# random_state is deliberately not passed to KFold: without shuffle=True it has
# no effect on older scikit-learn and raises ValueError on recent releases.
kfold=KFold(n_splits=10)

# class_weight="balanced" reweights the classes; random_state makes the fitted
# tree reproducible between runs.
model=DecisionTreeClassifier(class_weight="balanced", random_state=seed)


# Raw features
results=cross_val_score(model, X, y, cv=kfold)

print(f'Decision Tree - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

# X_std is defined earlier in the notebook (presumably the standardized features - confirm)
results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'Decision Tree (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

# X_minmax is defined earlier in the notebook (presumably the min-max scaled features - confirm)
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'Decision Tree ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# Store the per-fold scores in the working frames (created earlier in the
# notebook) and append them to the long-format table used for the final plot.
res_w1["Res"]=results
res_w1["Type"]="DT"

res_w2["Res"]=results_scl
res_w2["Type"]="DT -1..1"

res_w3["Res"]=results_minmax
res_w3["Type"]="DT 0..1"

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)


# Displaying the interactive tree

# Imports
from IPython.display import HTML 
from sklearn import tree
from graphviz import Source
from IPython.display import SVG, display
from ipywidgets import interactive

# Seed used by plot_tree so the rendered tree is reproducible.
seed=7

def plot_tree(crit, split, depth, min_split, min_leaf=1):
    """Fit a decision tree on the full dataset and display it inline as SVG.

    The tree is trained on the notebook-level ``X`` and ``y`` (no train/test
    split), so it is meant for visual exploration only.

    Parameters
    ----------
    crit : str
        Forwarded to ``DecisionTreeClassifier`` as ``criterion``.
    split : str
        Forwarded as ``splitter``.
    depth : int or None
        Forwarded as ``max_depth``.
    min_split : int
        Forwarded as ``min_samples_split``.
    min_leaf : int, default 1
        Forwarded as ``min_samples_leaf``.

    Returns
    -------
    DecisionTreeClassifier
        The classifier fitted on ``X`` and ``y``.
    """
    data_tree=DecisionTreeClassifier(random_state=seed,
                criterion=crit,
                splitter=split,
                max_depth=depth,
                min_samples_split=min_split,
                min_samples_leaf=min_leaf)
    data_tree.fit(X,y)

    # Use the column names carried by X when it has them, since those are
    # exactly the features the tree was fitted on. Otherwise keep the original
    # slice of data.columns.
    # NOTE(review): data.columns[1:] skips the FIRST column; if the target is
    # the last column of `data`, the node labels will be shifted - confirm.
    # Converted to a plain list because some scikit-learn releases reject a
    # pandas Index for feature_names.
    feature_names=list(getattr(X, "columns", data.columns[1:]))

    graph=Source(tree.export_graphviz(data_tree,
            out_file=None,
            feature_names=feature_names,
            class_names=["0","1"], # labels shown for the two target classes
            filled=True,
            rounded=True))
    display(SVG(graph.pipe(format="svg")))

    return data_tree

# Widget specs keyed by plot_tree's parameter names: the lists offer a fixed
# set of choices, the (min, max) tuples give an integer range.
widget_specs={
    "crit": ["gini","entropy"],
    "split": ["best","random"],
    "depth": [None,1,2,3,4],
    "min_split": (2,100),
    "min_leaf": (1,200),
}

# Build the interactive control panel around plot_tree and show it.
inter=interactive(plot_tree, **widget_specs)

display(inter)

print()
print("---------------------------------------- SVM ------------------------------------------------")

# SVM - Support Vector Machines

from sklearn.svm import SVC

# 10-fold cross-validation with consecutive (unshuffled) folds.
# random_state is deliberately not passed: without shuffle=True it has no
# effect on older scikit-learn and raises ValueError on recent releases.
kfold=KFold(n_splits=10)

model=SVC(gamma="scale")

# Raw features
results=cross_val_score(model, X, y, cv=kfold)

print(f'Support Vector Machines - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

# X_std is defined earlier in the notebook (presumably the standardized features - confirm)
results_scl=cross_val_score(model, X_std, y, cv=kfold)

print(f'SVM (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:.3f}')

# X_minmax is defined earlier in the notebook (presumably the min-max scaled features - confirm)
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)

print(f'SVM ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:.3f}')

# the importance of scaling depends on the kernel used

# Store the per-fold scores in the working frames (created earlier in the
# notebook) and append them to the long-format table used for the final plot.
res_w1["Res"]=results
res_w1["Type"]="SVM"

res_w2["Res"]=results_scl
res_w2["Type"]="SVM -1..1"

res_w3["Res"]=results_minmax
res_w3["Type"]="SVM 0..1"

resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)

print()
print("---------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------- ALGORITHM COMPARISON ------------------------------------------------")

# Now let's compare them all: one box per algorithm/scaling combination, with
# the individual fold scores overlaid as points.

fig, ax = plt.subplots(figsize=(15,9))

sns.boxplot(data=resall, x="Type", y="Res", ax=ax)

sns.swarmplot(data=resall, x="Type", y="Res", color="royalblue", ax=ax)

# Label the figure so it can be read on its own, and rotate the category
# labels so the many algorithm/scaling names do not overlap.
ax.set_title("Cross-validated accuracy by algorithm and feature scaling")
ax.set_xlabel("Algorithm / scaling")
ax.set_ylabel("Accuracy per fold")
ax.tick_params(axis="x", rotation=45)

# Render explicitly so no stray object reprs are echoed by the notebook.
plt.show()



# INTERPRETATION: (in next cell:)

In order to choose an algorithm, there are two main factors that need to be taken into account:
- Typical accuracy, as high as possible. Note that the line in the middle of each box is the median of the 10 fold accuracies, not the mean; the mean accuracy is given in the printed results.
- Low variance across folds, which shows as a short box (the range from the 25th to the 75th percentile), short whiskers for the remaining points, and few outliers.

Therefore, I discard KNN and SVM without standardization.
Although KNN with MinMax scaling has the highest average accuracy, its variance is also large, so I would not opt for it.
My first choice would be SVM with scaled features (either MinMax or StandardScaler).
My second choice would be Naive Bayes, and my third LDA.