In [1]:
#importing standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#importing libraries for modeling
from sklearn.model_selection import train_test_split
from prepare import tts
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from env import get_connection
import acquire

In [2]:
# function to get the titanic data from my acquire file

def prep_titanic():
    titan=acquire.get_titanic()
    titan.drop(columns=['passenger_id', 'embarked', 'deck', 'age', 'class'], inplace=True)
    dummy_var=pd.get_dummies(titan[['embark_town', 'sex']], drop_first=True)
    titan=pd.concat([titan, dummy_var], axis=1)
    return titan

In [3]:
#assigning the titanic data to a dataframe

df=prep_titanic()

df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.25,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,1,0
3,1,1,female,1,0,53.1,Southampton,0,0,1,0
4,0,3,male,0,0,8.05,Southampton,1,0,1,1


In [4]:
df.dtypes

survived                     int64
pclass                       int64
sex                         object
sibsp                        int64
parch                        int64
fare                       float64
embark_town                 object
alone                        int64
embark_town_Queenstown       uint8
embark_town_Southampton      uint8
sex_male                     uint8
dtype: object

In [5]:
#dropping columns that have been split into dummy variables

df = df.drop(columns=['embark_town', 'sex'])
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,1,0,7.25,0,0,1,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,1,0
3,1,1,1,0,53.1,0,0,1,0
4,0,3,0,0,8.05,1,0,1,1


In [6]:
#Adding another dummy variable for pclass and dropping the original column

df=pd.get_dummies(df, columns=['pclass'], drop_first=True)

df.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,pclass_2,pclass_3
0,0,1,0,7.25,0,0,1,1,0,1
1,1,1,0,71.2833,0,0,0,0,0,0
2,1,0,0,7.925,1,0,1,0,0,1
3,1,1,0,53.1,0,0,1,0,0,0
4,0,0,0,8.05,1,0,1,1,0,1


In [7]:
t_train, t_val, t_test = tts(df)


stratify=survived


In [8]:
#assigning the X and y train variables for modeling

X_train = t_train.drop(columns=['survived'])
y_train = t_train['survived']

X_val = t_val.drop(columns=['survived'])
y_val = t_val['survived']

X_test = t_test.drop(columns=['survived'])
y_test = t_test['survived']

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)



In [9]:
# building the model
knn = KNeighborsClassifier(n_neighbors=3)

In [10]:
#fit the model
knn.fit(X_train, y_train)

In [11]:
#making predictions using the model
y_pred= knn.predict(X_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# Evaluate your results using the model score, confusion matrix, and classification report.



In [12]:
knn.score(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8514056224899599

In [13]:
cmt=pd.DataFrame(data=confusion_matrix(y_train, y_pred), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
cmt

Unnamed: 0,pred died,pred survived
actually died,270,37
actually survived,37,154


In [58]:
traindf = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True, 
                                   target_names=['died', 'survived']))
traindf

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.879479,0.806283,0.851406,0.842881,0.851406
recall,0.879479,0.806283,0.851406,0.842881,0.851406
f1-score,0.879479,0.806283,0.851406,0.842881,0.851406
support,307.0,191.0,0.851406,498.0,498.0


# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [37]:
#assigning data results to variables

accuracy= traindf.iloc[1:]['accuracy'][0]
precision= traindf.iloc[0:2]['survived'][0]
recall= traindf.iloc[1:2]['survived'][0]
f1= traindf.iloc[2:3]['survived'][0]
support= traindf.iloc[3:4]['survived'][0]
tn, fp, fn, tp= confusion_matrix(y_train, y_pred).ravel()
neg= tn+fp
pos= fn+tp
tpr=(tp/pos).round(4)*100
fpr=(fp/pos).round(4)*100
tnr=(tn/neg).round(3)*100
fnr=(fn/neg).round(3)*100
#printing results using vairables

print(f'Accuracy is {accuracy.round(4)*100}%')
print(f'Precision is {precision.round(4)*100}%')
print(f'Recall is {recall.round(4)*100}%')
print(f'F1-score is {f1.round(4)*100}%')
print(f'Support is {support}')
print(f'True positive rate is {tpr}%')
print(f'False positive rate is {fpr}%')
print(f'True negative rate is {tnr}%')
print(f'False negative rate is {fnr}%')

Accuracy is 85.14%
Precision is 80.63%
Recall is 80.63%
F1-score is 80.63%
Support is 191.0
True positive rate is 80.63%
False positive rate is 19.37%
True negative rate is 87.9%
False negative rate is 12.1%


# Run through steps 1-3 setting k to 10



In [16]:
# setting number of nearest neighbors to 10
knn10 = KNeighborsClassifier(n_neighbors=10)

In [17]:
#fitting the new model
knn10.fit(X_train,y_train)

In [18]:
#making predictions using the new model
y_pred10= knn10.predict(X_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [19]:
#evaluating the model
knn10.score(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7991967871485943

In [20]:
#making a confusion matrix with the new model
cmt10=pd.DataFrame(data=confusion_matrix(y_train, y_pred10), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
cmt10

Unnamed: 0,pred died,pred survived
actually died,264,43
actually survived,57,134


In [21]:
#making a dataframe using the classification report
traindf10 = pd.DataFrame(classification_report(y_train, y_pred10, output_dict=True, 
                                   target_names=['died', 'survived']))
traindf10

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.82243,0.757062,0.799197,0.789746,0.797359
recall,0.859935,0.701571,0.799197,0.780753,0.799197
f1-score,0.840764,0.728261,0.799197,0.784513,0.797615
support,307.0,191.0,0.799197,498.0,498.0


In [39]:
#assigning data results to variables

accuracy10= traindf10.iloc[1:]['accuracy'][0]
precision10= traindf10.iloc[0:2]['survived'][0]
recall10= traindf10.iloc[1:2]['survived'][0]
f110= traindf10.iloc[2:3]['survived'][0]
support10= traindf10.iloc[3:4]['survived'][0]
tn10, fp10, fn10, tp10= confusion_matrix(y_train, y_pred10).ravel()
neg10= tn10+fp10
pos10= fn10+tp10

tpr10=(tp10/pos10).round(4)*100
fpr10=(fp10/pos10).round(4)*100
tnr10=(tn10/neg10).round(3)*100
fnr10=(fn10/neg10).round(3)*100

#printing results using vairables

print(f'Accuracy is {accuracy10.round(4)*100}%')
print(f'Precision is {precision10.round(4)*100}%')
print(f'Recall is {recall10.round(4)*100}%')
print(f'F1-score is {f110.round(4)*100}%')
print(f'Support is {support10}')
print(f'True positive rate is {(tp10/pos10).round(4)*100}%')
print(f'False positive rate is {(fp10/pos10).round(3)*100}%')
print(f'True negative rate is {(tn10/neg10).round(3)*100}%')
print(f'False negative rate is {(fn10/neg10).round(3)*100}%')

Accuracy is 79.92%
Precision is 75.71%
Recall is 70.16%
F1-score is 72.83%
Support is 191.0
True positive rate is 70.16%
False positive rate is 22.5%
True negative rate is 86.0%
False negative rate is 18.6%


# Run through steps 1-3 setting k to 20



In [23]:
#setting the new model to 20 nearest neighbors
knn20 = KNeighborsClassifier(n_neighbors=20)

In [24]:
#fitting the new model
knn20.fit(X_train,y_train)

In [25]:
#making predictions using the new model
y_pred20= knn20.predict(X_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [26]:
#evaluating the model
knn20.score(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7449799196787149

In [27]:
#making a confusion matrix with the new model
cmt20=pd.DataFrame(data=confusion_matrix(y_train, y_pred20), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
cmt20

Unnamed: 0,pred died,pred survived
actually died,262,45
actually survived,82,109


In [28]:
#making a dataframe using the classification report
traindf20 = pd.DataFrame(classification_report(y_train, y_pred20, output_dict=True, 
                                   target_names=['died', 'survived']))
traindf20

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.761628,0.707792,0.74498,0.73471,0.74098
recall,0.85342,0.570681,0.74498,0.71205,0.74498
f1-score,0.804916,0.631884,0.74498,0.7184,0.738552
support,307.0,191.0,0.74498,498.0,498.0


In [40]:
#assigning data results to variables

accuracy20= traindf20.iloc[1:]['accuracy'][0]
precision20= traindf20.iloc[0:2]['survived'][0]
recall20= traindf20.iloc[1:2]['survived'][0]
f120= traindf20.iloc[2:3]['survived'][0]
support20= traindf20.iloc[3:4]['survived'][0]
tn20, fp20, fn20, tp20= confusion_matrix(y_train, y_pred20).ravel()
neg20= tn20+fp20
pos20= fn20+tp20


tpr20=(tp20/pos20).round(4)*100
fpr20=(fp20/pos20).round(4)*100
tnr20=(tn20/neg20).round(3)*100
fnr20=(fn20/neg20).round(3)*100

#printing results using vairables

print(f'Accuracy is {accuracy20.round(4)*100}%')
print(f'Precision is {precision20.round(4)*100}%')
print(f'Recall is {recall20.round(4)*100}%')
print(f'F1-score is {f120.round(3)*100}%')
print(f'Support is {support20}')
print(f'True positive rate is {(tp20/pos20).round(4)*100}%')
print(f'False positive rate is {(fp20/pos20).round(4)*100}%')
print(f'True negative rate is {(tn20/neg20).round(3)*100}%')
print(f'False negative rate is {(fn20/neg20).round(1)*100}%')

Accuracy is 74.5%
Precision is 70.78%
Recall is 57.07%
F1-score is 63.2%
Support is 191.0
True positive rate is 57.07%
False positive rate is 23.56%
True negative rate is 85.3%
False negative rate is 30.0%


# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



In [43]:
pd.DataFrame({'accuracy': [accuracy, accuracy10, accuracy20], 
              'precision': [precision, precision10, precision20], 
             'recall': [recall, recall10, recall20], 
             'f1': [f1, f110, f120], 
             'support': [support, support10, support20], 
             'true_positive_rate': [tpr, tpr10, tpr20], 
             'false_positive_rate': [fpr, fpr10, fpr20], 
             'true_negative_rate': [tnr, tnr10, tnr20],
             'false_negaive_rate': [fnr, fnr10, fnr20]}, 
            index = ['3_nearest_neighbors', '10_nearest_neighbors', '20_nearest_neighbors'])

Unnamed: 0,accuracy,precision,recall,f1,support,true_positive_rate,false_positive_rate,true_negative_rate,false_negaive_rate
3_nearest_neighbors,0.851406,0.806283,0.806283,0.806283,191.0,80.63,19.37,87.9,12.1
10_nearest_neighbors,0.799197,0.757062,0.701571,0.728261,191.0,70.16,22.51,86.0,18.6
20_nearest_neighbors,0.74498,0.707792,0.570681,0.631884,191.0,57.07,23.56,85.3,26.7


In [44]:
#everything decreases as we increase the number of nearest neighbors. 
#the knn=3 is the best model for our in-samole data

# Which model performs best on our out-of-sample data from validate?



In [48]:
y_val_pred= knn.predict(X_val)
y_val_pred10= knn10.predict(X_val)
y_val_pred20= knn20.predict(X_val)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# making a confusion matrix and classification report for each model using the validate sample

In [49]:
vcmt=pd.DataFrame(data=confusion_matrix(y_val, y_val_pred), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
vcmt

Unnamed: 0,pred died,pred survived
actually died,108,24
actually survived,30,52


In [56]:
vtraindf = pd.DataFrame(classification_report(y_val, y_val_pred, output_dict=True, 
                                   target_names=['died', 'survived']))
vtraindf

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.782609,0.684211,0.747664,0.73341,0.744905
recall,0.818182,0.634146,0.747664,0.726164,0.747664
f1-score,0.8,0.658228,0.747664,0.729114,0.745676
support,132.0,82.0,0.747664,214.0,214.0


In [51]:
vcmt10=pd.DataFrame(data=confusion_matrix(y_val, y_val_pred10), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
vcmt10

Unnamed: 0,pred died,pred survived
actually died,112,20
actually survived,36,46


In [52]:
vtraindf10 = pd.DataFrame(classification_report(y_val, y_val_pred10, output_dict=True, 
                                   target_names=['died', 'survived']))
vtraindf10

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.756757,0.69697,0.738318,0.726863,0.733848
recall,0.848485,0.560976,0.738318,0.70473,0.738318
f1-score,0.8,0.621622,0.738318,0.710811,0.731649
support,132.0,82.0,0.738318,214.0,214.0


In [53]:
vcmt20=pd.DataFrame(data=confusion_matrix(y_val, y_val_pred20), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
vcmt20

Unnamed: 0,pred died,pred survived
actually died,113,19
actually survived,42,40


In [54]:
vtraindf20 = pd.DataFrame(classification_report(y_val, y_val_pred20, output_dict=True, 
                                   target_names=['died', 'survived']))
vtraindf20

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.729032,0.677966,0.714953,0.703499,0.709465
recall,0.856061,0.487805,0.714953,0.671933,0.714953
f1-score,0.787456,0.567376,0.714953,0.677416,0.703127
support,132.0,82.0,0.714953,214.0,214.0


# setting variables for each models validate results

In [57]:
#knn=3
vaccuracy= vtraindf.iloc[1:]['accuracy'][0]
vprecision= vtraindf.iloc[0:2]['survived'][0]
vrecall= vtraindf.iloc[1:2]['survived'][0]
vf1= vtraindf.iloc[2:3]['survived'][0]
vsupport= vtraindf.iloc[3:4]['survived'][0]
vtn, vfp, vfn, vtp= confusion_matrix(y_val, y_val_pred).ravel()
vneg= vtn+vfp
vpos= vfn+vtp

vtpr=(vtp/vpos).round(4)*100
vfpr=(vfp/vpos).round(4)*100
vtnr=(vtn/vneg).round(3)*100
vfnr=(vfn/vneg).round(3)*100


#knn=10
vaccuracy10= vtraindf10.iloc[1:]['accuracy'][0]
vprecision10= vtraindf10.iloc[0:2]['survived'][0]
vrecall10= vtraindf10.iloc[1:2]['survived'][0]
vf110= vtraindf10.iloc[2:3]['survived'][0]
vsupport10= vtraindf10.iloc[3:4]['survived'][0]
vtn10, vfp10, vfn10, vtp10= confusion_matrix(y_val, y_val_pred10).ravel()
vneg10= tn10+fp10
vpos10= fn10+tp10

vtpr10=(vtp10/vpos10).round(4)*100
vfpr10=(vfp10/vpos10).round(4)*100
vtnr10=(vtn10/vneg10).round(3)*100
vfnr10=(vfn10/vneg10).round(3)*100


#knn=20
vaccuracy20= vtraindf20.iloc[1:]['accuracy'][0]
vprecision20= vtraindf20.iloc[0:2]['survived'][0]
vrecall20= vtraindf20.iloc[1:2]['survived'][0]
vf120= vtraindf20.iloc[2:3]['survived'][0]
vsupport20= vtraindf20.iloc[3:4]['survived'][0]
vtn20, vfp20, vfn20, vtp20= confusion_matrix(y_val, y_val_pred20).ravel()
vneg20= vtn20+vfp20
vpos20= vfn20+vtp20


vtpr20=(vtp20/vpos20).round(4)*100
vfpr20=(vfp20/vpos20).round(4)*100
vtnr20=(vtn20/vneg20).round(3)*100
vfnr20=(vfn20/vneg20).round(3)*100


# Dataframe of results

In [59]:
pd.DataFrame({'val_accuracy': [vaccuracy, vaccuracy10, vaccuracy20], 
              'val_precision': [vprecision, vprecision10, vprecision20], 
             'val_recall': [vrecall, vrecall10, vrecall20], 
             'val_f1': [vf1, vf110, vf120], 
             'val_support': [vsupport, vsupport10, vsupport20], 
             'val_true_positive_rate': [vtpr, vtpr10, vtpr20], 
             'val_false_positive_rate': [vfpr, vfpr10, vfpr20], 
             'val_true_negative_rate': [vtnr, vtnr10, vtnr20],
             'val_false_negaive_rate': [vfnr, vfnr10, vfnr20]}, 
            index = ['3_nearest_neighbors', '10_nearest_neighbors', '20_nearest_neighbors'])

Unnamed: 0,val_accuracy,val_precision,val_recall,val_f1,val_support,val_true_positive_rate,val_false_positive_rate,val_true_negative_rate,val_false_negaive_rate
3_nearest_neighbors,0.747664,0.684211,0.634146,0.658228,82.0,63.41,29.27,81.8,22.7
10_nearest_neighbors,0.738318,0.69697,0.560976,0.621622,82.0,24.08,10.47,36.5,11.7
20_nearest_neighbors,0.714953,0.677966,0.487805,0.567376,82.0,48.78,23.17,85.6,31.8


In [60]:
#the knn=3 is still our best model for the validate sample