In [1]:
#importing standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#importing libraries for modeling
from sklearn.model_selection import train_test_split
from prepare import tts
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from env import get_connection
import acquire

In [2]:
# function to get the titanic data from my acquire file

def prep_titanic():
    titan=acquire.get_titanic()
    titan.drop(columns=['passenger_id', 'embarked', 'deck', 'age', 'class'], inplace=True)
    dummy_var=pd.get_dummies(titan[['embark_town', 'sex']], drop_first=True)
    titan=pd.concat([titan, dummy_var], axis=1)
    return titan

In [3]:
#assigning the titanic data to a dataframe

df=prep_titanic()

df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.25,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,1,0
3,1,1,female,1,0,53.1,Southampton,0,0,1,0
4,0,3,male,0,0,8.05,Southampton,1,0,1,1


In [4]:
df.dtypes

survived                     int64
pclass                       int64
sex                         object
sibsp                        int64
parch                        int64
fare                       float64
embark_town                 object
alone                        int64
embark_town_Queenstown       uint8
embark_town_Southampton      uint8
sex_male                     uint8
dtype: object

In [5]:
#dropping columns that have been split into dummy variables

df = df.drop(columns=['embark_town', 'sex'])
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,1,0,7.25,0,0,1,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,1,0
3,1,1,1,0,53.1,0,0,1,0
4,0,3,0,0,8.05,1,0,1,1


In [6]:
#Adding another dummy variable for pclass and dropping the original column

df=pd.get_dummies(df, columns=['pclass'], drop_first=True)

df.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,pclass_2,pclass_3
0,0,1,0,7.25,0,0,1,1,0,1
1,1,1,0,71.2833,0,0,0,0,0,0
2,1,0,0,7.925,1,0,1,0,0,1
3,1,1,0,53.1,0,0,1,0,0,0
4,0,0,0,8.05,1,0,1,1,0,1


In [7]:
t_train, t_val, t_test = tts(df)


stratify=survived


In [8]:
#assigning the X and y train variables for modeling

X_train = t_train.drop(columns=['survived'])
y_train = t_train['survived']

X_val = t_val.drop(columns=['survived'])
y_val = t_val['survived']

X_test = t_test.drop(columns=['survived'])
y_test = t_test['survived']

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)



In [17]:
# building the model
knn = KNeighborsClassifier(n_neighbors=3)

In [18]:
#fit the model
knn.fit(X_train, y_train)

In [19]:
y_pred= knn.predict(X_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# Evaluate your results using the model score, confusion matrix, and classification report.



In [20]:
knn.score(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8514056224899599

In [28]:
cmt=pd.DataFrame(data=confusion_matrix(y_train, y_pred), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
cmt

Unnamed: 0,pred died,pred survived
actually died,270,37
actually survived,37,154


In [27]:
traindf = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True, 
                                   target_names=['died', 'survived']))
traindf

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.879479,0.806283,0.851406,0.842881,0.851406
recall,0.879479,0.806283,0.851406,0.842881,0.851406
f1-score,0.879479,0.806283,0.851406,0.842881,0.851406
support,307.0,191.0,0.851406,498.0,498.0


# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [31]:
#assigning data results to variables

accuracy= traindf.iloc[1:]['accuracy'][0]
precision= traindf.iloc[0:2]['survived'][0]
recall= traindf.iloc[1:2]['survived'][0]
f1= traindf.iloc[2:3]['survived'][0]
support= traindf.iloc[3:4]['survived'][0]
tn, fp, fn, tp= confusion_matrix(y_train, y_pred).ravel()
neg= tn+fp
pos= fn+tp

#printing results using vairables

print(f'Accuracy is {accuracy.round(4)*100}%')
print(f'Precision is {precision.round(4)*100}%')
print(f'Recall is {recall.round(4)*100}%')
print(f'F1-score is {f1.round(4)*100}%')
print(f'Support is {support}')
print(f'True positive rate is {(tp/pos).round(4)*100}%')
print(f'False positive rate is {(fp/pos).round(4)*100}%')
print(f'True negative rate is {(tn/neg).round(3)*100}%')
print(f'False negative rate is {(fn/neg).round(3)*100}%')

Accuracy is 85.14%
Precision is 80.63%
Recall is 80.63%
F1-score is 80.63%
Support is 191.0
True positive rate is 80.63%
False positive rate is 19.37%
True negative rate is 87.9%
False negative rate is 12.1%


# Run through steps 1-3 setting k to 10



# Run through steps 1-3 setting k to 20



# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



# Which model performs best on our out-of-sample data from validate?

