In [41]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from prepare_notebook import prep_titanic
from prepare_notebook import prep_titanic_unscaled_age
from prepare_notebook import scale_titanic
from prepare_notebook import prep_iris
from acquire import get_iris_data
from acquire import get_titanic_data

In [42]:
iris_df=prep_iris(get_iris_data())
y=iris_df[['species']]
X=iris_df[['petal_length','petal_width','sepal_length','sepal_width']]
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=.7,random_state=123)

#Random Forest

#1) Fit the Random Forest classifier to your training sample and transform(i.e. make predictions on the sample training)
#setting the random_state accordingly and setting min_samples_leaf=1 and max_depth=20.
rf=RandomForestClassifier(min_samples_leaf=1,max_depth=20)
rf.fit(X_train,y_train)
pred_default=rf.predict(X_train)

In [43]:
#2)Evaluate your in-sample results using the model score,confusion matrix,and classification report.
labels=sorted(list(iris_df.species_name.unique()))
print(rf.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,pred_default),index=labels,columns=labels))
print(classification_report(y_train,pred_default))

1.0
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          40          0
virginica        0           0         33
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [44]:
#3) Print and clearly label the following:Accuracy,true positive rate,false positive rate,true negative rate,false negative rate,
# false negative rate,precision,recall,f1-score,and support. 
confusion_matrix1=pd.DataFrame(confusion_matrix(y_train,pred_default),index=labels,columns=labels)
FP = confusion_matrix1.sum(axis=0) - np.diag(confusion_matrix1)  
FN = confusion_matrix1.sum(axis=1) - np.diag(confusion_matrix1)
TP = np.diag(confusion_matrix1)
TN = confusion_matrix1.values.sum() - (FP + FN + TP)

#True positive rate/recall
TPR = TP/(TP+FN)
# True negative rate
TNR = TN/(TN+FP) 
# Precision
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# False positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

# F1-score
f1_score=2*TP/(2*TP+FP+FN)

#support
support=len(y_train)

In [45]:
#4)Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3
rf_shallow=RandomForestClassifier(min_samples_leaf=5,max_depth=3)
rf_shallow.fit(X_train,y_train)
pred_shallow=rf.predict(X_train)

print(rf.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,pred_default),index=labels,columns=labels))
print(classification_report(y_train,pred_default))

1.0
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          40          0
virginica        0           0         33
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [46]:
#5) What are the differences in the evaluation metrics? Which performs better on your in-sample data?
#Why?

#6) Save the best model in forest_fit


## KNN

In [47]:
#1)Fit the K-Nearest Neighbors classifier to your training sample and transform
#(make predictions on the training sample)
knn=KNeighborsClassifier()
knn.fit(X_train,y_train)
pred_default2=knn.predict(X_train)

In [48]:
#2)Evaluate your in-sample results using the model score,confusion matrix,and classification report.
print(knn.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,pred_default2),index=labels,columns=labels))
print(classification_report(y_train,pred_default2))

0.9809523809523809
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           1         32
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.97      0.97      0.97        40
           2       0.97      0.97      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



In [49]:
#3) Print and clearly label the following:Accuracy,true positive rate,false positive rate,true negative rate,false negative rate,
# false negative rate,precision,recall,f1-score,and support. 
#3) Print and clearly label the following:Accuracy,true positive rate,false positive rate,true negative rate,false negative rate,
# false negative rate,precision,recall,f1-score,and support. 
confusion_matrix2=pd.DataFrame(confusion_matrix(y_train,pred_default2),index=labels,columns=labels)
FP = confusion_matrix2.sum(axis=0) - np.diag(confusion_matrix2)  
FN = confusion_matrix2.sum(axis=1) - np.diag(confusion_matrix2)
TP = np.diag(confusion_matrix2)
TN = confusion_matrix2.values.sum() - (FP + FN + TP)

#True positive rate/recall
TPR = TP/(TP+FN)
# True negative rate
TNR = TN/(TN+FP) 
# Precision
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# False positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

# F1-score
f1_score=2*TP/(2*TP+FP+FN)

#support
support=len(y_train)

In [50]:
#4)Run through steps 2-4 setting k=10
knn_10=KNeighborsClassifier(n_neighbors=10)
knn_10.fit(X_train,y_train)
pred_10=knn_10.predict(X_train)


print(knn_10.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,pred_10),index=labels,columns=labels))
print(classification_report(y_train,pred_10))

0.9714285714285714
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           2         31
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.95      0.97      0.96        40
           2       0.97      0.94      0.95        33

   micro avg       0.97      0.97      0.97       105
   macro avg       0.97      0.97      0.97       105
weighted avg       0.97      0.97      0.97       105



In [51]:
#5)Run through steps 2-4 setting k=20
knn_20=KNeighborsClassifier(n_neighbors=20)
knn_20.fit(X_train,y_train)
pred_20=knn_20.predict(X_train)

print(knn_20.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,pred_20),index=labels,columns=labels))
print(classification_report(y_train,pred_20))

0.9619047619047619
            setosa  versicolor  virginica
setosa          32           0          0
versicolor       0          39          1
virginica        0           3         30
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       0.93      0.97      0.95        40
           2       0.97      0.91      0.94        33

   micro avg       0.96      0.96      0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



In [52]:
#6) What are the differences in the evaluation metrics? Which performs better on your in-sample
#data? Why?

#7) Save the best model in `knn_fit`

In [89]:
#Feature Engineering
#Titanic Data
#Create a feature named who, this should be either man, woman, or child. How does including this feature affect your model's performance?
titanic_df=prep_titanic_unscaled_age(get_titanic_data())
titanic_df.dropna(inplace=True)
titanic_df['who'] = np.where(titanic_df['age']<18, 'child', np.where(titanic_df['sex']=='male','man','woman'))
titanic_df['adult_male']=np.where((titanic_df['age']>=18) & (titanic_df['sex']=='male') ,1,0)


#encode the values in who
encoder_titanic=LabelEncoder()
encoder_titanic.fit(titanic_df.who)
titanic_df['who']=encoder_titanic.transform(titanic_df.who)

#scale age and fare columns and drop some `sex`,`embark_town`,`class`,and target
titanic_df=scale_titanic(titanic_df)
y=titanic_df[['survived']]
X=titanic_df.drop(columns=['survived','sex','embark_town','class','passenger_id','adult_male'])
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=.7,random_state=123)

knn_who=KNeighborsClassifier()
knn_who.fit(X_train,y_train)
who_predictions=knn_who.predict(X_train)
labels=['survived','died']
print(knn_who.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,who_predictions),index=labels,columns=labels))
print(classification_report(y_train,who_predictions))

0.843687374749499
          survived  died
survived       266    27
died            51   155
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       293
           1       0.85      0.75      0.80       206

   micro avg       0.84      0.84      0.84       499
   macro avg       0.85      0.83      0.84       499
weighted avg       0.84      0.84      0.84       499



In [90]:
#Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?
#titanic_df['adult_male']=np.where((titanic_df['age']>=18) & (titanic_df['sex']=='male') ,1,0)
y=titanic_df[['survived']]
X=titanic_df.drop(columns=['survived','sex','embark_town','class','passenger_id','who'])
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=.7,random_state=123)
knn_male=KNeighborsClassifier()
knn_male.fit(X_train,y_train)
male_predictions=knn_who.predict(X_train)
labels=['survived','died']
print(knn_who.score(X_train,y_train))
print(pd.DataFrame(confusion_matrix(y_train,male_predictions),index=labels,columns=labels))
print(classification_report(y_train,male_predictions))

0.7615230460921844
          survived  died
survived       273    20
died            99   107
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       293
           1       0.84      0.52      0.64       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.79      0.73      0.73       499
weighted avg       0.78      0.76      0.75       499



In [None]:
#Iris Data
#Create features named petal_area and sepal_area.
iris_df['petal_area']=iris_df['petal_length']*iris_df['petal_width']
iris_df['sepal_area']=iris_df['sepal_length']*iris_df['sepal_width']
