# Work with titanic data to do the following:

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import acquire as acq
import prepare as prep
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [60]:
titanic=acq.titanic_data()
titanic=titanic.fillna(0)
titanic.head()
titanic=prep.prep_titanic(titanic)
train,validate,test=prep.split_data(titanic,'survived')

In [3]:
baseline_accuracy=329/(329+205)
baseline_accuracy*100

61.61048689138576

In [4]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
455,455,1,3,male,29.0,0,0,7.8958,Cherbourg,1,1,0,0
380,380,1,1,female,42.0,0,0,227.525,Cherbourg,0,1,0,0
492,492,0,1,male,55.0,0,0,30.5,Southampton,1,0,0,1
55,55,1,1,male,0.0,0,0,35.5,Southampton,1,0,0,1
243,243,0,3,male,22.0,0,0,7.125,Southampton,1,0,0,1


In [61]:
x_train=train.drop(columns=['survived','passenger_id','sex','embark_town'])
y_train= train.survived
x_validate=validate.drop(columns=['survived','passenger_id','sex','embark_town'])
y_validate= validate.survived
x_test=test.drop(columns=['survived','passenger_id','sex','embark_town'])
y_test= test.survived

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [6]:
tree=RandomForestClassifier(min_samples_leaf=1,max_depth=10,random_state=123)
tree.fit(x_train,y_train)
tree.score(x_train,y_train)*100

97.19101123595506

In [7]:
y_true=y_train
y_pred=tree.predict(x_train)

In [8]:
confusion_matrix(y_true,y_pred)

array([[329,   0],
       [ 15, 190]])

In [9]:
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       329
           1       1.00      0.93      0.96       205

    accuracy                           0.97       534
   macro avg       0.98      0.96      0.97       534
weighted avg       0.97      0.97      0.97       534



In [10]:
tree.score(x_validate,y_validate)*100

80.33707865168539

In [11]:
labels=sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_true,y_pred),
            index=[str(label) +'_actual' for label in labels],
            columns=[str(label) +'_predict' for label in labels])

Unnamed: 0,0_predict,1_predict
0_actual,329,0
1_actual,15,190


In [12]:
conf=confusion_matrix(y_true,y_pred)
conf.ravel()
TN, FP, FN, TP = conf.ravel()

In [13]:
all_ = (TP + TN + FP + FN)

accuracy = (TP + TN) / all_

TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)

TNR = TN / (FP + TN)
FNR = FN / (FN + TP)

precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))

support_pos = TP + FN
support_neg = FP + TN

In [14]:
print(f"Accuracy: {accuracy}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
print(f"False Negative Rate/Miss Rate: {FNR}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy: 0.9719101123595506

True Positive Rate/Sensitivity/Recall/Power: 0.926829268292683
False Positive Rate/False Alarm Ratio/Fall-out: 0.0
True Negative Rate/Specificity/Selectivity: 1.0
False Negative Rate/Miss Rate: 0.07317073170731707

Precision/PPV: 1.0
F1 Score: 0.9620253164556963

Support (0): 205
Support (1): 329


In [63]:
score_all=[]

min_samples_leaf_values = range(1,12)
max_depth_values = range(10,0,-1)

# Iterate over the parameter combinations
for min_samples_leaf, max_depth in zip(min_samples_leaf_values, max_depth_values):
    # Define the Random Forest Classifier with the current parameters
    clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=min_samples_leaf, max_depth=max_depth,random_state=123)
    # Train the model on the training data
    clf.fit(x_train, y_train)
    
    # Evaluate the model on the validation data
    accuracy_train = clf.score(x_train, y_train)
    accuracy_val=clf.score(x_validate, y_validate)
    accuracy_test=clf.score(x_test, y_test)
    
    # Print the results
    print(f"BASELINE ACCURACY={baseline_accuracy:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_TRAIN={accuracy_train:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_VALIDATE={accuracy_val:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_TEST={accuracy_test:.2f}")

score_all.append([min_samples_leaf,max_depth,accuracy_train,accuracy_val])

BASELINE ACCURACY=0.73
min_samples_leaf=1, max_depth=10, accuracy_TRAIN=0.97
min_samples_leaf=1, max_depth=10, accuracy_VALIDATE=0.80
min_samples_leaf=1, max_depth=10, accuracy_TEST=0.78
BASELINE ACCURACY=0.73
min_samples_leaf=2, max_depth=9, accuracy_TRAIN=0.92
min_samples_leaf=2, max_depth=9, accuracy_VALIDATE=0.80
min_samples_leaf=2, max_depth=9, accuracy_TEST=0.79
BASELINE ACCURACY=0.73
min_samples_leaf=3, max_depth=8, accuracy_TRAIN=0.91
min_samples_leaf=3, max_depth=8, accuracy_VALIDATE=0.79
min_samples_leaf=3, max_depth=8, accuracy_TEST=0.79
BASELINE ACCURACY=0.73
min_samples_leaf=4, max_depth=7, accuracy_TRAIN=0.89
min_samples_leaf=4, max_depth=7, accuracy_VALIDATE=0.80
min_samples_leaf=4, max_depth=7, accuracy_TEST=0.77
BASELINE ACCURACY=0.73
min_samples_leaf=5, max_depth=6, accuracy_TRAIN=0.87
min_samples_leaf=5, max_depth=6, accuracy_VALIDATE=0.80
min_samples_leaf=5, max_depth=6, accuracy_TEST=0.78
BASELINE ACCURACY=0.73
min_samples_leaf=6, max_depth=5, accuracy_TRAIN=0.84
m

In [64]:
pd.DataFrame(score_all,columns=['min_leaf','max_depth','train','validate'])

Unnamed: 0,min_leaf,max_depth,train,validate
0,10,1,0.773408,0.769663


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?
4 through 6, there is less overfitting and less accuracy difference

# Telco Data:

In [16]:
telco=acq.get_telco_data()
telco=prep.prep_telco(telco)
train,validate,test=prep.split_data(telco,'churn')

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4225 entries, 5911 to 3586
Data columns (total 48 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   customer_id                            4225 non-null   object 
 1   gender                                 4225 non-null   object 
 2   senior_citizen                         4225 non-null   int64  
 3   partner                                4225 non-null   object 
 4   dependents                             4225 non-null   object 
 5   tenure                                 4225 non-null   int64  
 6   phone_service                          4225 non-null   object 
 7   multiple_lines                         4225 non-null   object 
 8   online_security                        4225 non-null   object 
 9   online_backup                          4225 non-null   object 
 10  device_protection                      4225 non-null   object 
 11  t

In [18]:
x_train=train.iloc[:,-27:].drop(columns=['churn_encoded'])
y_train= train.churn
x_validate=validate.iloc[:,-27:].drop(columns=['churn_encoded'])
y_validate= validate.churn
x_test=test.iloc[:,-27:].drop(columns=['churn_encoded'])
y_test= test.churn


In [19]:
train.churn.value_counts()

No     3104
Yes    1121
Name: churn, dtype: int64

In [20]:
baseline_accuracy=3104/(3104+1121)
baseline_accuracy*100

73.46745562130178

In [21]:
tree=RandomForestClassifier(min_samples_leaf=1,max_depth=10,random_state=123)
tree.fit(x_train,y_train)
tree.score(x_train,y_train)*100

83.33727810650888

In [22]:
y_true=y_train
y_pred=tree.predict(x_train)
confusion_matrix(y_true,y_pred)
labels=sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_true,y_pred),
            index=[str(label) +'_actual' for label in labels],
            columns=[str(label) +'_predict' for label in labels])

Unnamed: 0,No_predict,Yes_predict
No_actual,2875,229
Yes_actual,475,646


In [23]:
conf=confusion_matrix(y_true,y_pred)
conf.ravel()
TN, FP, FN, TP = conf.ravel()


In [24]:
all_ = (TP + TN + FP + FN)

accuracy = (TP + TN) / all_

TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)

TNR = TN / (FP + TN)
FNR = FN / (FN + TP)

precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))

support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
print(f"False Negative Rate/Miss Rate: {FNR}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy: 0.8333727810650887

True Positive Rate/Sensitivity/Recall/Power: 0.576271186440678
False Positive Rate/False Alarm Ratio/Fall-out: 0.07377577319587629
True Negative Rate/Specificity/Selectivity: 0.9262242268041238
False Negative Rate/Miss Rate: 0.423728813559322

Precision/PPV: 0.7382857142857143
F1 Score: 0.6472945891783568

Support (0): 1121
Support (1): 3104


In [25]:
min_samples_leaf_values = range(1,21)
max_depth_values = range(20,0,-1)

# Iterate over the parameter combinations
for min_samples_leaf, max_depth in zip(min_samples_leaf_values, max_depth_values):
    # Define the Random Forest Classifier with the current parameters
    clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=min_samples_leaf, max_depth=max_depth,)
    
    # Train the model on the training data
    clf.fit(x_train, y_train)
    
    # Evaluate the model on the validation data
    accuracy_train = clf.score(x_train, y_train)
    accuracy_val=clf.score(x_validate, y_validate)
    accuracy_test=clf.score(x_test, y_test)
    
    # Print the results
    print(f"BASELINE ACCURACY={baseline_accuracy:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_TRAIN={accuracy_train:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_VALIDATE={accuracy_val:.2f}")
    print(f"min_samples_leaf={min_samples_leaf}, max_depth={max_depth}, accuracy_TEST={accuracy_test:.2f}")

BASELINE ACCURACY=0.73
min_samples_leaf=1, max_depth=20, accuracy_TRAIN=0.92
min_samples_leaf=1, max_depth=20, accuracy_VALIDATE=0.75
min_samples_leaf=1, max_depth=20, accuracy_TEST=0.76
BASELINE ACCURACY=0.73
min_samples_leaf=2, max_depth=19, accuracy_TRAIN=0.85
min_samples_leaf=2, max_depth=19, accuracy_VALIDATE=0.78
min_samples_leaf=2, max_depth=19, accuracy_TEST=0.79
BASELINE ACCURACY=0.73
min_samples_leaf=3, max_depth=18, accuracy_TRAIN=0.83
min_samples_leaf=3, max_depth=18, accuracy_VALIDATE=0.79
min_samples_leaf=3, max_depth=18, accuracy_TEST=0.78
BASELINE ACCURACY=0.73
min_samples_leaf=4, max_depth=17, accuracy_TRAIN=0.82
min_samples_leaf=4, max_depth=17, accuracy_VALIDATE=0.78
min_samples_leaf=4, max_depth=17, accuracy_TEST=0.78
BASELINE ACCURACY=0.73
min_samples_leaf=5, max_depth=16, accuracy_TRAIN=0.81
min_samples_leaf=5, max_depth=16, accuracy_VALIDATE=0.78
min_samples_leaf=5, max_depth=16, accuracy_TEST=0.78
BASELINE ACCURACY=0.73
min_samples_leaf=6, max_depth=15, accuracy