# <center>Decision Tree Exercises</center>

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Run through steps 2-4 using a different max_depth value.

Which model performs better on your in-sample data?

Which model performs best on your out-of-sample data, the validate set?

Work through these same exercises using the Telco dataset.
Experiment with this model on other datasets with a higher number of output classes.

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [2]:
import pandas as pd
import acquire, prepare

titanic_df = acquire.get_titanic_data()
titanic_df.drop(columns=['Unnamed: 0'], inplace=True)

### What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [25]:
def confusion_table(df: pd.DataFrame) -> str:
    '''Takes DataFrame and prints a formatted Confusion Table/Matrix in
    markdown for Juypter notebooks. The first column must be the actual values and all
    the other columns have to be model values or predicted values.
    
    Parameters
    ----------
    
    df : pandas DataFrame
        Requires the 'actual' values to be the first column 
        and all other columns to be the predicted values.
        
    Returns
    -------
    str 
        string that is formatted with HTML and markdown
        for Juypter Notebooks so that it can be copied and pasted into a 
        markdown cell and easier to view the values.
        
    '''
    result = str()
    table_names = str()
    tables = str()
    actual = df.columns[0]
    col_names = [str(col) for col in df.columns if col != actual]
    for col in col_names:
        table_names += f'<th><center>{str(col.capitalize())}</center></th>'
    for col in col_names:
        
        # Crosstab the model row vs the actual values
        val = pd.crosstab(df[col], df[actual], rownames=['Pred'], colnames=['Actual']).reset_index()
        
        # Generate report values, precision, recall, accuracy
        report = pd.DataFrame(classification_report(df[actual], df[col], output_dict=True))
        
        # Get all the uniques in a list
        uniques = [str(col) for col in val.columns if col not in ['Pred']]
        
        # Make a line break in table for Accuracy
        accuracy_row = ['Accuracy']
        accuracy_row.extend(['-----' for n in range(len(uniques))])
        accuracy_row[-1] = report.accuracy[0] * 100
        
        # Ensure all columns names are strings
        val = val.rename(columns=lambda x: str(x))
        
        # Create a divider of len n
        divider = ['-----' for n in range(len(uniques)+1)]
        val.loc[len(val.index)] = divider
        # Input the accuracy
        val.loc[len(val.index)] = accuracy_row
        val.loc[len(val.index)] = divider
        
        for unique in uniques:
            # Iterate through all uniques and fetch their precision and 
            # Recall values to put into the table.
            precision = report[str(unique)][0] * 100
            recall = report[str(unique)][1] * 100
            df2 = [{'Pred': 'Precision', unique: precision},
                  {'Pred': 'Recall', unique: recall}]
            
            # Add the values to the bottom of the table
            val = val.append(df2, ignore_index=True)
        
        # Collapse the index under Pred to have the table smaller
        new_df = val.set_index('Pred')
        # Put the table to markdown
        tab = new_df.to_markdown()
        
        
        tables += f'<td>\n\n{tab}\n\n</td>\n\n'

    result += f'''<table>
    <tr>{table_names}</tr>
    <tr>{tables}</tr></table>'''

    return result

In [26]:
# from data_evaluation import confusion_table
from IPython.display import Markdown
train, validate, test = acquire.train_validate_test_split(titanic_df, target='survived')


# Create X & y version of train, validate, test where y is just the
# Series with the target variable and X is all of the features

train['baseline'] = 1

X_train= train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

ipt = train[['survived', 'baseline']]

ipt
pd.DataFrame(classification_report(train.survived, train.baseline, output_dict=True))
Markdown(confusion_table(ipt))

# pd.crosstab(train.baseline, train.survived, rownames=['Pred'], colnames=['Actual'])


<table>
    <tr><th><center>Baseline</center></th></tr>
    <tr><td>

| Pred      | 0     | 1                 |
|:----------|:------|:------------------|
| 1         | 307   | 191               |
| -----     | ----- | -----             |
| Accuracy  | ----- | 38.35341365461847 |
| -----     | ----- | -----             |
| Precision | 0.0   | nan               |
| Recall    | 0.0   | nan               |
| Precision | nan   | 38.35341365461847 |
| Recall    | nan   | 100.0             |

</td>

</tr></table>