# NLP Project: Modeling

In [26]:
# general imports
import numpy as np
import pandas as pd
import itertools

# classification tools
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# local modules
import wrangle as w

#### **Goal:** To use *classification* algorithms to build a model that accurately predicts the primary coding language based on vectorized README text and features engineered from that text.

#### **Success Metric:** Accuracy (Total Correct Predictions/ Total Predictions)
- Accuracy is a good metric since we the cost/benefit of false positive predictions and false negative predictions are the same.
- The classes are split roughly in thirds so they are not terribly imbalanced.

#### **Methodology:**
- For each class(language) we will evaluate model accuracy at predicting that class(positive).
- The accuracy data for all three classes will then be averaged using a weighted average by size of class.
- The model with the best balance of highest weighted average accuracy and least dropoff in accuracy on the validation set will be selected.
- The selected model will then be assessed for production against the test dataset.

#### **Algorithms:**
- Decision Tree Classifier
- Random Forest Classifier
- K Nearest Neighbors
- Logistic Regression
- Naive Bayes

#### **Features:** the elements the model will use to make predictions
    
- Count-vectorized words
- TF/IDF vectorized words
- Total document word count


#### **Baseline Model:**

- While the classes are fairly evenly balanced, 'Python' has the most observations at 58 and will be used as the prediction for the baseline model.

#### **Algorithm 1: Random Forest Classifier:**

##### I want to test the random forest classifier for a range of parameters:
- bootstrap = True or False
- min_samples_leaf = range from 1-9
- max_depth = range from 1-9


In [44]:
def rf_classification(df, df_val, target, least_min_samples_leaf=1, 
                      most_min_samples_leaf=10, min_max_depth=1, max_max_depth=10):
    '''
    Perform random forest classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        least_min_samples_leaf (int): The low range of the minimum samples per leaf.
        most_min_samples_leaf (int): The high range of the minimum  samples per leaf.
        min_max_depth (int): The low range of the minimum depth of the tree.
        max_max_depth (int): The high range of the minimum depth of the tree.

  
    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote DecisionTree.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    rf_models = {}
    bootstrap = (True, False)
    sizes = range(least_min_samples_leaf, most_min_samples_leaf)
    depths = range(min_max_depth, max_max_depth)
    hyper_list = list(itertools.product(bootstrap, sizes, depths))
    
    for hyperparams in hyper_list:
        rf = RandomForestClassifier(bootstrap=hyperparams[0], 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=hyperparams[1],
                            n_estimators=100,
                            max_depth=hyperparams[2], 
                            random_state=9751)
        rf.fit(X_train, y_train)
        preds = rf.predict(X_train)
        accuracy_train = rf.score(X_train, y_train)
        accuracy_val = rf.score(X_val, y_val)
        rf_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in rf_models.items()])
    df['algorithm'] = 'random_forest'
    
    return df

In [45]:
def dt_classification(df, df_val, target, least_min_samples_leaf=1, 
                      most_min_samples_leaf=10, min_max_depth=1, max_max_depth=10):
    '''
    Perform decision tree classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        least_min_samples_leaf (int): The low range of the minimum samples per leaf.
        most_min_samples_leaf (int): The high range of the minimum  samples per leaf.
        min_max_depth (int): The low range of the minimum depth of the tree.
        max_max_depth (int): The high range of the minimum depth of the tree.

    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote DecisionTree.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    dt_models = {}
    sizes = range(least_min_samples_leaf, most_min_samples_leaf)
    depths = range(min_max_depth, max_max_depth)
    hyper_list = list(itertools.product(sizes, depths))
    
    for hyperparams in hyper_list:
        dt = DecisionTreeClassifier( 
                            criterion='gini',
                            min_samples_leaf=hyperparams[0],
                            max_depth=hyperparams[1], 
                            random_state=9751)
        dt.fit(X_train, y_train)
        preds = dt.predict(X_train)
        accuracy_train = dt.score(X_train, y_train)
        accuracy_val = dt.score(X_val, y_val)
        dt_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in dt_models.items()])
    df['algorithm'] = 'decision_tree'
    
    return df

In [3]:
from pydataset import data
df = data('iris')

In [15]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [19]:
from sklearn.model_selection import train_test_split

In [23]:
train, test = train_test_split(df, train_size=.7, stratify=df['Species'])

In [24]:
train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
18,5.1,3.5,1.4,0.3,setosa
137,6.3,3.4,5.6,2.4,virginica
35,4.9,3.1,1.5,0.2,setosa
45,5.1,3.8,1.9,0.4,setosa
149,6.2,3.4,5.4,2.3,virginica


In [46]:
rf_model_performance = rf_classification(train, test, 'Species')

In [48]:
rf_model_performance.head()

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
0,"(True, 1, 1)",0.972603,0.96875,random_forest
1,"(True, 1, 2)",0.958904,0.96875,random_forest
2,"(True, 1, 3)",0.986301,0.96875,random_forest
3,"(True, 1, 4)",1.0,0.96875,random_forest
4,"(True, 1, 5)",1.0,0.96875,random_forest


In [49]:
rf_model_performance.sort_values('accuracy_val', ascending=False).max()

hyperparams        (True, 9, 9)
accuracy_train              1.0
accuracy_val            0.96875
algorithm         random_forest
dtype: object

In [50]:
dt_model_performance = dt_classification(train, test, 'Species')

In [52]:
dt_model_performance.head()

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
0,"(1, 1)",0.671233,0.65625,decision_tree
1,"(1, 2)",0.958904,0.96875,decision_tree
2,"(1, 3)",0.972603,0.96875,decision_tree
3,"(1, 4)",1.0,0.96875,decision_tree
4,"(1, 5)",1.0,0.96875,decision_tree


In [53]:
dt_model_performance.sort_values('accuracy_val', ascending=False).max()

hyperparams              (9, 9)
accuracy_train              1.0
accuracy_val            0.96875
algorithm         decision_tree
dtype: object