# NLP Project: Modeling

In [86]:
# general imports
import numpy as np
import pandas as pd
import itertools

# classification tools
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB, MultinomialNB

# local modules
import wrangle as w

# warnings
import warnings
warnings.filterwarnings('ignore')

#### **Goal:** To use *classification* algorithms to build a model that accurately predicts the primary coding language based on vectorized README text and features engineered from that text.

#### **Success Metric:** Accuracy (Total Correct Predictions/ Total Predictions)
- Accuracy is a good metric since we the cost/benefit of false positive predictions and false negative predictions are the same.
- The classes are split roughly in thirds so they are not terribly imbalanced.

#### **Methodology:**
- For each class(language) we will evaluate model accuracy at predicting that class(positive).
- The accuracy data for all three classes will then be averaged using a weighted average by size of class.
- The model with the best balance of highest weighted average accuracy and least dropoff in accuracy on the validation set will be selected.
- The selected model will then be assessed for production against the test dataset.

#### **Algorithms:**
- Decision Tree Classifier
- Random Forest Classifier
- K Nearest Neighbors
- Logistic Regression
- Naive Bayes

#### **Features:** the elements the model will use to make predictions
    
- Count-vectorized words
- TF/IDF vectorized words
- Total document word count


#### **Baseline Model:**

- While the classes are fairly evenly balanced, 'Python' has the most observations at 58 and will be used as the prediction for the baseline model.

#### **Algorithm 1: Random Forest Classifier:**

##### I want to test the random forest classifier for a range of hyperparameters:
- bootstrap = True or False
- min_samples_leaf = range from 1-9
- max_depth = range from 1-9


In [55]:
def rf_classification(df, df_val, target, least_min_samples_leaf=1, 
                      most_min_samples_leaf=10, min_max_depth=1, max_max_depth=10):
    '''
    Perform random forest classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        least_min_samples_leaf (int): The low range of the minimum samples per leaf.
        most_min_samples_leaf (int): The high range of the minimum  samples per leaf.
        min_max_depth (int): The low range of the minimum depth of the tree.
        max_max_depth (int): The high range of the minimum depth of the tree.

  
    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote random_forest.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    rf_models = {}
    bootstrap = (True, False)
    sizes = range(least_min_samples_leaf, most_min_samples_leaf)
    depths = range(min_max_depth, max_max_depth)
    hyper_list = list(itertools.product(bootstrap, sizes, depths))
    
    for hyperparams in hyper_list:
        rf = RandomForestClassifier(bootstrap=hyperparams[0], 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=hyperparams[1],
                            n_estimators=100,
                            max_depth=hyperparams[2], 
                            random_state=9751)
        rf.fit(X_train, y_train)
        preds = rf.predict(X_train)
        accuracy_train = rf.score(X_train, y_train)
        accuracy_val = rf.score(X_val, y_val)
        rf_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in rf_models.items()])
    df['algorithm'] = 'random_forest'
    
    return df

#### **Algorithm 2: Decision Tree Classifier:**

##### I want to test the decision tree classifier for a range of hyperparameters:
- min_samples_leaf = range from 1-9
- max_depth = range from 1-9


In [94]:
def dt_classification(df, df_val, target, least_min_samples_leaf=1, 
                      most_min_samples_leaf=10, min_max_depth=1, max_max_depth=10):
    '''
    Perform decision tree classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        least_min_samples_leaf (int): The low range of the minimum samples per leaf.
        most_min_samples_leaf (int): The high range of the minimum  samples per leaf.
        min_max_depth (int): The low range of the minimum depth of the tree.
        max_max_depth (int): The high range of the minimum depth of the tree.

    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote decision_tree.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    dt_models = {}
    sizes = range(least_min_samples_leaf, (most_min_samples_leaf +1))
    depths = range(min_max_depth, (max_max_depth + 1))
    hyper_list = list(itertools.product(sizes, depths))
    
    for hyperparams in hyper_list:
        dt = DecisionTreeClassifier( 
                            criterion='gini',
                            min_samples_leaf=hyperparams[0],
                            max_depth=hyperparams[1], 
                            random_state=9751)
        dt.fit(X_train, y_train)
        preds = dt.predict(X_train)
        accuracy_train = dt.score(X_train, y_train)
        accuracy_val = dt.score(X_val, y_val)
        dt_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in dt_models.items()])
    df['algorithm'] = 'decision_tree'
    
    return df

#### **Algorithm 3: K-Nearest Neighbors(KNN) Classifier:**

##### I want to test the KNN classifier for a range of hyperparameters:
- n_neighbors = range from 2 - 20 step of 2


In [90]:
def knn_classification(df, df_val, target, min_n_neighbors=2, max_n_neighbors=20):
    '''
    Perform KNN classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        min_n_neighbors: The minimum number of neighbors to test
        max_n_neighbors: The maximum number of neighbors to test
    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote knn.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    knn_models = {}
    hyper_list = range(min_n_neighbors, (max_n_neighbors +1), 2)
    
    for hyperparams in hyper_list:
        knn = KNeighborsClassifier(n_neighbors=hyperparams)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_train)
        accuracy_train = knn.score(X_train, y_train)
        accuracy_val = knn.score(X_val, y_val)
        knn_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in knn_models.items()])
    df['algorithm'] = 'knn'
    
    return df

#### **Algorithm 4: Naive Bayes Classifier:**

##### I want to test the Naive Bayes classifier for a range of hyperparameters:
- alpha = range from 0.1 to 5.0


In [111]:
def nb_classification(df, df_val, target, min_alpha=0.1, max_alpha=5.0):
    '''
    Perform Naive Bayes classification on the given data.

    Args:
        df (DataFrame): The training DataFrame containing the feature columns and the 
        target column.
        df_val (DataFrame): The validation (or test) DataFrame containing the feature 
        columns and the target column.
        target (str): The name of the target column.
        min_alpha: The minimum alpha to test
        max_alpha: The maximum alpha to test
    Returns:
        DataFrame: A DataFrame containing the combinations of hyperparameters and their 
        corresponding accuracy scores on the training and validation data. An 'algorithm' 
        column is added to denote naive_bayes.
    '''
    X_train = df.drop(columns=target)
    y_train = df[target]
    X_val = df_val.drop(columns=target)
    y_val = df_val[target]
    
    nb_models = {}
    hyperparams = 'default'
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    preds = nb.predict(X_train)
    accuracy_train = nb.score(X_train, y_train)
    accuracy_val = nb.score(X_val, y_val)
    nb_models[hyperparams] = accuracy_train, accuracy_val
   
    df = pd.DataFrame([{'hyperparams': k, 'accuracy_train': v[0],
                          'accuracy_val': v[1]} for k, v in nb_models.items()])
    df['algorithm'] = 'Multinomial_naive_bayes'
    
    return df

In [64]:
from pydataset import data
df = data('iris')

In [65]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [66]:
from sklearn.model_selection import train_test_split

In [67]:
train, test = train_test_split(df, train_size=.7, stratify=df['Species'])

In [68]:
train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
79,6.0,2.9,4.5,1.5,versicolor
8,5.0,3.4,1.5,0.2,setosa
111,6.5,3.2,5.1,2.0,virginica
4,4.6,3.1,1.5,0.2,setosa
123,7.7,2.8,6.7,2.0,virginica


In [69]:
rf_model_performance = rf_classification(train, test, 'Species')

In [96]:
rf_model_performance.sort_values('accuracy_val', ascending = False).head(10)

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
92,"(False, 2, 3)",0.980952,0.977778,random_forest
116,"(False, 4, 9)",0.980952,0.977778,random_forest
94,"(False, 2, 5)",0.980952,0.977778,random_forest
95,"(False, 2, 6)",0.980952,0.977778,random_forest
96,"(False, 2, 7)",0.980952,0.977778,random_forest
97,"(False, 2, 8)",0.980952,0.977778,random_forest
98,"(False, 2, 9)",0.980952,0.977778,random_forest
101,"(False, 3, 3)",0.971429,0.977778,random_forest
102,"(False, 3, 4)",0.971429,0.977778,random_forest
103,"(False, 3, 5)",0.971429,0.977778,random_forest


In [71]:
rf_model_performance.sort_values('accuracy_val', ascending=False).max()

hyperparams        (True, 9, 9)
accuracy_train              1.0
accuracy_val           0.977778
algorithm         random_forest
dtype: object

In [72]:
dt_model_performance = dt_classification(train, test, 'Species')

In [73]:
dt_model_performance.head()

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
0,"(1, 1)",0.666667,0.666667,decision_tree
1,"(1, 2)",0.961905,0.955556,decision_tree
2,"(1, 3)",0.980952,0.977778,decision_tree
3,"(1, 4)",0.990476,0.955556,decision_tree
4,"(1, 5)",1.0,0.955556,decision_tree


In [74]:
dt_model_performance.sort_values('accuracy_val', ascending=False).max()

hyperparams              (9, 9)
accuracy_train              1.0
accuracy_val           0.977778
algorithm         decision_tree
dtype: object

In [91]:
knn_model_performance = knn_classification(train, test, 'Species')

In [93]:
knn_model_performance.sort_values('accuracy_val', ascending=False).head(10)

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
1,4,0.971429,0.977778,knn
3,8,0.971429,0.977778,knn
4,10,0.971429,0.977778,knn
5,12,0.971429,0.977778,knn
8,18,0.971429,0.977778,knn
0,2,0.971429,0.955556,knn
2,6,0.980952,0.955556,knn
6,14,0.971429,0.955556,knn
7,16,0.971429,0.955556,knn
9,20,0.971429,0.955556,knn


In [112]:
nb_model_performance = nb_classification(train, test, 'Species')

In [113]:
nb_model_performance

Unnamed: 0,hyperparams,accuracy_train,accuracy_val,algorithm
0,default,0.952381,0.955556,naive_bayes


In [None]:
# ComplementNB(*, alpha=1.0)

In [None]:
# MultinomialNB(*, alpha=1.0, force_alpha='warn', fit_prior=True, class_prior=None)