In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import re
from nltk import FreqDist


from wordcloud import WordCloud
import ast # used for converting column values to lists post-import from csv

from nltk import FreqDist
from nltk.corpus import stopwords

pd.set_option("display.max_columns", None)



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore') # silences all warnings globally; errors are still raised



In [2]:
# creating a list of the nltk's English-language stopwords
stop_words = stopwords.words('english')
# preview the first ten entries
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [3]:
# a small function to quickly remove stopwords from the 'step_tokens' column 
def remove_stop_words(count, stop_words):
    '''Return `count` without the entries whose index label is a stop word.

    Inputs:
    -count - pandas Series (or DataFrame) indexed by token, e.g. token frequencies
    -stop_words - iterable of tokens to remove

    Returns - a new object containing only the rows whose label is not in `stop_words`;
    the input is left untouched.
    '''
    # One vectorized membership test instead of one drop() (and one full copy) per stop word;
    # this also copes with index labels that occur more than once.
    keep = ~count.index.isin(list(stop_words))
    return count[keep]

In [4]:
def _positive_class_scores(estimator, X):
    '''Return continuous positive-class scores from a fitted binary estimator.

    Prefers predict_proba (second column), then decision_function, and only falls back to
    hard predict() labels when the estimator offers neither.
    '''
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(X)[:, 1]
    if hasattr(estimator, 'decision_function'):
        return estimator.decision_function(X)
    return estimator.predict(X)


def _print_split_scores(label, y_true, preds, scores):
    '''Print accuracy, precision, recall, f1 and ROC-AUC for one data split.'''
    print(f"{label} accuracy: {accuracy_score(y_true, preds)}")
    print(f"{label} precision: {precision_score(y_true, preds)}")
    print(f"**{label} recall: {recall_score(y_true, preds)}")
    print(f"{label} f1 score: {f1_score(y_true, preds)}")
    print(f"{label} roc_auc: {roc_auc_score(y_true, scores)}")


def evaluate(estimator, X_tr, X_te, y_tr, y_te, cv=5):
    '''
    Cross-validates a binary classifier on the training data, refits it on the full training
    set, and prints accuracy, precision, recall, f1 and ROC-AUC for the cross-validation,
    the training split and the test split, together with a test-set confusion matrix plot.

    Adapted from the Phase 3 Project:
    https://github.com/Nindorph/TanzanianWaterWells/blob/main/Modeling_Final.ipynb
    and Lindsey Berlin's evaluate function found at:
    https://github.com/lindseyberlin/Cat-in-the-Dat-Project/blob/main/notebooks/Lindsey/EDA-Initial-Models.ipynb
    ------------------------------------------------------------------------------------------
    Inputs:
    -estimator - unfitted estimator object; it is fitted in place on (X_tr, y_tr)
    -X_tr - training features
    -X_te - test features
    -y_tr - training target (a single-column dataframe or a 1-D array-like)
    -y_te - test target (a single-column dataframe or a 1-D array-like)
    -cv - cross-validation splitting strategy, forwarded to sklearn's cross_validate:
            - None, to use the default 5-fold cross validation,
            - int, to specify the number of folds in a (Stratified)KFold,
            - CV splitter,
            - An iterable yielding (train, test) splits as arrays of indices

    Returns - nothing is returned; all results are printed / plotted
    '''
    # targets loaded with read_csv arrive as single-column dataframes; flatten them to 1-D
    y_tr = np.ravel(y_tr)
    y_te = np.ravel(y_te)

    output = cross_validate(estimator, X_tr, y_tr, cv=cv,
                            scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

    # mean +/- standard deviation of every metric across the cross-validation folds
    cv_metrics = [
        ('Average accuracy', 'test_accuracy'),
        ('Average precision', 'test_precision'),
        ('**Average recall', 'test_recall'),
        ('Average f1 score', 'test_f1'),
        ('Average roc_auc', 'test_roc_auc'),
    ]
    print('Results of Cross-Validation:\n')
    for label, key in cv_metrics:
        print(f'{label}: {output[key].mean()}    +/- {output[key].std()}')
    print()
    print('+'*20)

    # refit on the whole training split, then predict both splits
    estimator.fit(X_tr, y_tr)
    tr_preds = estimator.predict(X_tr)
    te_preds = estimator.predict(X_te)

    # ROC-AUC needs continuous scores; hard 0/1 labels collapse the curve to a single point
    # and would not be comparable with the cross-validated 'roc_auc' printed above
    tr_scores = _positive_class_scores(estimator, X_tr)
    te_scores = _positive_class_scores(estimator, X_te)

    print('\nResults of Train-Test Split Validation:')
    # 'mako' is only registered with matplotlib once seaborn has been imported;
    # fall back to a built-in colormap so the plot never fails on the colormap name
    cmap = 'mako' if 'mako' in plt.colormaps() else 'Blues'
    plot_confusion_matrix(estimator, X_te, y_te, cmap=cmap)
    plt.show()

    print("\nTraining Scores:")
    _print_split_scores('Train', y_tr, tr_preds, tr_scores)
    print()
    print("<>"*10)

    print("\nTesting Scores:")
    _print_split_scores('Test', y_te, te_preds, te_scores)


In [5]:
# load the pre-split train/test features and targets; the first CSV column becomes the index
X_train = pd.read_csv('../data/X_train.csv', index_col=0)
X_test = pd.read_csv('../data/X_test.csv', index_col=0)
y_train = pd.read_csv('../data/y_train.csv', index_col=0)
y_test = pd.read_csv('../data/y_test.csv', index_col=0)

In [7]:
# load the holdout features and target the same way (first CSV column as index)
X_holdout = pd.read_csv('../data/X_holdout.csv', index_col=0)
y_holdout = pd.read_csv('../data/y_holdout.csv', index_col=0)

In [8]:
# class balance of the target in each split
print(f'Train {y_train.value_counts()}')
print(f'Test  {y_test.value_counts()}')
print(f'Holdout {y_holdout.value_counts()}')

Train target
0         68716
1         68577
dtype: int64
Test  target
1         20714
0         20474
dtype: int64
Holdout target
0         8877
1         8776
dtype: int64


In [11]:
# preview the first rows of the training features
X_train.head()

Unnamed: 0,cleaned_description,cleaned_steps,cleaned_ingredients
47827,i tried some recipes for making sweet sour sau...,add the oil sugar and vinegar to a small sauce...,sugar vinegar ketchup water pineapple juice co...
129467,this is the soup that moroccans traditionally ...,place the lamb turmeric black pepper cinnamon ...,lamb ground turmeric ground black pepper groun...
184947,i really loved my friends lemon chicken pasta ...,cook chicken your favorite way i usually salt ...,chicken pasta avocados olive oil green onion f...
98991,a nice change to the tradtional gratin \r \r w...,preheat the oven to cfan cgas put the cream ga...,double cream garlic cloves fresh thyme leave p...
159123,this has amazed our guests for years but is so...,in a large nonreactive pot with lid melt butte...,unsalted butter garlic cloves fresh ground bla...


To prepare the data for vectorization, the columns containing text have to be combined into a single string per row, since the vectorizers expect one text document per sample.

In [12]:
# creating a new column compatible with vectorizer inputs:
# description, steps and ingredients (in that order) joined into one space-separated string per row
X_train['combined'] = X_train['cleaned_description'].str.cat(X_train[['cleaned_steps',
                                                                      'cleaned_ingredients']],sep=" ")

X_train.head()

Unnamed: 0,cleaned_description,cleaned_steps,cleaned_ingredients,combined
47827,i tried some recipes for making sweet sour sau...,add the oil sugar and vinegar to a small sauce...,sugar vinegar ketchup water pineapple juice co...,i tried some recipes for making sweet sour sau...
129467,this is the soup that moroccans traditionally ...,place the lamb turmeric black pepper cinnamon ...,lamb ground turmeric ground black pepper groun...,this is the soup that moroccans traditionally ...
184947,i really loved my friends lemon chicken pasta ...,cook chicken your favorite way i usually salt ...,chicken pasta avocados olive oil green onion f...,i really loved my friends lemon chicken pasta ...
98991,a nice change to the tradtional gratin \r \r w...,preheat the oven to cfan cgas put the cream ga...,double cream garlic cloves fresh thyme leave p...,a nice change to the tradtional gratin \r \r w...
159123,this has amazed our guests for years but is so...,in a large nonreactive pot with lid melt butte...,unsalted butter garlic cloves fresh ground bla...,this has amazed our guests for years but is so...


In [13]:
# repeating this with the test data: same three columns, same order, same separator
X_test['combined'] = X_test['cleaned_description'].str.cat(X_test[['cleaned_steps',
                                                                'cleaned_ingredients']],sep=" ")
X_test.head()

Unnamed: 0,cleaned_description,cleaned_steps,cleaned_ingredients,combined
97436,my favorite dinner party soup cooking the drie...,cover mushrooms with cold water and soak overn...,dried wild mushrooms beef stock butter onion c...,my favorite dinner party soup cooking the drie...
147954,this is from the wsu extension office i havent...,saute in pan on medium heat in oil zucchini mu...,zucchini sliced mushrooms onions flour tortill...,this is from the wsu extension office i havent...
10086,recipe by tyler florencethis is soooo good the...,first thing to do is to steam the artichokes i...,fresh parsley water garlic cloves bay leaves d...,recipe by tyler florencethis is soooo good the...
178888,im very picky about my chili as i get terrible...,note for a soupier chili use a oz bottle of v ...,v vegetable juice chili seasoning mix chili st...,im very picky about my chili as i get terrible...
136500,this is a very quick recipe so easy my husband...,thaw frozen tilapia soak in milk for hour disc...,tilapia fillet lowfat milk panko breadcrumbs m...,this is a very quick recipe so easy my husband...


In [17]:
# NOTE: 136500 is an index *label* (last row of the X_test.head() preview), not a row position,
# so the positional lookup below raises IndexError; a label lookup would be
# X_test.loc[136500, 'combined']
# X_test['combined'].iloc[136500]

IndexError: single positional indexer is out-of-bounds

## Modeling  

To begin the modeling process, basic NLP-appropriate models are created and run with both types of vectorized datasets. This will identify which models deserve more attention and fine-tuning.

In [None]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


### CountVectorizer

In [None]:
# having the CountVectorizer remove stop words; ngram_range=(1,1) keeps single words only
countvect = CountVectorizer(stop_words=stop_words, ngram_range=(1,1))

In [None]:
# learn the vocabulary on the training text only, then reuse it to encode the test text
X_train_CV = countvect.fit_transform(X_train.combined)
X_test_CV = countvect.transform(X_test.combined)

In [None]:
# baseline models to compare on the CountVectorizer features, keyed by display name
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=3), #to keep the initial modeling quick
    "RandomForestClassifier": RandomForestClassifier(max_depth=3),
    "LogisticRegression": LogisticRegression(penalty = 'elasticnet',l1_ratio =.5, solver='saga'),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [None]:
# print each model's name, then its cross-validation / train / test report on the count features
for name, sklearn_classifier in classifiers.items():
    classifier = sklearn_classifier
    print(name)
    evaluate(classifier, X_train_CV, X_test_CV, y_train, y_test)

### TfidfVectorizer

In [None]:
# same stop-word list as the CountVectorizer; fit on the training text only, then encode the test text
tfidf = TfidfVectorizer(stop_words = stop_words)
X_train_tfidf = tfidf.fit_transform(X_train.combined)
X_test_tfidf = tfidf.transform(X_test.combined)

In [None]:
# check what kind of object the vectorizer returned
type(X_train_tfidf)

In [None]:
# fresh, unfitted instances of the same seven classifiers for the Tfidf run
# NOTE(review): the earlier note here said KNeighborsClassifier was removed because it takes too
# long to run with mediocre results on the CountVectorizer data, but it does not appear in the
# CountVectorizer dictionary above either -- confirm whether that run was deleted

classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=3), #to keep the initial modeling quick
    "RandomForestClassifier": RandomForestClassifier(max_depth=3),
    "LogisticRegression": LogisticRegression(penalty = 'elasticnet',l1_ratio =.5, solver='saga'),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [None]:
# print each model's name, then its cross-validation / train / test report on the Tfidf features
for name, sklearn_classifier in classifiers.items():
    classifier = sklearn_classifier
    print(name)
    evaluate(classifier, X_train_tfidf, X_test_tfidf, y_train, y_test)

An interesting result of running the basic models with both CountVectorized and TfidfVectorized data is that both produce near-identical results with this data set, meaning that we could proceed with either on the "scores" front. That said, the data transformed with TfidfVectorizer did not trigger a convergence warning when run through the LogisticRegression model, our best-performing one, so we will proceed with that dataset as the default. A possible next step would be to use both vectorizers on the dataset to see if that improves performance.

## Delete This

adaboost  
Average accuracy: 0.678410389011874        +/- 0.0019428737357940935  
Average precision: 0.6644931152129286        +/- 0.002818424532377354  
Average recall: 0.7194833158918691        +/- 0.006592810246041841  
Average f1 score: 0.6908708797403835        +/- 0.002527923487022844  
Average roc_auc: 0.7486999508644392     

LogisticRegression  
Results of Cross-Validation:

Average accuracy: 0.7102109937320492        +/- 0.0013818965212887846  
Average precision: 0.699253318003423        +/- 0.0016925889402733649  
Average recall: 0.7366901576238443        +/- 0.0026499199864471216  
Average f1 score: 0.7174799331804131        +/- 0.0014341735471591994  
Average roc_auc: 0.7845830512199936        +/- 0.0016728732962070642  
  
DecisionTreeClassifier  
Results of Cross-Validation:  

Average accuracy: 0.6305128365040875        +/- 0.0020716757364631884  
Average precision: 0.5942676548309842        +/- 0.0018375529576019885  
Average recall: 0.8204499451239758        +/- 0.003048503280868797  
Average f1 score: 0.6892732858430838        +/- 0.0014567472373081848  
Average roc_auc: 0.663055875539024        +/- 0.0024670581470042884  

MultinomialNB  
Results of Cross-Validation:  

Average accuracy: 0.6767569931389056        +/- 0.0026251702785135564  
Average precision: 0.6414577304862454        +/- 0.0025648688226974745  
Average recall: 0.8000787507892781        +/- 0.0018291914138574743  
Average f1 score: 0.7120383996765376        +/- 0.0019157732080198335  
Average roc_auc: 0.749902220537832        +/- 0.003027434533991026  

ComplementNB  
Results of Cross-Validation:  

Average accuracy: 0.6767569923431171        +/- 0.002685799123032517  
Average precision: 0.6413717247113541        +/- 0.0026018594467296205  
Average recall: 0.8004578828241925        +/- 0.0018746067605576115  
Average f1 score: 0.7121355622030373        +/- 0.001978348558231771  
Average roc_auc: 0.749902220537832        +/- 0.003027434533991026    

BernoulliNB  
Results of Cross-Validation:  

Average accuracy: 0.6679437230396654        +/- 0.002355052279753311  
Average precision: 0.6288082257136919        +/- 0.0020276220377248248  
**Average recall: 0.8182334449353144        +/- 0.002507729848902346  
Average f1 score: 0.7111205451612952        +/- 0.0019129888678486272  
Average roc_auc: 0.7372718944460759        +/- 0.002784791836307464  



The evaluation results for each model show Logistic Regression, MultinomialNB, and ComplementNB having the best performance with the current dataset. Interestingly, the Naive Bayes models had near-identical outputs, so moving forward we'll only work with MultinomialNB: ComplementNB is best suited for imbalanced datasets, and class imbalance is not an issue here. Per the documentation, MultinomialNB nominally requires integer feature counts even though it will work with the TfidfVectorizer's fractional outputs. Despite this, we'll continue to use the Tfidf dataset, as MultinomialNB performed slightly better using it.

Although it didn't fare particularly well compared with the other models, the DecisionTree will also undergo some hyperparameter tuning: the initial model was in part designed to be processed quickly, so simply increasing max_depth may provide better results.

## Hyperparameter Tuning

In [None]:
from sklearn.pipeline import Pipeline

### MultinomialNB  

The MultinomialNB model takes in only 3 parameters: alpha, fit_prior, and class_prior, making it an excellent starting point for hyperparameter tuning. 

In [None]:
#creating a pipeline including the TfidfVectorizer as well as the model, so the vectorizer's
#own hyperparameters can be searched together with the classifier's
# NOTE(review): unlike the exploratory TfidfVectorizer earlier, no stop_words list is passed
# here -- confirm this is intentional

mnb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [None]:
# Search grid for the Tfidf + MultinomialNB pipeline.
# For max_df / min_df a float is a proportion of documents and an int is an absolute document
# count, so the "no upper cut-off" option must be written 1.0: an integer 1 would keep only
# terms found in at most one document (and is lower than the fractional min_df values,
# which makes those fits fail).
mnb_params = {
    'tfidf__max_df': [.5, .95, 1.0],
    'tfidf__min_df': [.05, .1, 1],   # the integer 1 is the vectorizer's default min_df
    'mnb__fit_prior': [True, False]
}



In [None]:
# exhaustive search over mnb_params with GridSearchCV's default cross-validation and scoring
mnb_clf = GridSearchCV(mnb_pipe, mnb_params)
mnb_clf.fit(X_train.combined, y_train)

# cv_results_ as a dataframe
mnb_results = pd.DataFrame(mnb_clf.cv_results_)
mnb_results

In [None]:
# best-ranked parameter combinations first
mnb_results.sort_values(['rank_test_score'])

Looking at the top-performing models, the common hyperparameter was a max_df of .95 in the vectorization stage - this parameter causes any word appearing in more than a certain number or percentage of documents to be dropped. In the best-performing models it was paired with a min_df of 1 (min_df being the smallest number or percentage of documents a word must appear in to be kept), but since this is the default value of the hyperparameter it can be excluded from future GridSearches.

In [None]:
# second MultinomialNB grid: max_df values around .95, plus the classifier's alpha
mnb_params2 = {
    'tfidf__max_df': [.9, .95, .98],
    'mnb__alpha' : [.5,1,1.5]
}

In [None]:
# second search must use the refined grid (mnb_params2); passing mnb_params again would
# simply repeat the first search
mnb_clf2 = GridSearchCV(mnb_pipe, mnb_params2)
mnb_clf2.fit(X_train.combined, y_train)


In [None]:

# results of the second MultinomialNB search, five best-ranked combinations
mnb_results2 = pd.DataFrame(mnb_clf2.cv_results_)
mnb_results2.sort_values('rank_test_score').head()

In order to facilitate evaluation a dataframe of scores is created to track GridSearchCV results.

In [None]:
def score_tracker(gscv_results, model_name, score_df=None):
    '''Reduce GridSearchCV results to the tracked score columns and label them with a model name.

    Inputs:
    -gscv_results - dataframe built from a GridSearchCV 'cv_results_' attribute; not modified
    -model_name - label written to the 'model' column of every returned row
    -score_df - optional score-tracking dataframe; if passed, the new rows are appended to it

    Returns - a new dataframe with the columns 'mean_fit_time', 'params', 'mean_test_score',
    'std_test_score' and 'model'. When score_df is passed, the combined rows are returned
    sorted by ascending 'mean_test_score'; otherwise the rows keep their original order.
    '''
    kept_columns = ['mean_fit_time', 'params', 'mean_test_score', 'std_test_score']
    # assign() builds a new frame, so the caller's results dataframe is left untouched and
    # no value is ever written into a slice of it
    results = gscv_results[kept_columns].assign(model=model_name)

    if score_df is None:
        return results
    return pd.concat([score_df, results]).sort_values('mean_test_score')

In [None]:
# start the tracker with the second MultinomialNB search, then append the first search to it
mnb2 = score_tracker(mnb_results2, 'mnb2')
all_scores = score_tracker(mnb_results, 'mnb', mnb2)


In [None]:
# five highest mean test scores tracked so far
all_scores.sort_values('mean_test_score', ascending=False).head()

### Logistic Regression

In [None]:
# Tfidf vectorizer followed by a LogisticRegression, both with default settings;
# the step names 'tfidf' and 'lr' are the prefixes used in the parameter grids
lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

In their article about hyperparameter tuning, Machine Learning Mastery notes that "Logistic regression does not really have any critical hyperparameters to tune." and only calls out the 'solver', 'penalty', and 'C' parameters as having the potential to significantly affect the model's performance. Not having found additional sources to contradict this, the LogisticRegression GridSearch will only focus on these three. Special care needs to be taken as not all solvers work with all the penalties; however, additional GridSearches will not need to be performed, as the sklearn documentation for LogisticRegression states that the l1 penalty only works with the 'liblinear' and 'saga' solvers, and the former is best suited for small datasets. While this is not true for the 'saga' solver, if it proves to be the best an additional model will be run using 'l1' with 'saga'.

In [None]:
# Grid for the first LogisticRegression search: penalty / solver / C / max_iter, with the
# vectorizer's max_df fixed at .95.
# Bound to its own name only: a chained `lr_params = mnb_params = {...}` would also overwrite
# the MultinomialNB grid.
lr_params = {
    'tfidf__max_df': [.95],
    'lr__penalty': ['none','l2'],
    'lr__solver': ['saga','sag','newton-cg'],   # sklearn spells this solver with a hyphen
    'lr__C': [100, 1, .001],
    'lr__max_iter': [500, 1000]
}

# run the search; its cv_results_ are read from `lr_clf` in the following cell
lr_clf = GridSearchCV(lr_pipe, lr_params)
lr_clf.fit(X_train.combined, y_train)

In [None]:
# add the first LogisticRegression search (lr_clf) to the tracker and show the ten best rows
lr_results = pd.DataFrame(lr_clf.cv_results_)
all_scores = score_tracker(lr_results, 'lr', all_scores)
all_scores.sort_values('mean_test_score', ascending=False).head(10)

In [None]:
#display full contents of columns so can see all params
# None means "no truncation"; the old -1 sentinel is deprecated/rejected by current pandas
pd.set_option('display.max_colwidth', None)

In [None]:
# since the top models used the saga solver, those settings are re-run with the 'l1' and
# 'elasticnet' penalties
lr2_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

# no need to include the C parameter as the best models had C = 1, the default value for the model.
# Two separate grids are used because 'elasticnet' requires an l1_ratio (every fit fails without
# one), while 'l1' does not use that parameter.
lr2_params = [
    {
        'tfidf__max_df': [.95],
        'lr__penalty': ['l1'],
        'lr__solver': ['saga'],
        'lr__max_iter': [500, 1000]
    },
    {
        'tfidf__max_df': [.95],
        'lr__penalty': ['elasticnet'],
        'lr__l1_ratio': [.5],   # same mix as the initial elasticnet LogisticRegression model
        'lr__solver': ['saga'],
        'lr__max_iter': [500, 1000]
    },
]

lr2_clf = GridSearchCV(lr2_pipe, lr2_params)
lr2_clf.fit(X_train.combined, y_train)

In [None]:
# add the second LogisticRegression search to the tracker and show the ten best rows
lr2_results = pd.DataFrame(lr2_clf.cv_results_)
all_scores = score_tracker(lr2_results, 'lr2', all_scores)
all_scores.sort_values('mean_test_score', ascending=False).head(10)

The additional solvers and penalties failed to improve the model's performance 

In [None]:
# third LogisticRegression search: elasticnet penalty with the saga solver, varying only l1_ratio
lr3_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

# no need to include the C parameter in the dictionary as the best models had C =1, the default value for the model
lr3_params = {
    'tfidf__max_df': [.95],
    'lr__penalty': ['elasticnet'],
    'lr__solver': ['saga'],
    'lr__l1_ratio': [.25,.5,.75] 
}

lr3_clf = GridSearchCV(lr3_pipe, lr3_params)
lr3_clf.fit(X_train.combined, y_train)

In [None]:
# add the third LogisticRegression search to the tracker and show the ten best rows
lr3_results = pd.DataFrame(lr3_clf.cv_results_)
all_scores = score_tracker(lr3_results, 'lr3', all_scores)
all_scores.sort_values('mean_test_score', ascending=False).head(10)

In [None]:
# save the score tracker in the same '../data/' directory every input file is read from;
# the bare 'data/' prefix resolved to a different directory relative to this notebook
all_scores.to_csv('../data/all_scores2.csv')