In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Business Problem  
Man cannot live on Seamless alone but after a long day at work they may want to try.  Going to the internet for a recipe idea can be overwhelming, especially if you aren't a practiced cook. I hope to make cooking an easier option for people to choose by creating a model that will, when given a recipe, determine if it is easy to prepare or not.  This can be used by both home chefs and by websites that host recipes, enabling the former to know what they're getting into when they decide to make dinner and providing the latter with the means to auto-classify their catalogs and new submissions. 

Something about how this makes their recipes more accessible, potentially attractive to new users. Focus on the business side not the home users

## Data Understanding  

The data used to build my model comes from the Kaggle dataset "foodcom-recipes-with-search-terms-and-tags" and consists of ~500,000 user-submitted recipes scraped from Food.com.  In addition to the text of the recipes' description and instructions, the dataset also contains columns breaking out the ingredients, search terms, tags, and individual steps for each recipe. The tag data is chosen by the recipe author from a list of options provided by Food.com, whereas the search terms are, from all evidence, assigned by Food.com. There is also an "id" column that can be used to search for the recipe on Food.com.

This dataset does not have a target variable included so one needs to be constructed for it by leveraging the tag and search term data to find dinner recipes that can be called easy, be it because they're quick, simple to make, or have very few steps. 

## Preliminary Data Cleaning   

The primary challenge in cleaning the dataset lay in the fact that a number of columns have string data that appears to be in list format:  
- "['apple', 'orange']"  
    vs.
- ['apple', 'orange'] 

Having this information in actual list format was a priority as it greatly simplified the EDA phase. 

That being said, the overall dataset required very little cleaning - the information in each column was formatted consistently and there were very few null values or other missing data. 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


from wordcloud import WordCloud
import ast # used for converting column values to lists post-import from csv

from nltk import FreqDist

In [3]:
df = pd.read_csv('../input/foodcom-recipes-with-search-terms-and-tags/recipes_w_search_terms.csv')
df.head()

In [4]:
df.info()

I chose to simply remove the null values in the "description" column; while this removes ~9,600 rows, or just under 2%, of data the overall scale of the dataset can support the loss.

In [5]:
# confirm number of null values in 'description'
df['description'].isna().sum()

In [6]:
# keep only the rows that actually have a description (rows with a null description are discarded)
df = df[df['description'].notna()]

In [7]:
df['description'].value_counts()



In [8]:
# Recipes that use '.' as the description appear to be unique recipes, so they are kept
# in the dataset; this cell only pulls them out for a quick look.
is_period_only = df['description'].eq('.')
df_period = df.loc[is_period_only]
df_period.head()

In [9]:
# confirming nulls removed 
df.info()

In [10]:
# reviewing other columns to see what other steps need to be taken
df.describe(include = 'object')


Reviewing the above, several things stand out, such as the fact that Banana Bread is a very popular recipe, or that there are at least 63 recipes whose ingredients consist of 'paper' and 'cloth'.  Also noteworthy is that at least 32 recipes seem to be duplicates based on the 'steps' output.  Since neither paper nor cloth is edible, I removed those from the dataset and further investigated the 'steps' column to see if the recipes truly were duplicates - "Blend all ingredients until smooth" could apply to many different smoothie or milkshake recipes after all.

In [11]:
df_inedible = df[df['ingredients'] == "['paper', 'cloth']"]
df_inedible.head()

A number of napkin folding 'recipes' have been included in the dataset, hence the ingredients 'paper' and 'cloth'.  These will be removed, but it's worth noting that other crafts or non-edible recipes may have snuck through, which is something to keep an eye out for while working with the data.

In [12]:
# removing napkin folding instructions from the dataframe

df = df[df['ingredients'] != "['paper', 'cloth']"]

In [13]:
# checking the dataframe again to see how the removal of the craft instructions has changed 
# its makeup
df.describe(include = 'object')


In [14]:
df_no_ingredients = df[df['ingredients'] == "[]"]
df_no_ingredients.head()

In [15]:
# checking the steps of a specific recipe to see if the ingredients are listed within that column
df_no_ingredients['steps'].iloc[0]

In [16]:
# since there are only 20 recipes that do not have their ingredients broken out these will
# be dropped from the dataframe
df = df[df['ingredients'] != "[]"]
df.describe(include = 'object')


In [17]:
df_steps = df['steps'].value_counts().to_frame()
df_steps.head(20)

In [18]:
# Look up every recipe whose steps contain this exact sentence fragment.
# regex=False makes str.contains treat the text literally; with the default (regex=True)
# the '.' in the fragment would match any character instead of only a full stop.
df_example = df[df['steps'].str.contains("In a large bowl combine flour, yeast and salt. Add 1 5", regex=False)]
df_example.head()


Reading the description of one of the duplicate recipes, a "No-Knead Bread", it looks like at least some of the duplicates come from Food.com users uploading recipes from other sources - in this case the New York Times.  

In [19]:
df_example = df[df['steps'] =="['Blend all ingredients until smooth.']"] 
df_example.head()
	

In [20]:
# you can see that other recipes with identical steps are for unique dishes, below we see 
# all the options for recipes whose steps are '['Blend all ingredients until smooth.']', 
# the most common string in 'steps'

df_example['name'].value_counts()

In [21]:
# Using the df_steps dataframe to find out how many recipes in total have duplicate steps.
# df_steps holds a single column of occurrence counts. It is selected by position because
# its label depends on the pandas version ('steps' before 2.0, 'count' from 2.0 onwards),
# so looking it up by the name 'steps' fails with a KeyError on newer versions.
step_counts = df_steps.iloc[:, 0]
df_steps_dupe = df_steps[step_counts > 1]
df_steps_dupe.iloc[:, 0].sum()

2,944 recipes have duplicate steps. As established above, these are a mixture of true duplicates (recipes copied from other sources) and legitimately unique recipes that happen to share identical steps. All of them will be deleted, as the number of recipes affected does not justify the work necessary to evaluate each one's status (although if I were to do this, I would start by looking at the length of the steps).

In [22]:
# Removing potential duplicate recipes: keep only the rows whose 'steps' text occurs exactly once.
# Approach thanks to the first answer on
# https://stackoverflow.com/questions/49735683/python-removing-rows-on-count-condition
occurrences = df.groupby('steps')['steps'].transform('count')
# .copy() so that later column assignments act on an independent frame rather than a slice
df = df.loc[occurrences == 1].copy()
len(df)

In [23]:
df.describe(include = 'object')


### Converting Column Data to Lists  

The two code blocks below highlight the primary issue with the data: the strings that look like lists. This was resolved by creating a custom function that uses the ast library to take in a string with list-like formatting and return an actual list. 

In [24]:
# looking at an example from  the 'ingredients' column and comparing it against its type
df['ingredients'].iloc[0]

In [25]:
# although the above output looks like a list checking the type confirms it is a string
type(df['ingredients'].iloc[0]) 

In order to avoid mistakenly overwriting data or having to concatenate the function's output with the original dataframe return_to_list creates a new column in the dataframe with the converted strings. 

In [26]:

def return_to_list(df, column_names):
    ''' Converts columns of list-looking strings into columns of real Python objects.

    For every name in column_names, each string in that column is parsed with
    ast.literal_eval and the parsed values are stored in a new column called
    '<original column name>_list'. The original columns are left as they were.

    Inputs:
    df = dataframe holding the columns to convert; the new columns are added to it directly
    column_names = list of names of the columns whose strings should be parsed

    Returns: the same dataframe, now including the new '<name>_list' columns
    '''
    for source_col in column_names:
        parsed_values = list(map(ast.literal_eval, df[source_col]))
        df[source_col + '_list'] = parsed_values

    return df

In [27]:
# list of list-appearing columns
col_to_list = ['ingredients', 'ingredients_raw_str', 'steps', 'tags', 'search_terms']

In [28]:
# running dataframe through custom function 
df = return_to_list(df, col_to_list)

In [29]:
# confirming new columns have been created
df.head()

In [30]:
# confirming type of data in new columns
type(df['ingredients_list'].iloc[0]) 

In [31]:
# One additional step needs to be taken: because the data in the "search_terms" column
# was enclosed in curly brackets, its contents were converted into sets rather than lists.
first_search_terms = df['search_terms_list'].iloc[0]
type(first_search_terms)

In [32]:
# For the sake of uniformity "search_terms_list" is converted into a list.
# sorted() is used rather than list() because the iteration order of a set of strings can
# differ from one interpreter session to the next, which would make the resulting lists
# (and anything derived from their order) non-reproducible between runs.
df['search_terms_list'] = df['search_terms_list'].apply(sorted)
type(df['search_terms_list'].iloc[0])

## Setting the Target Variable 

Data from two different columns was used to develop the target variable.  The model seeks to find easy dinner recipes, so the target included recipes that listed 'dinner' as one of their search terms.  The list of tags was reviewed to find those that fit our target recipe, and all recipes that contain at least one of these "easy indicator" tags were marked.  A recipe had to have at least one of the relevant tags and have 'dinner' as a search term to qualify as a target. 

A custom function, lists_to_count, was created to facilitate reviewing the data as well as visualizing it. 


In [33]:
def lists_to_count(df, column, series = False):
    ''' Takes in a column of lists and returns counts for all unique values.

    Rows holding an empty list contribute nothing to the counts: Series.explode()
    represents an empty list as a single NaN placeholder, and those placeholders are
    dropped before counting so that NaN never shows up as a "value". (Genuine
    None/NaN entries inside a list are dropped as well.)

    Inputs:
    df - dataframe with column being converted
    column - column of lists
    series - if set to True returns pandas Series instead of a FreqDist object

    Returns:
    Series with unique value counts or FreqDist object, depending on setting of 'series' parameter
    '''

    # one row per list element; discard the NaN rows that explode() emits for empty lists
    all_col = df[column].explode().dropna()
    col_count = FreqDist(all_col)

    if series:
        return pd.Series(dict(col_count))
    return col_count

### Tag Data Targets

In [34]:
# find the count of each unique tag in the dataset

tag_dist = lists_to_count(df, 'tags_list')
type(tag_dist)

In [35]:
# number of unique tabs
len(tag_dist)

In [36]:
# the below code displays the counts, ordered from most to least frequent, for each tag.
tag_dist.items()

In [37]:
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black').generate_from_frequencies(dict(tag_dist))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [38]:

#function for visualizing the most common tokens within a frequency distribution

def visualize_tokens(freq_dist, number, title):
    '''
    Prints the most common tokens of a frequency distribution and draws them as a bar chart.

    From Phase 4 Project:
    https://github.com/CGPinDC/Tweet_NLP_Project/blob/main/Tweet_Sentiment_%20Analysis_Notebook.ipynb

    Inputs:
    freq_dist: frequency distribution of tokens; must provide a most_common() method
    number: integer number of top tokens to show
    title: title of graph

    Returns: nothing, the chart is drawn on a newly created figure
    '''
    # most_common() yields (token, count) pairs; transpose them into two parallel tuples
    columns = list(zip(*freq_dist.most_common(number)))
    tokens, counts = columns[0], columns[1]

    print(f'Top Tokens: {tokens[:number]}')

    # one bar per token
    fig, ax = plt.subplots(figsize = (15, 10))
    ax.bar(tokens, counts)

    # labelling: whole-number ticks on the count axis, vertical token labels
    ax.set_title(title)
    ax.set_ylabel("Count")
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="x", rotation=90)

In [39]:
visualize_tokens(tag_dist, 50, "Top 50 Tags")

Since there are only 631 distinct tags I could review them one-by-one to select the ones that best fit into the "easy" category.

In [40]:
# prints each tag in an easy to read fashion
for tag in list(tag_dist):
    print(tag)

Recipes tagged with the following will be included in the target:

* easy     
* 30-minutes-or-less  
* 3-steps-or-less  
* 15-minutes-or-less                        
* beginner-cook                                     

The following tags were also discovered during the review.  While they cover only a small number of recipes overall, they are not for food, so recipes containing these tags will be removed, as we did with the napkin folding instructions. 

* bath-beauty                                            
* household-cleansers               
* homeopathy-remedies  

In [41]:
# tags that mark a recipe as "easy" for the purposes of the target variable
target_tags = ['easy','30-minutes-or-less', '3-steps-or-less', 
               '15-minutes-or-less', 'beginner-cook']
# non-food tags whose recipes are removed; the spelling must match the dataset's tag
# exactly ('household-cleansers', as listed in the tag review), otherwise the membership
# test never matches and those recipes silently stay in the data
tags_to_remove = ['bath-beauty', 'household-cleansers', 'homeopathy-remedies']

In [44]:
# Flag the homeopathic and bath/beauty related recipes: 'remove' is True for every recipe
# whose tag list contains at least one of the undesirable tags.
def _has_unwanted_tag(recipe_tags):
    return any(tag in tags_to_remove for tag in recipe_tags)

df['remove'] = df['tags_list'].map(_has_unwanted_tag)
df['remove'].value_counts()

In [45]:
# Keep only the rows that were not flagged for removal. .copy() makes the result an
# independent frame, so the new columns assigned to df later in the notebook do not
# raise SettingWithCopyWarning by writing into a slice of the previous frame.
df = df[df['remove'] != True].copy()
df['remove'].value_counts()

### Search Term Targets

In [46]:
# begin by generating the count of unique search terms
search_term_dist = lists_to_count(df, 'search_terms_list')
type(search_term_dist)

In [47]:
# Wordcloud visalization of count

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black').generate_from_frequencies(dict(search_term_dist))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [48]:
# show the most common search terms
visualize_tokens(search_term_dist, 50, "Top 50 Search Terms")

In [49]:
len(search_term_dist)

In [50]:
for term in list(search_term_dist):
    print(term)

In [51]:
# generating a count of 'quick'recipes to see if that should be included in the search term target
search_term_dist['quick']

Reviewing the search terms confirms that the only one that's going to be used to create this target variable is "dinner".  While there is a "quick" search term it applies to only 2,600 recipes, not a material amount. Additionally "quick" recipes are not automatically "dinner" recipes so their inclusion as a target parameter potentially adds bad data to the target set.

In [52]:
# number of recipes labeled with 'dinner' as a search term
search_term_dist['dinner']

In [53]:
# creating a list with the search term target label
target_search_term = ['dinner']

### Creating Target Variable

In [54]:
# 'target_tag' is True when at least one of a recipe's tags is one of the target tags
def _has_target_tag(recipe_tags):
    return any(tag in target_tags for tag in recipe_tags)

df['target_tag'] = df['tags_list'].map(_has_target_tag)
df['target_tag'].value_counts(normalize=True)

In [55]:
df

Reviewing the above dataframe output, I saw that the last row contained a recipe with the '60-minutes-or-less' tag that also had one of the target tags.  Since a recipe could conceivably, if over-optimistically, be tagged with both 'easy' and '60-minutes-or-less', and because this model seeks to find recipes that are both quick and simple, I am going to reset the "target_tag" value to "False" if the tag list contains "60-minutes-or-less". 

In [56]:
# small helper used to relabel the 'target_tag' value for recipes with the
# '60-minutes-or-less' tag
def hour_check(x):
    '''Returns False when '60-minutes-or-less' is one of the tags in x, otherwise True.'''
    return '60-minutes-or-less' not in x

In [57]:
# credit to first response on https://stackoverflow.com/questions/58562662/apply-function-on-subset-of-dataframe-rows-in-column-based-on-value-in-other-col
# for helping me figure out how to re-label the 'target_tag' column
def _recheck_target_tag(row):
    # only rows currently marked True are re-examined; every other value is passed through
    if row.target_tag == True:
        return hour_check(row.tags_list)
    return row.target_tag

df['target_tag'] = df.apply(_recheck_target_tag, axis=1)

In [58]:
df['target_tag'].value_counts(normalize=True)

In [59]:
# 'target_search_term' is True when at least one of a recipe's search terms is a target search term
def _has_target_term(recipe_terms):
    return any(term in target_search_term for term in recipe_terms)

df['target_search_term'] = df['search_terms_list'].map(_has_target_term)
df['target_search_term'].value_counts(normalize=True)

Once the recipes were labeled with the target conditions the below code identifies recipes that meet both the search term condition and the tag condition.

In [60]:
def target_check(x):
    ''' dataframe specific function: returns 1 when a row meets both target conditions
    ('target_tag' and 'target_search_term' are both True) and 0 otherwise'''
    # guard clause: without the tag condition the search term is not even looked at
    if not (x['target_tag'] == True):
        return 0
    return 1 if x['target_search_term'] == True else 0

In [61]:
df['target'] = df.apply(target_check, axis=1)
df.head()

In [62]:
# checking the number of 'easy' recipes in the dataset
df['target'].value_counts(normalize=True)

Although the distribution of the search term target and the tag target were roughly even with both conditions applied to the recipe we now have an imbalanced dataset, which will have to be addressed prior to modeling.

The above establishes the modeless baseline for subsequent model performance - a model needs to correctly pick "easy" recipes at least 20% of the time to be better than guessing.

In [63]:
# final clean-up action (for now): the helper column 'remove' is no longer needed
df = df.drop(columns=['remove'])

In [64]:
df.info()

## EDA

Now that the data has had its initial cleaning (because there's always more you can do on that front), EDA could be performed to further understand the dataset's contents. To begin with we'll create some features that will be used in our first, baseline, model, which will only use length/count data in order to ensure that the target variable can't be identified by these factors alone and to provide a goal when modeling. 

In [65]:
import seaborn as sns

In [66]:
# Length features: number of ingredients, number of steps, and the number of characters
# in the description. Each new column is the len() of the corresponding source value.
length_features = {
    'num_ingredients': 'ingredients_list',
    'num_steps': 'steps_list',
    'num_char_description': 'description',
}
for feature_name, source_column in length_features.items():
    df[feature_name] = df[source_column].map(len)

In [67]:
# visualizing our imbalanced target variables

df.target.value_counts().plot(kind ='bar')

In [68]:
# creating Series of counts to facilitate visualizations
ingredients_count = lists_to_count(df, 'ingredients_list', series = True).sort_values(ascending=False)
search_terms_count = lists_to_count(df, 'search_terms_list', series = True).sort_values(ascending=False)
tags_count = lists_to_count(df, 'tags_list', series = True).sort_values(ascending=False)

In [69]:
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(15,7))
fig.suptitle('20 Most Common Search Terms')

# Left panel: raw counts of the 20 most common search terms. The bar heights must come
# from search_terms_count (not ingredients_count) so that they match the x-axis labels.
sns.barplot(ax = ax[0], x=search_terms_count.index[:20], y=search_terms_count.values[:20])
ax[0].set_title('20 Most Common Search Terms')
ax[0].tick_params(axis = 'x', rotation = 90);

# Right panel: the same 20 terms, each expressed as a share of all search-term
# occurrences (its count divided by the sum of every term's count).
sns.barplot(ax = ax[1], x=search_terms_count.index[:20], y=search_terms_count.values[:20]/search_terms_count.sum())
ax[1].set_title('% of Recipes Listed w/ 20 Most Common Search Terms')
ax[1].tick_params(axis = 'x', rotation = 90);

In [70]:
fig, ax = plt.subplots(figsize=(10,10))

# plotting the 20 most frequently used tags
top_tags = tags_count.iloc[:20]
all_plot = sns.barplot(x=top_tags.index, y=top_tags.values, ax=ax)
plt.xticks(rotation=90);
plt.title('20 Most Common Tags')

In [71]:
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(15,7))
fig.suptitle('20 Most Common Ingredients')

# plotting the 20 most frequently used ingredients 
sns.barplot(ax = ax[0],x=ingredients_count.index[:20], y=ingredients_count.values[:20])
ax[0].set_title('20 Most Common Ingredients')
ax[0].tick_params(axis = 'x', rotation = 90);


sns.barplot(ax=ax[1], x=ingredients_count.index[:20], y=ingredients_count.values[:20]/ingredients_count.sum())
ax[1].set_title('% Recipies w/the 20 Most Common Ingredients')
ax[1].tick_params(axis = 'x', rotation = 90);


The most common ingredients are, as to be expected, cooking essentials: salt, butter, sugar, etc... Decided to investigate this further as we may want to add very common ingredients to our stopwords list prior to vectorization if they are common in both target and non-target recipes

In [72]:
# creating ingredient counts for target and non-target recipes
easy_ingredient_count = lists_to_count(df[df['target'] == 1], 'ingredients_list', series = True).sort_values(ascending=False)
not_easy_ingredient_count = lists_to_count(df[df['target'] == 0], 'ingredients_list', series = True).sort_values(ascending=False)


In [73]:
# plotting the 20 most frequently used ingredients in each type of recipe 

fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(15,7))
fig.suptitle('20 Most Common Ingredients in Target and Non-Target Recipes')

sns.barplot(ax=ax[0], x=easy_ingredient_count.index[:20], y=easy_ingredient_count.values[:20])
ax[0].set_title('Target Recipes')
ax[0].tick_params(axis = 'x', rotation = 90);


sns.barplot(ax=ax[1], x=not_easy_ingredient_count.index[:20], y=not_easy_ingredient_count.values[:20])
ax[1].set_title('Non-Target Recipes')
ax[1].tick_params(axis = 'x', rotation = 90);

Comparing the two graphs you can see that the Non-Target recipes have more baking related ingredients, such as flour, brown sugar, and baking powder, than the Target recipes do.  This makes sense given that in general baking takes more time than cooking and because the Non-Target column contains recipes with 'dessert' as a search term, which we can see above is the second most common search term after 'dinner'.

# TO DO

Make the cool graph from the challenge  
Revise all graph pairings to show Target and Non-Target counts

### Exploring the 'steps' and 'description' Columns

The 'steps' and 'description' column text will be the core of the information used in the modeling process and so requires a better understanding of their raw data as well as what emerges after the initial pre-processing steps are taken (converting to lowercase, removing punctuation, etc.). 

In [74]:
import re


In [75]:
def basic_cleaning(df, column):
    ''' Adds a cleaned copy of a text column to the dataframe.

    Every string in the column is converted to lowercase, has its punctuation removed,
    has digits and newlines replaced by a space, and finally has runs of two or more
    spaces collapsed into a single space. A leading or trailing space is not stripped.

    Input:
    df - dataframe with column to be cleaned
    column - name of the column containing strings

    Returns:
    The same dataframe with the new column 'cleaned_<column>' added'''

    def _scrub(text):
        lowered = text.lower()
        # drop everything that is neither a word character nor whitespace
        without_punctuation = re.sub(r'[^\w\s]','',lowered)
        # digits and newlines become spaces
        without_digits = re.sub('[0-9\n]',' ',without_punctuation)
        # collapse the runs of spaces left behind by the previous step
        return re.sub('[ ]{2,}',' ',without_digits)

    df['cleaned_' + column] = df[column].apply(_scrub)

    return df

In [76]:
df = basic_cleaning(df, 'steps')

In [77]:
# review a cleaned step to confirm it appears as expected
# .iloc[0] takes the first remaining row by position; indexing with [0] looks up the index
# label 0, which is not guaranteed to exist because rows were dropped earlier without
# resetting the index
step = df['cleaned_steps'].iloc[0]
step

One interesting thing to note in the above 'steps' example is that because the recipes are user submitted they may contain spelling errors - in the above case you can see the first word is missing an 'n' -  as they aren't required to meet any proofreading standards. The Spark NLP library contains a spell checker that's used after tokenization and which I may use to resolve this issue. 

In [78]:
# split every cleaned step string on whitespace to get its list of tokens
df['step_tokens'] = df['cleaned_steps'].map(str.split)
df.head()

In [79]:
token_count = lists_to_count(df, 'step_tokens', series = True).sort_values(ascending=False)

In [80]:
#visualization of the most common words in 'steps'

fig, ax = plt.subplots(figsize=(12,4))

sns.barplot(ax=ax, x=token_count.index[:20], y=token_count.values[:20])
ax.set_title('Most Common Words in Steps')
ax.set_xticklabels(token_count.index[:20])
ax.tick_params(axis='x', labelrotation=70);

The most common words in 'steps' at this point are, unsurprisingly, stopwords. In order to get a better picture of the data these will be removed and the top 20 words will be replotted. 

In [81]:
from nltk.corpus import stopwords

In [82]:
# creating a list of the nltk's English-language stopwords
stop_words = stopwords.words('english')
stop_words[:10]

In [83]:
# a small function to quickly remove stopwords from a Series of token counts
def remove_stop_words(count, stop_words):
    '''Returns the token counts with every stopword entry removed.

    Inputs:
    count - pandas Series of counts, indexed by token
    stop_words - list-like of tokens to exclude

    Returns:
    New Series holding only the entries whose index label is not in stop_words.
    The order of the remaining entries is unchanged and `count` itself is not modified.
    '''
    # A single boolean mask replaces dropping the labels one at a time: each drop() call
    # copies the whole Series, and a repeated index label made the second drop() of that
    # label fail with a KeyError.
    keep = ~count.index.isin(stop_words)
    return count[keep]

In [84]:
token_count = remove_stop_words(token_count, stop_words)


In [85]:
# re-plotting the most common words in steps

fig, ax = plt.subplots(figsize=(12,4))

sns.barplot(ax=ax, x=token_count.index[:20], y=token_count.values[:20])
ax.set_title('Most Common Words in Steps, Excluding Stopwords')
ax.set_xticklabels(token_count.index[:20])
ax.tick_params(axis='x', labelrotation=70);

Having reviewed the common contents of 'steps' we'll now do the same for the 'description' column 

In [86]:
df = basic_cleaning(df, 'description')
df['description_tokens'] = df['cleaned_description'].apply(lambda x: x.split())
df.head()

In [87]:
description_count = lists_to_count(df, 'description_tokens', series = True).sort_values(ascending=False)

In [88]:
#visualization of the most common words in 'description'

fig, ax = plt.subplots(figsize=(12,4))

sns.barplot(ax=ax, x=description_count.index[:20], y=description_count.values[:20])
ax.set_title('Most Common Words in Description')
ax.set_xticklabels(description_count.index[:20])
ax.tick_params(axis='x', labelrotation=70);

Once again we see that the initial visualization is predominantly stopwords. 

In [89]:
# remove stopwords from 'description_count'
description_count = remove_stop_words(description_count, stop_words)


In [90]:
fig, ax = plt.subplots(figsize=(12,4))

sns.barplot(ax=ax, x=description_count.index[:20], y=description_count.values[:20])
ax.set_title('Most Common Words in Description, Excluding Stopwords')
ax.set_xticklabels(description_count.index[:20])
ax.tick_params(axis='x', labelrotation=70);

Once stopwords were removed from both 'steps' and 'description' there were no words remaining that seemed to need to be added to the stopwords list. 

# TO DO  
Do graphs comparing target/non-target recipes

## Baseline Model

This model uses the length data generated earlier to ensure that NLP is an appropriate approach for predicting the difficulty level of a recipe. To begin with we'll create a dataframe that only uses the 'num' columns in the current dataframe.

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, accuracy_score, precision_score, recall_score, f1_score

from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from imblearn.under_sampling import RandomUnderSampler


In [92]:
# Creating a numbers only db

X_numbers_df = df[['num_ingredients','num_steps', 'num_char_description']]
y_numbers_df = df['target']
X_numbers_df.head()

In [93]:
y_numbers_df.value_counts()

The first action that needs to be taken with the 'numbers-only' data is to address the fact that it is unbalanced.  Traditionally the two primary means of handling this are:

- creating more examples of the minority class using SMOTE or something like it
- removing examples from the majority class so that it has the same number of records as the minority class

Because the imbalance in this dataset is great enough that using SMOTE would result in there being three times as much synthetic target data as there is real data, and because the dataset is large enough to remove examples that do not match the target and still have almost 200,000 records, undersampling will be used to balance the dataset using Imbalanced Learn's RandomUnderSampler, which under-samples the majority class by randomly picking samples, in this case without replacement. 

In [94]:
# please note that this step can be taken prior to the train/test split as no data is 
# transformed, only removed. 
rus = RandomUnderSampler(random_state=50)
X_res, y_res = rus.fit_resample(X_numbers_df, y_numbers_df)
y_res.value_counts()

The balanced dataset can now be split into train and test sets.  Because this is only a baseline model a validation set is not being created. 

In [95]:
# 70/30 train/test split of the balanced data. random_state is fixed (same seed as the
# under-sampler) so that the split, and every score computed from it, is the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = .3, random_state=50)

In [96]:
y_train.value_counts()

In [97]:
y_test.value_counts()

In [98]:
# creating the StandardScaler object to run the data through so that undue weight isn't given 
# to columns with higher numbers. 

scalar = StandardScaler()
X_train_numbers_scaled = scalar.fit_transform(X_train, y_train)
X_train_numbers_scaled


In [99]:
# transforming the testing data
X_test_numbers_scaled = scalar.transform(X_test)
X_test_numbers_scaled

We will be using the below custom evaluate function to judge the performance of this, and other, models. 

In [100]:
def evaluate(estimator, X_tr, X_te, y_tr, y_te, cv=5, grid_search=False):
    '''
    Fit a binary classifier, print its training and testing scores, and plot a
    confusion matrix for the test data.

    Unless `grid_search` is True, the estimator is first cross-validated on the
    training data and the mean +/- standard deviation of accuracy, precision,
    recall, f1 and ROC-AUC across the folds is printed. The estimator is then fit
    on the full training data and the same five metrics are printed for both the
    training and the test data.

    If `grid_search` is True the function does not perform cross validation on the
    model (a grid search already cross-validates internally).

    From Phase 3 Project: https://github.com/Nindorph/TanzanianWaterWells/blob/main/Modeling_Final.ipynb
    Based off of Lindsey Berlin's evaluate function found at:
    https://github.com/lindseyberlin/Cat-in-the-Dat-Project/blob/main/notebooks/Lindsey/EDA-Initial-Models.ipynb
    ------------------------------------------------------------------------------------------
    Inputs:
    - estimator - estimator object (a GridSearchCV object when grid_search is True)
    - X_tr - training features
    - X_te - test features
    - y_tr - training target
    - y_te - test target
    - cv - cross-validation splitting strategy, only used when grid_search is False.
            Takes all value options of the sklearn.model_selection.cross_validate "cv" parameter:
            - None, to use the default 5-fold cross validation,
            - int, to specify the number of folds in a (Stratified)KFold,
            - CV splitter,
            - An iterable yielding (train, test) splits as arrays of indices
    - grid_search - Boolean indicating whether the estimator is a GridSearchCV object.
            If False, a cross validation is performed using the "cv" parameter.
            Default value is False.

    Returns:
    - None when grid_search is False
    - a DataFrame built from estimator.cv_results_ when grid_search is True
    '''
    # Recall lines are flagged with '**' only on the cross-validated (non grid search) path.
    recall_marker = '' if grid_search else '**'

    def _auc_inputs(X, hard_preds):
        # ROC-AUC is a ranking metric: feeding it hard class predictions collapses the
        # curve to a single point and makes it incomparable with the 'roc_auc' scorer
        # used during cross validation. Use continuous scores when the estimator
        # exposes them and fall back to the hard predictions only as a last resort.
        if hasattr(estimator, 'decision_function'):
            return estimator.decision_function(X)
        if hasattr(estimator, 'predict_proba'):
            # column 1 is the score of the second (positive) class
            return estimator.predict_proba(X)[:, 1]
        return hard_preds

    def _print_scores(title, prefix, X, y_true, hard_preds, trailing_blank_line):
        # Prints accuracy, precision, recall, f1 and ROC-AUC for one data split.
        print(f"\n{title} Scores:")
        print(f"{prefix} accuracy: {accuracy_score(y_true, hard_preds)}")
        print(f"{prefix} precision: {precision_score(y_true, hard_preds)}")
        print(f"{recall_marker}{prefix} recall: {recall_score(y_true, hard_preds)}")
        print(f"{prefix} f1 score: {f1_score(y_true, hard_preds)}")
        auc = roc_auc_score(y_true, _auc_inputs(X, hard_preds))
        print(f"{prefix} roc_auc: {auc}" + ("\n" if trailing_blank_line else ""))

    # If no grid search is being performed, cross-validate on the training data first
    if not grid_search:
        output = cross_validate(estimator, X_tr, y_tr, cv=cv,
                                scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
        # Mean and spread of every metric across the cross-validation folds
        print('Results of Cross-Validation:\n')
        cv_report = (
            ('Average accuracy', 'test_accuracy'),
            ('Average precision', 'test_precision'),
            ('**Average recall', 'test_recall'),
            ('Average f1 score', 'test_f1'),
            ('Average roc_auc', 'test_roc_auc'),
        )
        for label, key in cv_report:
            print(f'{label}: {output[key].mean()} +/- {output[key].std()}')
        print()
        print('+'*20)

    # Fitting the estimator to the X and y train data
    estimator.fit(X_tr, y_tr)
    tr_preds = estimator.predict(X_tr)
    te_preds = estimator.predict(X_te)

    # Confusion matrix for the test data
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
    # versions ConfusionMatrixDisplay.from_estimator is the replacement -- confirm
    # which scikit-learn version this notebook is pinned to.
    print('\nResults of Train-Test Split Validation:')
    plot_confusion_matrix(estimator, X_te, y_te, cmap='mako')

    _print_scores("Training", "Train", X_tr, y_tr, tr_preds, trailing_blank_line=True)
    print("<>"*10)
    _print_scores("Testing", "Test", X_te, y_te, te_preds, trailing_blank_line=False)

    if grid_search:
        print('\nGrid Search Results (you animal):\n')
        return pd.DataFrame(estimator.cv_results_)
    return None

We'll be using a LogisticRegression model for our baseline as I've had good results with those in past NLP projects and anticipate it performing well with the text data. 

In [101]:
# baseline classifier, constructed with no arguments (library defaults)
logreg = LogisticRegression()

In [102]:
# score the baseline on the scaled numeric features; grid_search is left at its
# default of False, so evaluate() also cross-validates on the training data
evaluate(logreg, X_train_numbers_scaled, X_test_numbers_scaled, 
         y_train, y_test)

The numbers-only model has an accuracy score of 56%, which with the balanced dataset means that this model only works slightly better than guessing. 

## Preprocessing

Now that we've established the baseline for a model using recipe text, it's time to move on to the real modeling. Before this can be done the dataframe needs to be vectorized - I'll be using both CountVectorizer and TfidfVectorizer because I want to see if one of the two has a better performance. 

In [103]:
# review the dataframe's columns to decide which ones need to be vectorized
df.info()

In [104]:
# creating a dataframe with the text that will be used in the model as well as the target
# .copy() gives df_strings its own data, so modifying it in later cells does not
# trigger pandas' SettingWithCopyWarning against df
df_strings = df[['description','steps','search_terms','tags', 'target']].copy()
df_strings.head()

In [105]:
# run basic_cleaning over each non-list text column in turn, feeding the result of
# one call into the next
for text_column in ('description', 'steps', 'tags', 'search_terms'):
    df_strings = basic_cleaning(df_strings, text_column)

In [106]:
# confirming the cleaning process worked as expected by previewing the first rows
df_strings.head()

Note that for search terms and tags, removal of the '-' has inserted words like 'timetomake' into those categories.  I'm not concerned that they aren't real words because they appear consistently throughout their columns and thus add value to the model.

In [107]:
# dropping the raw text columns that are no longer needed; the result is reassigned
# rather than using inplace=True so the frame is not mutated in place
df_strings = df_strings.drop(columns = ['description','steps','search_terms','tags'])
df_strings

In [108]:
# separate the feature columns (X) from the target column (y)
X = df_strings.drop(columns='target')
y = df_strings['target']

In [109]:
# undersampling the data in the same manner used with the baseline model
# (same random_state); note this rebinds rus, X_res and y_res from the baseline cells
rus = RandomUnderSampler(random_state=50)
X_res, y_res = rus.fit_resample(X, y)
# class counts after resampling
y_res.value_counts()

In [110]:
# splitting the data, including creating a holdout set
# first split: 70% of the resampled data for training, 30% set aside
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = .3, 
                                                    random_state=50)
# second split: the 30% set aside is divided 70/30 again, i.e. roughly 21% of the
# resampled data becomes the test set and roughly 9% becomes the holdout set
X_test, X_holdout, y_test, y_holdout = train_test_split(X_test, y_test, test_size = .3,
                                                        random_state=50)

To prepare the data for vectorization we need to combine the columns containing text that will be used in the model to meet the input requirements for the vectorizers.

In [111]:
# Join the four cleaned text columns into one space-separated string per row so the
# vectorizers receive a single text column.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
X_train = X_train.assign(
    combined=X_train['cleaned_description'].str.cat(
        X_train[['cleaned_steps', 'cleaned_tags', 'cleaned_search_terms']], sep=" "))

X_train.head()

In [112]:
# repeating this with the test data: join the four cleaned text columns into one
# space-separated string per row.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
X_test = X_test.assign(
    combined=X_test['cleaned_description'].str.cat(
        X_test[['cleaned_steps', 'cleaned_tags', 'cleaned_search_terms']], sep=" "))
X_test.head()

## Modeling

In [113]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


### CountVectorizer

In [114]:
# having the CountVectorizer remove stop words
# ngram_range=(1,1) limits the features to single words (unigrams)
countvect = CountVectorizer(stop_words=stop_words, ngram_range=(1,1))

In [115]:
# the vectorizer is fit on the training text only; the test text is then encoded with
# that same fitted vectorizer (transform only, no refit)
X_train_CV = countvect.fit_transform(X_train.combined)
X_test_CV = countvect.transform(X_test.combined)

In [116]:
# Candidate models for a first pass over the vectorized text, keyed by display name.
# random_state is fixed on the tree-based and boosting estimators so the comparison is
# repeatable between runs; 50 matches the seed used for resampling and splitting.
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=3, random_state=50), #to keep the initial modeling quick
    "RandomForestClassifier": RandomForestClassifier(max_depth=3, random_state=50),
    "LogisticRegression": LogisticRegression(),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=50),
}

In [None]:
# run every candidate model through evaluate() on the count-vectorized data, printing
# the model's name first so each block of scores can be told apart
for name, sklearn_classifier in classifiers.items():
    classifier = sklearn_classifier
    print(name)
    evaluate(classifier, X_train_CV, X_test_CV, y_train, y_test)