In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
# Basic Libraries needed

import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Libraries needed for text pre-processing 
# and vectorizing

def basic_text_preprocessor(text_df, col_name):
    """
    # this function takes in a data frame column
    # and processes the text data
    # into a format that is more easily analyzed for prediction
    # It then returns the converted text column also in dataframe format 

    Args:
        text_df = a column in dataframe format of text data
        col_name = a string that matches the name of the column to be processed
    Returns:
        TXT = a column in dataframe form of converted text
    """
    ###########################################################################
    # INITIALIZE PRE_PROCESSING TOOLS
    ###########################################################################
    num_rows = len(text_df)
    # getting number of rows and columns in data frame      
    punctuation = ",!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"    
    # Saving a list of commonly used punctuation to a string in order to remove
    # them from the text columns   
    stop_words = stopwords.words("english")    
    # getting list of common stopwords to remove from the text entries    
    lemma = WordNetLemmatizer()    
    # setting up lemmatizing function    
    stemm =  PorterStemmer()
    # stemming function        
    converted_text = []        
    # initializing list to store 
    # processed text in
    ###########################################################################
    # PRE-PROCESS EACH ROW OF TEXT
    ###########################################################################
    for i in range(num_rows):            
      # go through each row            
        new_text = text_df.iloc[i].lower()                        
        # makes text lower case             
        for char in new_text:                
            if char in punctuation:                    
              new_text = new_text.replace(char, " ")            
              # removing punctuation                        
        new_text = new_text.split()            
            # splitting up the text into individual words            
        for word in new_text:    
            if word in stop_words:        
              new_text.remove(word)            
              # removing stop words            
        for word in new_text:                
            lemma.lemmatize(word)                
            # lemmatizing words            
        for word in new_text:                
            stemm.stem(word)            
            # stemming words                        
        new_text = " ".join(new_text)            
        converted_text.append(new_text)                  
        # add to converted_text list     
    ###########################################################################
    # CONVERT LIST TO DATA FRAME AND RETURN COLUMN
    ###########################################################################
    txt_ppr = pd.DataFrame(converted_text, 
                       columns = [col_name], index = text_df.index)
    # setting up a new data frame to fill with pre-processed text  
    return txt_ppr

def Model_Stacking_String_Matrix(df, col_name, X, Y, **params):
    """
    # This function will take in a full data frame 
    # that contains a column containing string data, along with a  
    # text column from the training set and the corresponding labels.
    # In the new data frame it will isolate the text
    # column, preprocess the string data, use TF_IDF to vectorize the column, and 
    # finally fit a Logistic Regression model with the passed parameters 
    # to the training set. It will use this Logistic Regression model trained on
    # the X and Y data to make predictions on the specified column(col_name) in
    # the data frame (df) passed to the function, and then finally put the 
    # columns together and return a dataframe contiaining the text column 
    # predictions in the exact same order 

    ASSUMPTIONS: User is passing a dataframe where the missing values have 
    been dealt with and standard scaling has either taken place or is not
    needed. Ideally, Logisitic Regression and the parameters passed have been
    determined to be the best fit for the string data in the column

    Args: 
          df: A dataframe containing a column of string data
          col_name: A string matching the column name of the column to be 
          transformed
          X: a column containing string data from the training set 
          Y: a column of integers indicating the target column from the 
          training set used
          params: A dictionary of parameters for the Logistic Regression model

    Return: 
          text_df: the same dataframe as df except with the colmmn referenced by
          col_name replaced by a column of predictions
    """
    ###########################################################################
    # PRE_PROCESS STRING COLUMN
    ###########################################################################
    df_copy = df.copy() 
    col_index = df.columns.get_loc(col_name)
    # saving both copy of data frame and index number 
    # for column for use at the end of the function 
    for col in df.columns:        
        # Isolating the text column       
        # note - col is the column name not an index here   
        if col != col_name:             
            del df[col]             
            # only keep 'col_name' column                   
    df = basic_text_preprocessor(df[col_name], col_name)    
        # using basic_text_preprocessor function from above
        # to process the string data before vectorizing it   
    ###########################################################################
    # VECTORIZE STRING DATA
    ###########################################################################
    TF_IDF = TfidfVectorizer() 
    TF_IDF.fit(X)         
    # Initialize TF_IDF and  fit to train text column X           
    XT = TF_IDF.transform(X)
    v_txt = TF_IDF.transform(df[col_name])
    # Transform both X and the data frame column 'col_name'
    # into TF_IDF vectors to be fit with Logistic Regression
    ##########################################################################
    # GET PREDICTION COLUMNS FROM MODEL
    ##########################################################################            
    MODEL1 = LogisticRegression(**params).fit(XT, np.ravel(Y))
    # Fit model with tf_idf transformed train data X1T and train labels Y                     
    predictions = MODEL1.predict_proba(v_txt)[:,1]
    # Get prediction probabilities for vectorized text df column 
    new_col = col_name + str(' (predictions)')
    # Naming a new column using original name plus the word 'predictions'
    ##########################################################################
    # FIT PREDICTION PROBABILITIES BACK INTO DATA FRAME AND RETURN
    ##########################################################################
    predictions = pd.DataFrame(predictions, columns = [new_col])
    # Make predictions on new data column and convert into a dataframe column
    predictions.index = df.index
    # Keep index from original data frame
    text_df = pd.concat([df_copy, predictions], axis = 1, join='outer')     
    # Recreating original dataframe with prediction probabilities added
    text_df.drop(columns = [col_name], inplace = True)
    # removing column matching col_name
    column_to_move = text_df.pop(new_col)
    # getting new_col ready to move into place of col_name
    text_df.insert(col_index, new_col, column_to_move)       
    # inserting 'new_col in place of 'col_name'
    return text_df


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/edwardmiller/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/edwardmiller/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/edwardmiller/nltk_data...


In [2]:
food_list = ['chocolate chip cookie',
             'baked potato', 
             'creamed corn', 
             'vanilla ice cream', 
             'buttermilk pancakes', 
             'tomato soup',  
             'chocolate fudge sundae',
             'carrot cake',  
             'vanilla shake',
             'chocolate brownie',
             'strawberry cupcake',
             'french toast',
             'vanilla yogurt',
             'banana yogurt',
             'T-Bone Steak',
             'Bratwurst',
             'Slice of Pepperoni Pizza' ]
# Making list of food items, some are desserts and some are not

calories_list = [55,145, 92, 137, 239, 91, 140, 200, 
                 254, 440, 200, 149, 208, 150, 249, 283, 100]
# Making list of calories matching food items

labels = [1,0,0,1,0,0,1,1,1,1,1,0,0,0,0,0,0]
# Making target column of labels 
# saying whether food is dessert or not

food_df = pd.DataFrame(list(zip(food_list, calories_list, labels)), 
                       columns=["Food Type", "Calories", "Dessert Item"])
display(food_df)
print(food_df.dtypes)
# creating training dataframe and looking at result 

food_list_test = ['ice cream cone',
             'pumpkin pie', 
             'hot dog', 
             'vegetable casserole', 
             'Reuben Sandwich', 
             'strawberry popsicle']
calories_list_test = [175,323, 127, 80, 618, 60] 
labels_test = [1,1,0,0,0,1]           

food_df_test = pd.DataFrame(list(zip(food_list_test, calories_list_test, labels_test)), 
                       columns=["Food Type", "Calories", "Dessert Item"])
display(food_df_test)
print(food_df_test.dtypes)
# creating test dataframe and looking at result


Unnamed: 0,Food Type,Calories,Dessert Item
0,chocolate chip cookie,55,1
1,baked potato,145,0
2,creamed corn,92,0
3,vanilla ice cream,137,1
4,buttermilk pancakes,239,0
5,tomato soup,91,0
6,chocolate fudge sundae,140,1
7,carrot cake,200,1
8,vanilla shake,254,1
9,chocolate brownie,440,1


Food Type       object
Calories         int64
Dessert Item     int64
dtype: object


Unnamed: 0,Food Type,Calories,Dessert Item
0,ice cream cone,175,1
1,pumpkin pie,323,1
2,hot dog,127,0
3,vegetable casserole,80,0
4,Reuben Sandwich,618,0
5,strawberry popsicle,60,1


Food Type       object
Calories         int64
Dessert Item     int64
dtype: object


In [3]:
# train data -  food_df
# test data - food_df_test

col_name = "Food Type"
label_name = "Dessert Item"
params = {'C': 1, 'penalty': 'l2'}

display(food_df_test)
# Before functions


Full_test = Model_Stacking_String_Matrix(food_df_test.copy(), 
                                   col_name, 
                                   food_df[col_name].copy(), 
                                   food_df[label_name].copy(),
                                   **params)
display(Full_test)
# After functions


Unnamed: 0,Food Type,Calories,Dessert Item
0,ice cream cone,175,1
1,pumpkin pie,323,1
2,hot dog,127,0
3,vegetable casserole,80,0
4,Reuben Sandwich,618,0
5,strawberry popsicle,60,1


Unnamed: 0,Food Type (predictions),Calories,Dessert Item
0,0.500114,175,1
1,0.397064,323,1
2,0.397064,127,0
3,0.397064,80,0
4,0.397064,618,0
5,0.481061,60,1
