# Logistic Regression, Full Features

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../data'))
sys.path.append(os.path.abspath('../models'))

import pathlib
import json
from datetime import datetime

import numpy as np
import pandas as pd
import pickle

import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to C:\Users\Han-chung
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Loading

In [2]:
data_root = pathlib.Path('../data')

In [3]:
# Collect the Kickstarter dumps as plain string paths. `glob` yields entries in
# arbitrary, OS-dependent order, so sort them: a later cell loads
# `all_json_paths[0]` and must pick the same file on every machine.
all_json_paths = sorted(str(path) for path in data_root.glob('*.json'))

In [4]:
all_json_paths

['..\\data\\Kickstarter_2019-01-17T03_20_02_630Z.json',
 '..\\data\\Kickstarter_2019-02-14T03_20_04_734Z.json',
 '..\\data\\Kickstarter_2019-03-14T03_20_12_200Z.json',
 '..\\data\\Kickstarter_2019-04-18T03_20_02_220Z.json',
 '..\\data\\Kickstarter_2019-05-16T03_20_20_822Z.json',
 '..\\data\\Kickstarter_2019-06-13T03_20_35_801Z.json',
 '..\\data\\Kickstarter_2019-07-18T03_20_05_009Z.json',
 '..\\data\\Kickstarter_2019-08-15T03_20_03_022Z.json']

In [5]:
# Every line of the dump is parsed as a standalone JSON document; the campaign
# fields sit under its 'data' key. The context manager closes the file handle
# even if a line fails to parse.
with open(all_json_paths[0], 'r', encoding='utf8') as json_file:
    data = [json.loads(line)['data'] for line in json_file]

raw = pd.DataFrame.from_records(data)

# 2. Preprocessing

In [77]:
def preproc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Naive preprocessing of the raw Kickstarter records.

    Derives the campaign duration in days, reduces the category to its first-level
    slug, maps the campaign state to a binary label, drops campaigns that are still
    running ('live'), and keeps only the columns used downstream. No imputation,
    scaling or one-hot encoding is performed here.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records providing the columns ``name``, ``blurb``, ``goal``,
        ``country``, ``deadline``, ``launched_at``, ``category`` and ``state``.
        ``deadline`` and ``launched_at`` are treated as second counts, and each
        ``category`` entry must support ``entry['slug']`` returning a
        '/'-separated string.

    Returns
    ----------
    df : pandas.DataFrame
        Frame with the columns ``name``, ``blurb``, ``goal``, ``country``,
        ``duration``, ``category`` and ``state``. Rows are filtered but not
        renumbered, so the original index labels are kept.
    """
    # work on a copy so the caller's frame is not altered as a side effect
    df = df.copy()

    # get durations by taking the difference between launch and deadline and
    # transform the seconds into (rounded) days.
    seconds_per_day = 60 * 60 * 24
    df['duration'] = ((df.deadline - df.launched_at) / seconds_per_day).round()

    # extract the first-level category from each entry's 'slug' (text before the first '/')
    df['category'] = df.category.apply(lambda x: x['slug'].split('/')[0])

    # map states to 1 for success and 0 for the other finished states, then drop
    # all 'live' records. States missing from the mapping are left unchanged.
    state_dict = {'successful': 1, 'failed': 0, 'canceled': 0, 'suspended': 0}
    df = df.replace({"state": state_dict})
    df = df[df.state != 'live']

    # drop unused features
    df = df[['name', 'blurb', 'goal', 'country', 'duration', 'category', 'state']]

    return df

In [79]:
# Preprocess a copy so the untouched `raw` frame stays available for re-runs.
df = preproc(raw.copy())

# Tabular (non-text) predictors.
X_col = ['goal', 'duration', 'country', 'category']
X = df[X_col]

# The state column comes out of preprocessing as object dtype; cast it to int,
# otherwise scikit-learn will not accept it as a target. See:
# https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown
y = df['state'].astype('int')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 4), (30444, 4), (172516,), (30444,))

## 2.2 Transform Text Data Into Embeddings

In [9]:
d2v = Doc2Vec.load("../models/d2v.model")

In [10]:
name_test = word_tokenize(df.name[0].lower())
name_v_test = d2v.infer_vector(name_test)

In [11]:
name_v_test

array([-2.6723077 ,  1.1827933 ,  2.889591  ,  3.2563348 , -1.398114  ,
        0.7121932 ,  3.0553718 , -0.16616717, -0.30374274,  1.5237863 ],
      dtype=float32)

In [12]:
blurb_test = word_tokenize(df.blurb[0].lower())
blurb_v_test = d2v.infer_vector(blurb_test)

In [13]:
blurb_v_test

array([-1.2961464, -3.8206117,  3.3928459,  1.992708 , -3.2097907,
       -1.1067653,  5.9106565,  1.3232312, -1.1453363,  2.450148 ],
      dtype=float32)

In [14]:
%%timeit 
name_v = d2v.infer_vector(word_tokenize(df.name[0].lower()))

4.05 ms ± 76.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 15 MIN RUNTIMNE !!!!!!!!!!!!!!!!!!
#
%time name_embedding = df.name.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

Wall time: 15min 20s


In [16]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 20 MIN RUNTIME !!!!!!!!!!!!!!!!!!!!!
#
%time blurb_embedding = df.blurb.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

Wall time: 18min 36s


## 2.3 Preprocessing

In [17]:
# Stack the per-row embedding vectors into 2-D arrays (one row per campaign).
name = np.vstack(name_embedding.apply(lambda x: x.flatten()).to_list())
blurb = np.vstack(blurb_embedding.apply(lambda x: x.flatten()).to_list())

# Turn each remaining feature into a column vector so everything can be hstack-ed.
goal = df.goal.to_numpy().reshape(-1, 1)
country = df.country.to_numpy().reshape(-1, 1)
# preproc() names this column 'duration' (singular); `df.durations` raises
# AttributeError. The variable keeps its plural name because later cells use it.
durations = df.duration.to_numpy().reshape(-1, 1)
cat = df.category.to_numpy().reshape(-1, 1)

In [18]:
name.shape, blurb.shape, goal.shape, country.shape, durations.shape, cat.shape

((202960, 10),
 (202960, 10),
 (202960, 1),
 (202960, 1),
 (202960, 1),
 (202960, 1))

In [19]:
X = np.hstack([name, blurb, goal, country, durations, cat])

In [20]:
X.shape

(202960, 24)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 24), (30444, 24), (172516,), (30444,))

# 3. Training

In [22]:
def logistic_regression(X, y):
    """
    Logistic regression model using GridSearchCV. Since GridSearchCV does cross validation internally,
    we choose not to split X into training and validation set. We choose to do 5 fold cross validation
    during GridSearch. With that, data issplit three ways: 0.68 train, 0.17 validation, and 0.15 test.
    We will continue to use OneHotEncoding and StandardScaler in our training pipeline. Since some of
    the categorical features have very high cardinality, e.g., funder with 1898 categories, we choose
    to take only the top 6 with high cardinality to reduce training time.
    
    Parameters
    ----------
    X : training data
    y : target data
    
    Returns
    ----------
    search.best_estimator : the best Logistic Regression model produced by the GridSearchCV
    """

    logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    scaler = StandardScaler(with_mean=False)

    pipe = Pipeline(steps=[('encoder', encoder),
                           ('scaler', scaler),
                           ('logreg', logreg)
                           ])
    
    param_grid = {
        'logreg__C': np.power(10.0, np.arange(3, 10)),
    }
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
    %time search.fit(X, y)
    print("Training Score (accuracy): {}".format(search.best_score_))
    print("Best Parameters: {}".format(search.best_params_))
    
    return search.best_estimator_

## 3.1 Full Features
We will train a logistic regression model with all the features. We first convert the text into document embeddings of size 10. Then we combine them with the other features into a single feature matrix. The matrix is fed into a scikit-learn pipeline that applies encoding and scaling.

In [23]:
model = logistic_regression(X_train, y_train)

Wall time: 7min 21s
Training Score (accuracy): 0.6213858424725822
Best Parameters: {'logreg__C': 10000.0}


In [24]:
model.score(X_test, y_test)

0.6192024701090527

## 3.2 Text Features only
The full-feature model has a test accuracy of only 61.9%, lower than our original 68% logistic regression model without text. Name and blurb might not have much predictive power for the success of a Kickstarter campaign. Let's test this quickly by using only name and blurb as input features to the model.

In [25]:
X_text = np.hstack([name, blurb])
X_train, X_test, y_train, y_test = train_test_split(X_text, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 20), (30444, 20), (172516,), (30444,))

In [26]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500) 
pipe = Pipeline(steps=[('logreg', logreg)])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}   

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)

print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_#(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500).fit(X_train, y_train)

Wall time: 10.1 s
Training Score (accuracy): 0.6557768554800714
Best Parameters: {'logreg__C': 1000.0}


In [27]:
model.score(X_test, y_test)

0.6544146629877808

## 3.3 Full Features, Modified Pipeline
Interesting. The model with text features only reaches about 65.4% test accuracy, so the text features do carry useful signal. Perhaps the problem lies in the pipeline, specifically the scaler step. So we will test the full features without the scaler in the scikit-learn pipeline, and instead scale `goal` and `duration` outside of the pipeline.

In [28]:
# Standardise the two numeric columns by hand (subtract the mean, divide by the
# standard deviation) instead of relying on a scaler step inside the pipeline.
goal_norm = (goal - goal.mean()) / goal.std()
dur_norm = (durations - durations.mean()) / durations.std()

# Full feature matrix: embeddings, scaled numerics, then the raw categoricals.
X_text = np.hstack([name, blurb, goal_norm, dur_norm, country, cat])
X_train, X_test, y_train, y_test = train_test_split(
    X_text, y.to_numpy(), test_size=0.15, random_state=45
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 24), (30444, 24), (172516,), (30444,))

In [29]:
# logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
# encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

# pipe = Pipeline(steps=[('encoder', encoder),
#                        ('logreg', logreg)
#                        ])

# param_grid = {
#     'logreg__C': np.power(10.0, np.arange(3, 10)),
# }

# search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
# %time search.fit(X_train, y_train)
# print("Training Score (accuracy): {}".format(search.best_score_))
# print("Best Parameters: {}".format(search.best_params_))

# model = search.best_estimator_

model.score(X_test, y_test)

model.named_steps['logreg'].coef_.shape

df.country.value_counts(dropna=False).index.shape, df.category.value_counts(dropna=False).index.shape

## 3.4 Full Features, Manual Transformation, No Pipeline
Looking at the number of coefficients, something is seriously wrong. We expect 10 for name, 10 for blurb, 1 for goal, 1 for duration, 22 for country, and 15 for category, which is a total of 59 parameters. But our `coef_` has 3,448,327. The most probable culprit is the encoder: inside the pipeline, `OneHotEncoder` is applied to every column of the feature matrix, including the continuous embedding, goal and duration columns, so each distinct value becomes its own feature. We will manually encode only the categorical columns to make it work.

In [31]:
ohe = OneHotEncoder(categories='auto', handle_unknown='ignore')
cc = ohe.fit_transform(country).toarray()

In [32]:
cc.shape

(202960, 22)

In [35]:
# Encode the category column with its own encoder. Re-fitting `ohe` here would
# overwrite the country categories it learned above and leave `ohe2` unused.
ohe2 = OneHotEncoder(categories='auto', handle_unknown='ignore')
catcat = ohe2.fit_transform(cat).toarray()

In [36]:
catcat.shape

(202960, 15)

In [37]:
goal_norm = (goal - goal.mean())/goal.std()
dur_norm = (durations - durations.mean())/durations.std()

In [38]:
name.shape, blurb.shape, goal_norm.shape, dur_norm.shape, cc.shape, catcat.shape

((202960, 10),
 (202960, 10),
 (202960, 1),
 (202960, 1),
 (202960, 22),
 (202960, 15))

In [39]:
X_text = np.hstack([name, blurb, goal_norm, dur_norm, cc, catcat])
X_train, X_test, y_train, y_test = train_test_split(X_text, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 59), (30444, 59), (172516,), (30444,))

In [40]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)

pipe = Pipeline(steps=[('logreg', logreg)
                       ])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)
print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_

Wall time: 2min 45s
Training Score (accuracy): 0.6902606135083122
Best Parameters: {'logreg__C': 1000.0}


In [41]:
model.score(X_test, y_test)

0.6894626198922612

In [42]:
model.named_steps['logreg'].coef_.shape

(1, 59)

In [50]:
# Persist the fitted pipeline; the context manager flushes and closes the file.
model_filename = '../models/20191023_logreg_text_69.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## 3.5 Full Features, Production Pipeline
That's better. Now the number of feature parameters makes sense, and the test accuracy is higher as well, at 0.689. Further, the model size is now only 1.9kB. Next we will create a `ColumnTransformer` for the features that do not go through the doc2vec model. The reason we choose not to incorporate doc2vec into the pipeline yet is the speed of the transformation: embedding all names and blurbs takes roughly 34 minutes (about 15 for the names and 19 for the blurbs).

In [80]:
# Column groups handled by the transformer; the text columns are embedded separately.
numeric_features = ['goal', 'duration']
categorical_features = ['country', 'category']

# Numeric columns are scaled without centering; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
numeric_transformer = StandardScaler(with_mean=False)
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Any column not listed above (e.g. name, blurb) is dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [81]:
# Feature-only frame with a fresh 0..n-1 index. Building a new object avoids the
# in-place edits on an `iloc` slice of `df` (SettingWithCopyWarning) and keeps
# this cell safe to re-run.
df_no_state = df.drop(columns='state').reset_index(drop=True)

In [82]:
df_no_state.columns

Index(['name', 'blurb', 'goal', 'country', 'duration', 'category'], dtype='object')

In [83]:
df.columns

Index(['name', 'blurb', 'goal', 'country', 'duration', 'category', 'state'], dtype='object')

In [84]:
# Fit the column transformer and persist it for use at inference time.
# NOTE(review): it is fitted on all rows, i.e. before the train/test split that
# follows, so test rows contribute to the fitted scaling. Consider fitting on
# the training rows only.
ct_model = preprocessor.fit(df_no_state)
ct_filename = '../models/20191023_preproc.sav'
with open(ct_filename, 'wb') as ct_file:
    pickle.dump(ct_model, ct_file)

In [51]:
# Reload the transformer from disk to check that the saved artifact works.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open(ct_filename, 'rb') as ct_file:
    ct_model = pickle.load(ct_file)
X_transformed = ct_model.transform(df).toarray()

In [52]:
X_FF = np.hstack([name, blurb, X_transformed])
X_FF.shape

(202960, 59)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_FF, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 59), (30444, 59), (172516,), (30444,))

In [54]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)

pipe = Pipeline(steps=[('logreg', logreg)])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)
print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_

Wall time: 2min 48s
Training Score (accuracy): 0.6903011894548912
Best Parameters: {'logreg__C': 100000.0}


In [55]:
model.score(X_test, y_test)

0.689396925502562

In [56]:
model.named_steps['logreg'].coef_.shape

(1, 59)

In [57]:
# Persist the production model. This is the same path used in section 3.4, so
# that earlier artifact is overwritten by the model trained on the
# column-transformer features.
filename = '../models/20191023_logreg_text_69.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# 4. Inferencing
Now that we have a working model, we walk through taking a single sample and making an inference with it. The input data goes through the same transformations as the training data: we use our trained doc2vec model to turn the name and blurb fields into vectors, and our saved column transformer to transform goal, country, duration, and category. The results are then combined and sent to the model to generate a predicted probability.

In [85]:
# Sample request payload, as a raw JSON string.
test1 = '''{"name": "This is a test Kickstarter header", "blurb": "This is an example description of a kickstarter project to test for the API. I would like to thank my wife, parents, and all my loving family members for this to work. I would also like to thank all the Kickstarter team members and project leads for making this possible.", 
            "goal": 2011.0, 
            "country": "US", 
            "duration":67.0, 
            "category": "publishing"}'''

# Parse the payload and wrap the single record in a one-row frame (index label 0)
# using the training column order.
test1j = json.loads(test1)
test1df = pd.DataFrame(
    test1j,
    index=[0],
    columns=['name', 'blurb', 'goal', 'country', 'duration', 'category'],
)

In [86]:
D2V_FILENAME = "../models/d2v.model"
CT_FILENAME = '../models/20191023_preproc.sav'
MODEL_FILENAME = "../models/20191023_logreg_text_69.sav"

# Load the three artifacts needed for inference: the doc2vec model, the fitted
# column transformer and the classifier.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
d2v = Doc2Vec.load(D2V_FILENAME)
with open(CT_FILENAME, 'rb') as ct_file:
    ct = pickle.load(ct_file)
with open(MODEL_FILENAME, 'rb') as model_file:
    model = pickle.load(model_file)

In [114]:
# preprocessing
# Use the transformer loaded from disk (`ct`) rather than the in-memory
# `ct_model` left over from training, so the inference path exercises the saved
# artifact and also works in a fresh kernel.
X_none_text = ct.transform(test1df).toarray()

# Embed name and blurb; reshape each 1-D vector into a single-row matrix.
n = d2v.infer_vector(word_tokenize(test1df.name[0].lower())).reshape(1, -1)
b = d2v.infer_vector(word_tokenize(test1df.blurb[0].lower())).reshape(1, -1)

# Same column order as in training: name embedding, blurb embedding, tabular features.
X_inference = np.hstack([n, b, X_none_text])
X_inference.shape

(1, 59)

In [116]:
# `predict_proba(...)[:, 1]` is the probability of class 1 (success), not of the
# predicted class; the message now says so instead of reading as
# "class 0 with probability 0.30".
predicted_label = model.predict(X_inference)[0]
success_probability = model.predict_proba(X_inference)[0, 1]
print(f"Our model predicts that the campaign will be a {predicted_label} (1 = success, 0 = not successful); estimated probability of success: {success_probability}")

Our model predicts that the campaign will be a 0 with probability of 0.29845840433373644
