# Logistic Regression, Full Features

In [18]:
import sys
import os
sys.path.append(os.path.abspath('../data'))
sys.path.append(os.path.abspath('../models'))

import pathlib
import json
import time
from datetime import datetime

import numpy as np
import pandas as pd
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to C:\Users\Han-chung
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Loading

In [2]:
# Folder that holds the Kickstarter JSON dumps, relative to this notebook's directory.
data_root = pathlib.Path('../data')

In [3]:
# glob() yields entries in a filesystem-dependent order. Sort the paths so that
# all_json_paths[0] (the dump loaded below) is the same file on every machine.
all_json_paths = sorted(str(path) for path in data_root.glob('*.json'))

In [4]:
all_json_paths

['..\\data\\Kickstarter_2019-01-17T03_20_02_630Z.json',
 '..\\data\\Kickstarter_2019-02-14T03_20_04_734Z.json',
 '..\\data\\Kickstarter_2019-03-14T03_20_12_200Z.json',
 '..\\data\\Kickstarter_2019-04-18T03_20_02_220Z.json',
 '..\\data\\Kickstarter_2019-05-16T03_20_20_822Z.json',
 '..\\data\\Kickstarter_2019-06-13T03_20_35_801Z.json',
 '..\\data\\Kickstarter_2019-07-18T03_20_05_009Z.json',
 '..\\data\\Kickstarter_2019-08-15T03_20_03_022Z.json']

In [5]:
# Every line of the dump is one JSON document; the project payload sits under its
# 'data' key. Only the first dump is loaded. The context manager makes sure the
# file handle is closed once all lines have been parsed.
with open(all_json_paths[0], 'r', encoding='utf8') as dump_file:
    data = [json.loads(line)['data'] for line in dump_file]

raw = pd.DataFrame.from_records(data)

# 2. Preprocessing

In [6]:
def preproc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Naive preprocessing of the raw campaign records.

    Derives the campaign duration in days and the first-level category slug, maps the
    campaign state to a binary label, drops campaigns that are still running ('live')
    and keeps only the columns used for training. No encoding or scaling happens here.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must provide the columns `deadline`, `launched_at`, `category`,
        `state`, `name`, `blurb`, `goal` and `country`.

    Returns
    ----------
    df : pandas.DataFrame
        New frame with the columns `name`, `blurb`, `goal`, `country`, `durations`,
        `cat_slug` and `state`. The original row index is kept (it is not reset).
    """
    # Work on a copy so the derived columns are not written into the caller's frame.
    df = df.copy()

    # get durations by taking the difference between launch and deadline and transform
    # the seconds integer into (rounded) days.
    seconds_per_day = 60 * 60 * 24
    df['durations'] = ((df.deadline - df.launched_at) / seconds_per_day).round()

    # parse the category feature's json format and extract the first level categories
    df['cat_slug'] = df.category.apply(lambda x: x['slug'].split('/')[0])

    # map states to 1 for success and 0 for others. 'live' has no mapping on purpose:
    # those rows keep the string and are dropped right after.
    state_dict = {'successful': 1, 'failed': 0, 'canceled': 0, 'suspended': 0}
    df = df.replace({"state": state_dict})
    df = df[df.state != 'live']

    # drop unused features
    df = df[['name', 'blurb', 'goal', 'country', 'durations', 'cat_slug', 'state']]

    return df

In [7]:
# Column names of the raw frame, kept for reference (not used again in the cells below).
cols_names = raw.columns.to_list()

In [8]:
# Preprocess a copy so `raw` stays available when this cell is re-run.
df = raw.copy()
df = preproc(df)
# Non-text features; `name` and `blurb` are turned into embeddings further down.
X_col = ['goal', 'durations', 'country', 'cat_slug']
X = df[X_col]
# `state` has object dtype at this point, so cast it to int; otherwise scikit-learn
# rejects the labels with "Unknown label type".
# https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown
y = df.state.astype('int')

In [9]:
# Hold out 15% of the rows as the test set; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.15, random_state=45)
# Show the resulting shapes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 4), (30444, 4), (172516,), (30444,))

## 2.2 Transform Text Data Into Embeddings

In [25]:
# Load the previously saved Doc2Vec model; it is used below to embed `name` and `blurb`.
d2v = Doc2Vec.load("../models/d2v.model")

In [33]:
# Smoke-test the embedding on the first remaining campaign name. Positional access is
# used because preproc() filters rows without resetting the index, so the label 0 is
# not guaranteed to exist any more.
name_test = word_tokenize(df['name'].iloc[0].lower())
name_v_test = d2v.infer_vector(name_test)

In [34]:
name_v_test

array([-2.7140505 ,  1.157532  ,  2.4612043 ,  2.9914167 , -1.1569723 ,
        0.6713491 ,  3.145516  ,  0.14941326, -0.81775874,  1.805982  ],
      dtype=float32)

In [35]:
# Same smoke test for the first remaining blurb. Positional access is used because
# preproc() filters rows without resetting the index, so label 0 may be missing.
blurb_test = word_tokenize(df['blurb'].iloc[0].lower())
blurb_v_test = d2v.infer_vector(blurb_test)

In [36]:
blurb_v_test

array([-1.0438399, -3.5648508,  3.5880609,  2.0317347, -3.2620676,
       -1.0832196,  6.070351 ,  1.132646 , -1.37094  ,  2.8216355],
      dtype=float32)

In [43]:
%%timeit 
name_v = d2v.infer_vector(word_tokenize(df.name[0].lower()))

4.59 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 15 MIN RUNTIME !!!!!!!!!!!!!!!!!!
#
# Infers one Doc2Vec vector per campaign name. `asdf` therefore holds the name
# embeddings; it is stacked into the `name` matrix in the preprocessing cell below.
%time asdf = df.name.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

Wall time: 15min 1s


In [50]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 20 MIN RUNTIME !!!!!!!!!!!!!!!!!!!!!
#
# Infers one Doc2Vec vector per campaign blurb (lower-cased and tokenized first).
# NOTE(review): x.lower() assumes every blurb is a string -- confirm there are no missing values.
%time blurb_embedding = df.blurb.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

Wall time: 19min 9s


## 2.3 Preprocessing

In [120]:
# Stack the per-row embedding vectors into 2-D matrices, one row per campaign.
name = np.vstack([vec.flatten() for vec in asdf])
blurb = np.vstack([vec.flatten() for vec in blurb_embedding])

# Turn each remaining feature into a single-column 2-D array so that it can be
# placed next to the embedding matrices with np.hstack.
goal = df.goal.to_numpy()[:, np.newaxis]
country = df.country.to_numpy()[:, np.newaxis]
durations = df.durations.to_numpy()[:, np.newaxis]
cat = df.cat_slug.to_numpy()[:, np.newaxis]

In [121]:
name.shape, blurb.shape, goal.shape, country.shape, durations.shape, cat.shape

((202960, 10),
 (202960, 10),
 (202960, 1),
 (202960, 1),
 (202960, 1),
 (202960, 1))

In [125]:
# Column layout: 0-9 name embedding, 10-19 blurb embedding, 20 goal, 21 country,
# 22 durations, 23 category.
# NOTE(review): `country` and `cat` presumably hold strings, so stacking them with the
# float columns most likely yields a non-numeric (object) array -- confirm the dtype,
# because the training pipeline's OneHotEncoder is applied to every column of X.
X = np.hstack([name, blurb, goal, country, durations, cat])

In [126]:
X.shape

(202960, 24)

In [128]:
# Re-split with the text embeddings included, using the same test_size and random_state
# as the earlier split. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y.to_numpy(), test_size=0.15, random_state=45)
# Show the resulting shapes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 24), (30444, 24), (172516,), (30444,))

# 3. Training

In [129]:
def logistic_regression(X, y, categorical_cols=None):
    """
    Fit a logistic regression with GridSearchCV over the inverse regularisation
    strength C and return the best pipeline.

    GridSearchCV cross-validates internally (5 folds), so X is not split into a separate
    training and validation set beforehand. Together with the 15% test hold-out made by
    the caller, the data is split three ways: 0.68 train, 0.17 validation, 0.15 test.

    Parameters
    ----------
    X : training data, 2-D array of shape (n_samples, n_features)
    y : target data
    categorical_cols : list of int, optional
        Column indices of X that hold categorical values. When given, only these columns
        are one-hot encoded and every other column is standardised. When omitted (the
        default) the original pipeline is used: EVERY column of X is one-hot encoded and
        the encoded matrix is scaled without centring. Note that with the default,
        continuous columns (e.g. embeddings, goal) are treated as categories as well.

    Returns
    ----------
    search.best_estimator_ : the best pipeline (preprocessing + LogisticRegression)
        produced by the GridSearchCV
    """

    logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

    if categorical_cols is None:
        # Original behaviour: encode all columns, then scale the encoded matrix.
        # with_mean=False because the scaler runs on the encoder's output.
        scaler = StandardScaler(with_mean=False)
        steps = [('encoder', encoder),
                 ('scaler', scaler),
                 ('logreg', logreg)]
    else:
        # Encode only the categorical columns; standardise everything else.
        features = ColumnTransformer(
            transformers=[('encoder', encoder, list(categorical_cols))],
            remainder=StandardScaler(),
        )
        steps = [('features', features),
                 ('logreg', logreg)]

    pipe = Pipeline(steps=steps)

    param_grid = {
        'logreg__C': np.power(10.0, np.arange(3, 10)),
    }

    search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)

    # Plain-Python timing instead of the %time magic, so the function also works
    # outside of IPython.
    started = time.perf_counter()
    search.fit(X, y)
    print("Wall time: {:.1f} s".format(time.perf_counter() - started))
    print("Training Score (accuracy): {}".format(search.best_score_))
    print("Best Parameters: {}".format(search.best_params_))

    return search.best_estimator_

## 3.1 Full Features
We will train using logistic regression with all the features. We first convert the text into document embeddings of size 10. Then we combine them with the other features into a single feature matrix. The matrix will be fed into a scikit-learn pipeline with scaling and encoding.

In [130]:
model = logistic_regression(X_train, y_train)

Wall time: 7min 45s
Training Score (accuracy): 0.6209395070602147
Best Parameters: {'logreg__C': 10000.0}


In [131]:
model.score(X_test, y_test)

0.617987123899619

## 3.2 Text Features only
The full-feature model (with text) has a testing accuracy of only 61.8%, lower than our original 68% logistic regression model without text. Name and blurb might not have much predictive power for the success of a Kickstarter campaign. Let's test that quickly by using only name and blurb as input features to the model.

In [142]:
# Text-only feature matrix: the name embedding next to the blurb embedding.
X_text = np.hstack([name, blurb])
# Same test_size and random_state as before; rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X_text, y.to_numpy(), test_size=0.15, random_state=45)
# Show the resulting shapes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 20), (30444, 20), (172516,), (30444,))

In [143]:
# Same estimator settings as in logistic_regression(), but the pipeline has no encoder
# or scaler step: the embedding columns are passed to the model as they are.
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500) 
pipe = Pipeline(steps=[('logreg', logreg)])

# Same grid for C as in logistic_regression(): 10^3 ... 10^9.
param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}   

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)

print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

# Rebinds `model` to the text-only pipeline (it previously held the full-feature one).
model = search.best_estimator_

Wall time: 10.4 s
Training Score (accuracy): 0.6550580815692457
Best Parameters: {'logreg__C': 1000.0}


In [145]:
model.score(X_test, y_test)

0.6558270923663119

## 3.3 Full Features, Modified Pipeline
Interesting. The model with text features only reaches 65.5% accuracy, so the text features do carry useful signal. Perhaps the problem lies in the pipeline, specifically the scaler step. So we will test the full feature set without the scaler in the scikit-learn pipeline and instead scale `goal` and `durations` outside of the pipeline.

In [146]:
# Split row indices first (same test_size / random_state as the other splits in this
# notebook) so that the scaling statistics can be taken from the training rows only.
row_idx = np.arange(goal.shape[0])
train_idx, test_idx = train_test_split(row_idx, test_size=0.15, random_state=45)

# Standardise with the TRAINING mean/std. Statistics computed over all rows would leak
# information about the test rows into the features.
goal_norm = (goal - goal[train_idx].mean())/goal[train_idx].std()
dur_norm = (durations - durations[train_idx].mean())/durations[train_idx].std()

# Column layout: 0-9 name, 10-19 blurb, 20 goal, 21 duration, 22 country, 23 category.
# (The name X_text is kept for compatibility although this is the full feature matrix.)
X_text = np.hstack([name, blurb, goal_norm, dur_norm, country, cat])
labels = y.to_numpy()
X_train, X_test = X_text[train_idx], X_text[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 24), (30444, 24), (172516,), (30444,))

In [1]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

pipe = Pipeline(steps=[('encoder', encoder),
                       ('logreg', logreg)
                       ])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)
print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_

NameError: name 'LogisticRegression' is not defined

# 4. Saving the Model

filename = '../models/20191022_logreg_68.sav'
# The context manager flushes and closes the file even if pickling fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# 5. Inferencing

import json
test1 = '''{"name": "asdfasdfasdf", "blurb": "asdfasdfasdfadsdfasdfadfasdf", "goal": 800.0, "country": "US", "duration":15.0, "category": "fashion"}'''
test2 = '''{"goal": 800.0, "country": "US", "duration":15.0, "category": "fashion"}'''

# Only the sample without text fields (test2) is scored here.
# NOTE(review): this assumes `model` is the four-feature model fitted on X_col
# (goal, durations, country, cat_slug); the models fitted above expect 24 columns --
# confirm which model is meant.
test1j = json.loads(test2)
# Pass the columns explicitly, in the order of X_col: without `columns`, from_records
# orders the keys of a dict alphabetically, which would scramble the feature order.
test1df = pd.DataFrame.from_records(test1j, index=[0], columns=['goal', 'duration', 'country', 'category'])
model.predict_proba(test1df.to_numpy())[:,1]

## 5.1 Distribution of testing data
Look at the distribution of the prediction for the probability of success.

# Predicted class probabilities for the test set; column 1 is plotted as the
# probability of success (label 1).
y_pred = model.predict_proba(X_test)
# The trailing ';' suppresses the textual repr of hist()'s return value.
plt.hist(y_pred[:,1], bins=40);

# Locate the test sample with the highest predicted success probability once and reuse it.
best_idx = np.argmax(y_pred[:,1])
x_high = X_test[best_idx]

print(f"Highest probability w/in test {y_pred[best_idx, 1]}, highest prob sample location is {best_idx}")
print(x_high)
print(f"The highest probability sample is {x_high}, shape is {x_high.shape}")

model.predict_proba(x_high.reshape(1, -1))

model.predict_proba(X_test[np.argmax(y_pred[:,1])].reshape(1, -1))

# Hand-written feature values (presumably those of the highest-probability test sample
# printed above -- confirm).
# NOTE(review): this rebinds `goal` and `country`, which earlier cells use for the
# per-row feature arrays; re-running those cells after this one would pick up these
# scalars instead. Consider different names here.
goal, duration, country, category = 2011.0, 67.0, 'US', 'publishing'

test3 = '''{"goal": 2011, "country": "US", "duration":67, "category": "publishing"}'''
test3j = json.loads(test3)
# The columns must follow the order the features were fitted in (X_col: goal, durations,
# country, cat_slug). With country and duration swapped, the encoder (which ignores
# unknown values) would silently drop both and return a misleading probability.
# NOTE(review): assumes `model` is the four-feature model -- confirm.
test3df = pd.DataFrame.from_records(test3j, index=[0], columns=['goal', 'duration', 'country', 'category'])
model.predict_proba(test3df.to_numpy())[:,1]