# Logistic Regression, Full Features

In [1]:
import json
import os
import pathlib
import pickle
import sys
from datetime import datetime

sys.path.append(os.path.abspath('../data'))
sys.path.append(os.path.abspath('../models'))

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package punkt to C:\Users\Han-chung
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Loading

In [2]:
data_root = pathlib.Path('../data')

In [3]:
all_json_paths = list(data_root.glob('*.json'))
all_json_paths = [str(path) for path in all_json_paths]

In [4]:
all_json_paths

['..\\data\\Kickstarter_2019-01-17T03_20_02_630Z.json',
 '..\\data\\Kickstarter_2019-02-14T03_20_04_734Z.json',
 '..\\data\\Kickstarter_2019-03-14T03_20_12_200Z.json',
 '..\\data\\Kickstarter_2019-04-18T03_20_02_220Z.json',
 '..\\data\\Kickstarter_2019-05-16T03_20_20_822Z.json',
 '..\\data\\Kickstarter_2019-06-13T03_20_35_801Z.json',
 '..\\data\\Kickstarter_2019-07-18T03_20_05_009Z.json',
 '..\\data\\Kickstarter_2019-08-15T03_20_03_022Z.json']

In [5]:
# Only the first snapshot is loaded. Every line of the file is parsed as its own JSON
# document and the project record is taken from its 'data' key.
# The context manager guarantees the file handle is closed once parsing is done.
with open(all_json_paths[0], 'r', encoding='utf8') as snapshot_file:
    data = [json.loads(line)['data'] for line in snapshot_file]

raw = pd.DataFrame.from_records(data)

# 2. Preprocessing

In [6]:
def preproc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Naive preprocessing of the raw Kickstarter records: derive the campaign duration in
    days, reduce the category to its top-level slug, encode the campaign state as a
    binary label, drop campaigns that are still running, and keep only the modelling
    columns.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records providing the columns ``deadline``, ``launched_at``, ``category``,
        ``state``, ``name``, ``blurb``, ``goal`` and ``country``.

    Returns
    ----------
    df : pandas.DataFrame
        New frame with the columns ``name``, ``blurb``, ``goal``, ``country``,
        ``durations``, ``category`` and ``state``. ``state`` holds 1 for 'successful'
        and 0 for 'failed', 'canceled' and 'suspended'; 'live' rows are removed.
    """
    # Work on a copy so the column assignments below never leak into the caller's frame.
    df = df.copy()

    # get durations by taking the difference between launch and deadline and transform
    # the seconds integer into days.
    df['durations'] = ((df.deadline - df.launched_at) / (60 * 60 * 24)).round()

    # parse the category feature's json format and extract the first level categories
    df['category'] = df.category.apply(lambda x: x['slug'].split('/')[0])

    # map states to 1 for success and 0 for others. Also will drop all 'live' records.
    state_dict = {'successful': 1, 'failed': 0, 'canceled': 0, 'suspended': 0}
    df = df.replace({"state": state_dict})
    df = df[df.state != 'live']

    # drop unused features
    df = df[['name', 'blurb', 'goal', 'country', 'durations', 'category', 'state']]

    return df

In [7]:
df = raw.copy()
df = preproc(df)
X_col = ['goal', 'durations', 'country', 'category']
X = df[X_col]
# need to add .astype('int') to turn it y into int from object. otherise sklearn wont work
# https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown
y = df.state.astype('int')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 4), (30444, 4), (172516,), (30444,))

## 2.2 Transform Text Data Into Embeddings

In [9]:
d2v = Doc2Vec.load("../models/d2v.model")

In [10]:
name_test = word_tokenize(df.name[0].lower())
name_v_test = d2v.infer_vector(name_test)

In [11]:
name_v_test

array([-2.6723077 ,  1.1827933 ,  2.889591  ,  3.2563348 , -1.398114  ,
        0.7121932 ,  3.0553718 , -0.16616717, -0.30374274,  1.5237863 ],
      dtype=float32)

In [12]:
blurb_test = word_tokenize(df.blurb[0].lower())
blurb_v_test = d2v.infer_vector(blurb_test)

In [13]:
blurb_v_test

array([-1.2961464, -3.8206117,  3.3928459,  1.992708 , -3.2097907,
       -1.1067653,  5.9106565,  1.3232312, -1.1453363,  2.450148 ],
      dtype=float32)

In [14]:
%%timeit 
name_v = d2v.infer_vector(word_tokenize(df.name[0].lower()))

4.05 ms ± 76.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 15 MIN RUNTIME !!!!!!!!!!!!!!!!!!
#
# Infers one Doc2Vec vector per campaign name (lower-cased, then tokenised with word_tokenize).
%time name_embedding = df.name.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

In [None]:
#
# !!!!!!!!!!!! WARNING! EXPENSIVE CELL, 20 MIN RUNTIME !!!!!!!!!!!!!!!!!!!!!
#
# Same embedding step applied to the blurb text.
# NOTE(review): x.lower() assumes every blurb is a string — confirm there are no missing values.
%time blurb_embedding = df.blurb.apply(lambda x: d2v.infer_vector(word_tokenize(x.lower())))

## 2.3 Preprocessing

In [None]:
name = np.vstack(name_embedding.apply(lambda x: x.flatten()).to_list())
blurb = np.vstack(blurb_embedding.apply(lambda x: x.flatten()).to_list())
goal = df.goal.to_numpy().reshape(-1, 1)
country = df.country.to_numpy().reshape(-1, 1)
durations = df.durations.to_numpy().reshape(-1, 1)
cat = df.category.to_numpy().reshape(-1, 1)

In [None]:
name.shape, blurb.shape, goal.shape, country.shape, durations.shape, cat.shape

In [None]:
X = np.hstack([name, blurb, goal, country, durations, cat])

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 3. Training

In [None]:
def logistic_regression(X, y):
    """
    Fit a one-hot-encode -> scale -> logistic-regression pipeline and tune the inverse
    regularisation strength ``C`` with a 5-fold GridSearchCV.

    Since GridSearchCV does cross validation internally, X is not split into separate
    training and validation sets here. When X is the 85% training portion of the data,
    5 folds give an effective three-way split: 0.68 train, 0.17 validation, 0.15 test.

    Note that the encoder is the first pipeline step and therefore receives *every*
    column of X, not only the categorical ones.

    Parameters
    ----------
    X : training data
    y : target data

    Returns
    ----------
    search.best_estimator_ : the best pipeline (encoder, scaler, logistic regression)
        found by the GridSearchCV
    """
    # Local import keeps the timing dependency with the function. Plain Python timing
    # is used instead of the IPython `%time` line magic so the function stays valid
    # Python and can be moved into a module unchanged.
    from time import perf_counter

    logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    # with_mean=False: only divide by the standard deviation, no centring.
    scaler = StandardScaler(with_mean=False)

    pipe = Pipeline(steps=[('encoder', encoder),
                           ('scaler', scaler),
                           ('logreg', logreg)
                           ])

    # C candidates: 1e3, 1e4, ..., 1e9
    param_grid = {
        'logreg__C': np.power(10.0, np.arange(3, 10)),
    }

    search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)

    started = perf_counter()
    search.fit(X, y)
    print("Wall time: {:.1f} s".format(perf_counter() - started))

    # best_score_ is the mean cross-validated score of the best candidate.
    print("Training Score (accuracy): {}".format(search.best_score_))
    print("Best Parameters: {}".format(search.best_params_))

    return search.best_estimator_

## 3.1 Full Features
We will train using logistic regression with all the features. We first convert the text into document embeddings of size 10. Then we combine them with the other features into a single feature matrix. The matrix will be fed into a scikit-learn pipeline with scaling and encoding.

In [None]:
# Fit the encoder -> scaler -> logistic-regression pipeline on the full-feature training split.
model = logistic_regression(X_train, y_train)

In [None]:
# Score of the selected pipeline on the held-out test split.
model.score(X_test, y_test)

## 3.2 Text Features only
The full-feature model (with text) has a testing accuracy of only 61.7%, lower than our original 68% logistic regression model without text. Name and blurb might not have much predictive power for the success of a Kickstarter campaign. Let's test this quickly using only name and blurb as input features to the model.

In [None]:
X_text = np.hstack([name, blurb])
X_train, X_test, y_train, y_test = train_test_split(X_text, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500) 
pipe = Pipeline(steps=[('logreg', logreg)])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}   

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)

print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_#(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500).fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

## 3.3 Full Features, Modified Pipeline
Interesting. The model with text features only has 65.5% accuracy, so the text features do carry useful signal. Perhaps the problem lies in the pipeline, specifically the scaler step. So we will test the full features without the scaler in the scikit-learn pipeline, and scale `goal` and `durations` outside of the pipeline instead.

In [None]:
def _zscore(values):
    """Centre `values` on their mean and divide by their standard deviation."""
    return (values - values.mean()) / values.std()


# Scale the two numeric columns by hand; country and category stay raw for the encoder.
goal_norm = _zscore(goal)
dur_norm = _zscore(durations)
X_text = np.concatenate([name, blurb, goal_norm, dur_norm, country, cat], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y.to_numpy(),
    test_size=0.15,
    random_state=45,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
# encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

# pipe = Pipeline(steps=[('encoder', encoder),
#                        ('logreg', logreg)
#                        ])

# param_grid = {
#     'logreg__C': np.power(10.0, np.arange(3, 10)),
# }

# search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
# %time search.fit(X_train, y_train)
# print("Training Score (accuracy): {}".format(search.best_score_))
# print("Best Parameters: {}".format(search.best_params_))

# model = search.best_estimator_

In [None]:
# Score of the current `model` on the current test split.
model.score(X_test, y_test)

In [None]:
# Shape of the fitted coefficient matrix of the 'logreg' step, i.e. how many input
# columns the classifier actually received.
model.named_steps['logreg'].coef_.shape

In [None]:
# Number of distinct values (missing values counted too) in the two categorical columns,
# for comparison with the coefficient count above.
df.country.value_counts(dropna=False).index.shape, df.category.value_counts(dropna=False).index.shape

## 3.4 Full Features, Manual Transformation, No Pipeline
Looking at the number of coefficients, something is seriously messed up. We are expecting 10 for name, 10 for blurb, 1 for goal, 1 for duration, 22 for country, and 15 for categories. That is a total of 59 parameters. But looking at our `coef_`, we have 3,448,327. Something went wrong, most probably in the encoder. We will try to manually encode each of the columns to make it work.

In [None]:
ohe = OneHotEncoder(categories='auto', handle_unknown='ignore')
cc = ohe.fit_transform(country).toarray()

In [None]:
cc.shape

In [None]:
cc.toarray().shape

In [None]:
ohe2 = OneHotEncoder(categories='auto', handle_unknown='ignore')
catcat = ohe.fit_transform(cat).toarray()

In [None]:
catcat.shape

In [None]:
goal_norm = (goal - goal.mean())/goal.std()
dur_norm = (durations - durations.mean())/durations.std()

In [None]:
name.shape, blurb.shape, goal_norm.shape, dur_norm.shape, cc.shape, catcat.shape

In [None]:
X_text = np.hstack([name, blurb, goal_norm, dur_norm, cc, catcat])
X_train, X_test, y_train, y_test = train_test_split(X_text, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)

pipe = Pipeline(steps=[('logreg', logreg)
                       ])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)
print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_

In [None]:
model.score(X_test, y_test)

In [None]:
model.named_steps['logreg'].coef_.shape

## 3.5 Full Features
That's better. Now the number of feature parameters makes sense, and the testing accuracy is higher as well, at 0.689. Further, the model size is now only 1.9kB. Next we will create a column transformer for the features outside of the doc2vec model. We choose not to incorporate doc2vec into the pipeline yet because of the speed of the transformation: embedding all names and all blurbs takes around 25 minutes.

In [None]:
# Numeric columns: scale to unit variance without centring.
numeric_features = ['goal', 'durations']
numeric_transformer = StandardScaler(with_mean=False)

# Categorical columns: one-hot encode; categories unseen during fit are ignored.
categorical_features = ['country', 'category']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Columns not listed here (name, blurb) are dropped by the transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
df_no_state = df.iloc[:, :-1]
df_no_state.reset_index(inplace=True)
df_no_state.drop("index",axis=1,inplace=True)

In [None]:
df_no_state.columns

In [None]:
df.columns

In [None]:
ct_model = preprocessor.fit(df_no_state)
ct_filename = '../models/20191023_preproc.sav'
pickle.dump(ct_model, open(filename, 'wb'))

In [None]:
ct_model = pickle.load(open(ct_filename, 'rb'))
X_transformed = ct_model.transform(df).toarray()

In [None]:
X_FF = np.hstack([name, blurb, X_transformed])
X_FF.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_FF, y.to_numpy(), test_size=0.15, random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)

pipe = Pipeline(steps=[('logreg', logreg)])

param_grid = {
    'logreg__C': np.power(10.0, np.arange(3, 10)),
}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
%time search.fit(X_train, y_train)
print("Training Score (accuracy): {}".format(search.best_score_))
print("Best Parameters: {}".format(search.best_params_))

model = search.best_estimator_

In [None]:
model.score(X_test, y_test)

In [None]:
model.named_steps['logreg'].coef_.shape

In [None]:
# Persist the selected pipeline; the context manager flushes and closes the file.
filename = '../models/20191023_logreg_text_69.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# 4. Inferencing

In [None]:
# Example request payload, in the shape the API receives it.
test1 = '''{"name": "This is a test Kickstarter header", "blurb": "This is an example description of a kickstarter project to test for the API. I would like to thank my wife, parents, and all my loving family members for this to work. I would also like to thank all the Kickstarter team members and project leads for making this possible.", 
            "goal": 2011.0, 
            "country": "US", 
            "duration":67.0, 
            "category": "publishing"}'''
test1j = json.loads(test1)

# One-row frame built from the payload. The payload calls the field "duration" while
# the column transformer was fitted on a column named "durations", so the column is
# renamed to the training-time name before it is handed to the transformer.
test1df = pd.DataFrame([test1j]).rename(columns={'duration': 'durations'})
test1df = test1df[['name', 'blurb', 'goal', 'country', 'durations', 'category']]

In [None]:
D2V_FILENAME = "../models/d2v.model"
CT_FILENAME = '../models/20191023_preproc.sav'
MODEL_FILENAME = "../models/20191023_logreg_text_69.sav"

d2v = Doc2Vec.load(D2V_FILENAME)
ct = pickle.load(open(CT_FILENAME, 'rb'))
model = pickle.load(open(MODEL_FILENAME, 'rb'))

In [None]:
df.iloc[:,:-1].columns

In [None]:
test1df.columns

In [None]:
# preprocessing
# Use `ct`, the transformer loaded from disk for inference, rather than `ct_model`,
# which only exists if the training cells were run in the same kernel.
X_none_text = ct.transform(test1df).toarray()

# 5.1 Distribution of testing data
Look at the distribution of the prediction for the probability of success.

# Column 1 of predict_proba is the predicted probability of the positive class (success).
y_pred = model.predict_proba(X_test)
success_proba = y_pred[:, 1]

fig, ax = plt.subplots()
ax.hist(success_proba, bins=40)
ax.set(title="Predicted probability of success (test set)",
       xlabel="predicted probability",
       ylabel="number of campaigns");

# Locate the most confident positive prediction once and reuse the index below.
best_idx = np.argmax(success_proba)
print(f"Highest probability w/in test {success_proba[best_idx]}, highest prob sample location is {best_idx}")
x_high = X_test[best_idx]
print(x_high)
print(f"The highest probability sample is {x_high}, shape is {x_high.shape}")

# A single sample has to be reshaped to (1, n_features) before it can be scored.
model.predict_proba(x_high.reshape(1, -1))

model.predict_proba(X_test[best_idx].reshape(1, -1))

# NOTE: this rebinds `goal` and `country`, which earlier held feature column arrays, to scalars.
goal, duration, country, category = 2011.0, 67.0, 'US', 'publishing'