# Linear Regression, Numeric and Categorical Features

In [1]:
import json
import os
import pathlib
import pickle
import sys
from datetime import datetime

sys.path.append(os.path.abspath('../data'))

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

# 1. Loading

In [2]:
# Directory (relative to this notebook) that is searched for the raw *.json dumps.
data_root = pathlib.Path('../data')

In [3]:
# Collect every JSON dump under data_root as plain strings.
# glob() yields entries in an OS-dependent order, so sort them: the loading
# cell indexes this list by position and must pick the same file on every run.
all_json_paths = sorted(str(path) for path in data_root.glob('*.json'))

In [4]:
# Display the discovered JSON paths.
all_json_paths

['../data/Kickstarter_2019-01-17T03_20_02_630Z.json']

In [5]:
# The dump is JSON Lines: one JSON document per line, with the campaign record
# stored under the 'data' key of each document.
# Use a context manager so the file handle is closed even if parsing fails.
with open(all_json_paths[0], 'r', encoding='utf8') as json_file:
    # Blank lines are skipped; they carry no record and would make json.loads raise.
    data = [json.loads(line)['data'] for line in json_file if line.strip()]

raw = pd.DataFrame.from_records(data)

# 2. Preprocessing

In [6]:
def preproc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Naive preprocessing of the raw campaign records.

    Derives the campaign duration in days and the top-level category, maps the
    campaign state to a binary target, drops campaigns that are still 'live',
    and keeps only the columns used downstream. The input frame is not
    modified, and no encoding or scaling happens here.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. The columns read are `deadline` and `launched_at`
        (numeric, treated as seconds), `category` (a mapping with a 'slug'
        key), `state`, `name`, `blurb`, `goal` and `country`.

    Returns
    ----------
    df : pandas.DataFrame
        New frame with the columns `name`, `blurb`, `goal`, `country`,
        `durations`, `cat_slug` and `state`. `state` is 1 for 'successful' and
        0 for 'failed', 'canceled' and 'suspended'; it may come back with
        object dtype, so cast it before handing it to an estimator.
    """
    seconds_per_day = 60 * 60 * 24

    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain the derived columns.
    df = df.assign(
        # difference between launch and deadline, converted from seconds to whole days
        durations=((df.deadline - df.launched_at) / seconds_per_day).round(),
        # the slug looks like 'parent/child'; keep only the first-level category
        cat_slug=df.category.apply(lambda x: x['slug'].split('/')[0]),
    )

    # map states to 1 for success and 0 for the other finished states;
    # 'live' is not in the mapping and is dropped right after.
    state_dict = {'successful': 1, 'failed': 0, 'canceled': 0, 'suspended': 0}
    df = df.replace({"state": state_dict})
    df = df[df.state != 'live']

    # drop unused features
    return df[['name', 'blurb', 'goal', 'country', 'durations', 'cat_slug', 'state']]

# 3. Training

## 3.1 Linear Regression Baseline

In [18]:
# The baseline uses only the two numeric features.
cols_names = raw.columns.to_list()
X_col = ['goal', 'durations']
df = preproc(raw.copy())
X = df.loc[:, X_col].to_numpy()
# The target can come back with object dtype, which scikit-learn rejects as an
# unknown label type, so cast it to int explicitly.
# https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown
y = df['state'].to_numpy().astype('int')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=45
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 2), (30444, 2), (172516,), (30444,))

In [19]:
# normalize X_train
X_train = (X_train - X_train.mean())/X_train.std()

%time clf = LinearRegression().fit(X_train, y_train)

print(f"R^2 score of the model is: {clf.score(X_train, y_train)}")
y_pred = clf.predict(X_test)#train[0].reshape(1, -1))
print(f"Mean square error is: {mean_squared_error(y_test, y_pred)}")

CPU times: user 15 ms, sys: 355 µs, total: 15.3 ms
Wall time: 17.7 ms
R^2 score of the model is: 0.026041726661496956
Mean square error is: 38379720612.492325


## 3.2 SGD Regressor

In [20]:
# The SGD model uses the numeric features plus the two categorical ones.
cols_names = raw.columns.to_list()
X_col = ['goal', 'durations', 'country', 'cat_slug']
df = preproc(raw.copy())
X = df.loc[:, X_col].to_numpy()
# The target can come back with object dtype, which scikit-learn rejects as an
# unknown label type, so cast it to int explicitly.
# https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown
y = df['state'].to_numpy().astype('int')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=45
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((172516, 4), (30444, 4), (172516,), (30444,))

In [21]:
def regression(X, y):
    """
    Logistic regression model using GridSearchCV. Since GridSearchCV does cross validation internally,
    we choose not to split X into training and validation set. We choose to do 5 fold cross validation
    during GridSearch. With that, data issplit three ways: 0.68 train, 0.17 validation, and 0.15 test.
    We will continue to use OneHotEncoding and StandardScaler in our training pipeline. Since some of
    the categorical features have very high cardinality, e.g., funder with 1898 categories, we choose
    to take only the top 6 with high cardinality to reduce training time.
    
    Parameters
    ----------
    X : training data
    y : target data
    
    Returns
    ----------
    search.best_estimator : the best Logistic Regression model produced by the GridSearchCV
    """

    reg = SGDRegressor()#solver='lbfgs', multi_class='ovr', random_state=45, max_iter=500)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    scaler = StandardScaler(with_mean=False)

    pipe = Pipeline(steps=[('encoder', encoder),
                           ('scaler', scaler),
                           ('reg', reg)
                           ])
    
    param_grid = {
        'reg__l1_ratio': np.arange(0.15, 0.80, 0.05),
    }
    
    search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5)
    %time search.fit(X, y)
    print("Training Score (accuracy): {}".format(search.best_score_))
    print("Best Parameters: {}".format(search.best_params_))
    
    return search.best_estimator_

In [22]:
# Run the grid search on the training split and keep the best estimator it returns.
model = regression(X_train, y_train)

CPU times: user 18.1 s, sys: 1.41 s, total: 19.5 s
Wall time: 21.5 s
Training Score (accuracy): -2.4100745789599384e+25
Best Parameters: {'reg__l1_ratio': 0.3500000000000001}


In [23]:
# Score of the selected model on the held-out test split.
model.score(X_test, y_test)

-5.654150148975187e+25

In [24]:
# R^2 is measured on the training split, the mean squared error on the test split.
train_r2 = model.score(X_train, y_train)
print(f"R^2 score of the model is: {train_r2}")

y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f"Mean square error is: {test_mse}")

R^2 score of the model is: -8.848398892410494e+25
Mean square error is: 1.3770090363515053e+25


## Neither regression produced meaningful results. We will use the predicted-probability output of logistic regression instead.