In [59]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn import model_selection
%matplotlib inline

In [60]:
random_state = 69

In [61]:
with open('all_data_to_model.pickle', 'rb') as f:
    project_df = pickle.load(f)

In [62]:
project_df.head()

Unnamed: 0,description_length,n_pledges,pledge_level_min,pledge_level_max,pledge_level_stddev,funding_goal,duration,Comics,Crafts,Dance,...,Food,Games,Journalism,Music,Photography,Publishing,Sculpture,Technology,Theater,funding_percent
0,93,1,7.886086,7.886086,0.0,15772.172704,60,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,106,3,2.0,100.0,45.50702,15000.0,60,0,0,0,...,0,0,0,0,0,0,0,0,0,0.001133
2,46,1,8.712913,8.712913,0.0,43564.564775,60,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,160,7,7.837804,783.780404,299.100883,1567.560808,59,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,153,10,1.564151,7820.753536,2342.290924,93849.042433,60,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [63]:
project_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55808 entries, 0 to 55877
Data columns (total 23 columns):
description_length     55808 non-null int64
n_pledges              55808 non-null int64
pledge_level_min       55808 non-null float64
pledge_level_max       55808 non-null float64
pledge_level_stddev    55808 non-null float64
funding_goal           55808 non-null float64
duration               55808 non-null int64
Comics                 55808 non-null uint8
Crafts                 55808 non-null uint8
Dance                  55808 non-null uint8
Design                 55808 non-null uint8
Fashion                55808 non-null uint8
Film & Video           55808 non-null uint8
Food                   55808 non-null uint8
Games                  55808 non-null uint8
Journalism             55808 non-null uint8
Music                  55808 non-null uint8
Photography            55808 non-null uint8
Publishing             55808 non-null uint8
Sculpture              55808 non-null uint8
Tec

In [64]:
project_df.funding_percent = project_df.funding_percent.apply(lambda x: 1 if x>=1.0 else 0)
project_df.funding_percent.value_counts()

0    36917
1    18891
Name: funding_percent, dtype: int64

In [65]:
X = project_df.drop(columns=['funding_percent'])
y = project_df.funding_percent

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=random_state)

In [73]:
X_train.head()

Unnamed: 0,description_length,n_pledges,pledge_level_min,pledge_level_max,pledge_level_stddev,funding_goal,duration,Comics,Crafts,Dance,...,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Sculpture,Technology,Theater
25847,274,2,9.306406,93.064061,41.878827,2326.601528,30,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5123,178,5,10.0,500.0,178.594513,2500.0,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41376,358,8,10.0,1000.0,317.400944,12000.0,32,0,0,0,...,0,1,0,0,0,0,0,0,0,0
31758,144,3,10.0,50.0,16.996732,625.0,30,0,0,0,...,0,0,0,0,1,0,0,0,0,0
19561,67,3,20.0,100.0,32.998316,1000.0,60,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [10]:
X_train.head()

Unnamed: 0,description_length,n_pledges,pledge_level_min,pledge_level_max,pledge_level_stddev,funding_goal,duration,Comics,Crafts,Dance,...,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Sculpture,Technology,Theater
25847,274,2,9.306406,93.064061,41.878827,2326.601528,30,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5123,178,5,10.0,500.0,178.594513,2500.0,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41376,358,8,10.0,1000.0,317.400944,12000.0,32,0,0,0,...,0,1,0,0,0,0,0,0,0,0
31758,144,3,10.0,50.0,16.996732,625.0,30,0,0,0,...,0,0,0,0,1,0,0,0,0,0
19561,67,3,20.0,100.0,32.998316,1000.0,60,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [11]:
logreg = LogisticRegression(C=0.1)
logreg.fit(X_train, y_train)
y_predicted = logreg.predict(X_val)

#### Evaluating Performance

In [12]:
print('mean accuracy is {}'.format(logreg.score(X_val,y_val)))
conf_mat = confusion_matrix(y_val, y_predicted)
print(conf_mat)

mean accuracy is 0.7022184300341296
[[7079  608]
 [2882 1151]]


In [13]:
kfold = model_selection.KFold(n_splits=10, random_state=42)
model = LogisticRegression(C=0.1)
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
print("AUC: mean= {}, stddev = {}".format(results.mean(), results.std()))

AUC: mean= 0.7631943622816664, stddev = 0.009188224986707037


In [14]:
results

array([0.76324432, 0.759175  , 0.77375221, 0.77376122, 0.74531277,
       0.76806244, 0.76114555, 0.75906836, 0.75329729, 0.77512446])

In [15]:
logreg.coef_

array([[ 7.80729928e-04,  4.02767970e-02, -7.22234559e-03,
         5.20851438e-04, -1.58189931e-03, -3.12514537e-05,
        -3.03863917e-02,  3.09728355e-04, -4.97738309e-04,
         2.81348390e-04,  7.82000684e-05, -7.35495021e-04,
         6.92832795e-04, -8.90761309e-04, -4.32815468e-04,
        -1.79238397e-04,  2.02401932e-03, -4.66400304e-04,
        -1.07727098e-03, -4.05993040e-05, -8.86207368e-04,
         6.17608711e-04]])

#### Using standard scalar

In [68]:
ssX = StandardScaler()
X_train_scaled = ssX.fit_transform(X_train)
X_val_scaled = ssX.transform(X_val)

In [17]:
logreg_ss = LogisticRegression(C=1)
logreg_ss.fit(X_train_scaled, y_train)
y_predicted = logreg.predict(X_val)
print('mean accuracy is {}'.format(logreg_ss.score(X_val_scaled,y_val)))
conf_mat = confusion_matrix(y_val, y_predicted)
print(conf_mat)

mean accuracy is 0.725
[[7079  608]
 [2882 1151]]


In [18]:
kfold = model_selection.KFold(n_splits=10, random_state=42)
model = LogisticRegression()
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
print("AUC: mean= {}, stddev = {}".format(results.mean(), results.std()))

AUC: mean= 0.77691509032102, stddev = 0.007820650759704475


In [19]:
results

array([0.78475917, 0.78383547, 0.77557525, 0.77940983, 0.75977222,
       0.77898881, 0.77808315, 0.77149037, 0.76969045, 0.78754617])

In [20]:
logreg_ss.coef_

array([[ 3.00188722e-01,  7.53360196e-01, -9.28753907e-01,
         1.24295314e-01, -3.45742199e-01, -1.62359248e+01,
        -2.94299687e-01, -2.92016684e-03, -1.42732065e-01,
         9.10304575e-02, -7.21839909e-02, -1.75652424e-01,
         1.88117152e-02, -1.57263534e-01, -1.41167004e-01,
        -8.76774022e-02,  1.61363806e-01, -1.08663681e-01,
        -1.44162476e-01, -4.77684121e-02, -2.06093176e-01,
         1.24038131e-01]])

#### Using Standard Scalar with Lasso instead of Ridge Regularization

In [21]:
logreg_ssl1 = LogisticRegression(penalty='l1', C=0.1)
logreg_ssl1.fit(X_train_scaled, y_train)
y_predicted = logreg.predict(X_val)
print('mean accuracy is {}'.format(logreg_ss.score(X_val_scaled,y_val)))
conf_mat = confusion_matrix(y_val, y_predicted)
print(conf_mat)

mean accuracy is 0.725
[[7079  608]
 [2882 1151]]


In [22]:
kfold = model_selection.KFold(n_splits=10, random_state=42)
model = LogisticRegression(penalty='l1', C=0.1)
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
print("AUC: mean= {}, stddev = {}".format(results.mean(), results.std()))

AUC: mean= 0.7814748218813596, stddev = 0.008093906712930806


In [23]:
results

array([0.78921991, 0.78848651, 0.78055231, 0.78345862, 0.76424179,
       0.78406602, 0.78177245, 0.77502321, 0.77438811, 0.79353929])

In [24]:
logreg_ssl1.coef_

array([[  0.31115829,   0.78550799,  -0.54653076,   0.        ,
         -0.18598079, -22.90414115,  -0.28165383,   0.        ,
         -0.13468471,   0.09256595,  -0.05165888,  -0.16224509,
          0.03547679,  -0.1392723 ,  -0.12097691,  -0.07994199,
          0.16941641,  -0.10014162,  -0.13252047,  -0.04315147,
         -0.17645164,   0.12862318]])

In [25]:
# Grid Search for C parameter

In [69]:
reg_parameter_values = [50, 100, 150, 250, 500, 1000]
#reg_parameter_values = [0.05-.95, 1, 10, 25, 50, 100]
parameters = {'penalty': ['l1','l2'], 'C': reg_parameter_values, 'class_weight': [None, 'balanced']}
#parameters = {'penalty': ['l2'], 'C': reg_parameter_values, 'class_weight': [None, 'balanced']}
grid = model_selection.GridSearchCV(LogisticRegression(),param_grid=parameters, cv=5, scoring='roc_auc', n_jobs=1)
grid_search = grid.fit(X_train_scaled, y_train)

In [70]:
grid_search.best_params_

{'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}

In [71]:
grid_search.best_score_

0.7873140033613463

In [72]:
grid_search.best_estimator_.coef_

array([[ 3.78299193e-01,  8.56092935e-01, -7.23426073e-01,
         2.72243767e-01, -3.93002412e-01, -3.87750973e+01,
        -2.66331703e-01, -1.30183591e-02, -1.43908680e-01,
         9.33473603e-02, -4.18624016e-02, -1.65346234e-01,
         2.78764728e-02, -1.43980427e-01, -1.23412324e-01,
        -8.18262480e-02,  1.53981381e-01, -1.07361744e-01,
        -1.46013097e-01, -4.72670499e-02, -1.54036950e-01,
         1.31324567e-01]])

In [29]:
grid_search.best_estimator_

array(['description_length', 'n_pledges', 'pledge_level_min',
       'pledge_level_max', 'pledge_level_stddev', 'funding_goal',
       'duration', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion',
       'Film & Video', 'Food', 'Games', 'Journalism', 'Music',
       'Photography', 'Publishing', 'Sculpture', 'Technology', 'Theater'],
      dtype=object)

In [30]:
grid_search.best_estimator_.coef_

array([[ 3.78126801e-01,  8.55756776e-01, -7.24152998e-01,
         2.73528709e-01, -3.94567823e-01, -3.87073411e+01,
        -2.66405164e-01, -1.30021475e-02, -1.43910796e-01,
         9.33404790e-02, -4.19583813e-02, -1.65385410e-01,
         2.78157597e-02, -1.44041863e-01, -1.23473569e-01,
        -8.18434311e-02,  1.53970715e-01, -1.07375003e-01,
        -1.46018872e-01, -4.72711636e-02, -1.54181362e-01,
         1.31299683e-01]])

In [31]:
X_train.quantile(.99)

description_length       2495.920000
n_pledges                  27.000000
pledge_level_min          200.000000
pledge_level_max        10000.000000
pledge_level_stddev      3588.081826
funding_goal           350000.000000
duration                   60.000000
Comics                      1.000000
Crafts                      1.000000
Dance                       1.000000
Design                      1.000000
Fashion                     1.000000
Film & Video                1.000000
Food                        1.000000
Games                       1.000000
Journalism                  0.000000
Music                       1.000000
Photography                 1.000000
Publishing                  1.000000
Sculpture                   0.000000
Technology                  1.000000
Theater                     1.000000
Name: 0.99, dtype: float64

#### Getting rid of extreme funding_goal outliers

In [32]:
X_train_cutoff = X_train[X_train.funding_goal < 350000]

In [33]:
X_train_cutoff_scaled = ssX.fit_transform(X_train)

In [34]:
reg_parameter_values = [50, 100, 150, 250, 500, 1000]
#reg_parameter_values = [0.05-.95, 1, 10, 25, 50, 100]
#parameters = {'penalty': ['l1','l2'], 'C': reg_parameter_values, 'class_weight': [None, 'balanced']}
parameters = {'penalty': ['l2'], 'C': reg_parameter_values, 'class_weight': [None, 'balanced']}
grid = model_selection.GridSearchCV(LogisticRegression(),param_grid=parameters, cv=5, scoring='roc_auc', n_jobs=1)
grid_search = grid.fit(X_train_cutoff_scaled, y_train)

In [35]:
grid_search.best_params_

{'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}

In [36]:
grid_search.best_score_

0.787290881469816

In [37]:
X_train.columns.values

array(['description_length', 'n_pledges', 'pledge_level_min',
       'pledge_level_max', 'pledge_level_stddev', 'funding_goal',
       'duration', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion',
       'Film & Video', 'Food', 'Games', 'Journalism', 'Music',
       'Photography', 'Publishing', 'Sculpture', 'Technology', 'Theater'],
      dtype=object)

In [38]:
grid_search.best_estimator_.coef_

array([[ 3.78126801e-01,  8.55756776e-01, -7.24152998e-01,
         2.73528709e-01, -3.94567823e-01, -3.87073411e+01,
        -2.66405164e-01, -1.30021475e-02, -1.43910796e-01,
         9.33404790e-02, -4.19583813e-02, -1.65385410e-01,
         2.78157597e-02, -1.44041863e-01, -1.23473569e-01,
        -8.18434311e-02,  1.53970715e-01, -1.07375003e-01,
        -1.46018872e-01, -4.72711636e-02, -1.54181362e-01,
         1.31299683e-01]])

In [39]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val_scaled)
roc_auc_score(y_val, y_pred)

0.7233337841692471