In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [84]:
df = pd.read_csv('kickstarter_data_with_features.csv')

In [85]:
#These are the only columns we can use as input
df = df[['name', 'goal', 'blurb', 'launched_at', 'deadline','category','state', 'country']]

In [86]:
#filter out all countries not english-speaking
english_countries = ['US', 'IE', 'GB', 'AU', 'CA', 'NZ', ]
df= df[df['country'].isin(english_countries)]

In [56]:
# df = pd.read_csv('ks-projects-201801.csv')

In [87]:
#create 'success'filter which only shows failed or success and cuts out other options
suc_filt = ['failed', 'successful', 'canceled']
df = df[df['state'].isin(suc_filt)]
# Convert fail or succuess to 0 or 1
df['state'] = df['state'].replace({'failed': 0, 'successful': 1, 'canceled': 0})

In [88]:
#Two seperate dataframes are created below, one of only success and one of only fail
faileddf = df[df['state'] == 0]
sucdf = df[df['state'] == 1]

In [89]:
#Baseline prediction
failed = len(faileddf)
suc = len(sucdf)

suc / failed

0.4579348354676609

In [90]:
df_nt = df[['goal', 'launched_at', 'deadline', 'country', 'state']]

In [91]:
#convert time columns to campaign length
df['deadline'] = df['deadline'].str[:10]
df['launched_at'] = df['launched_at'].str[:10]
df_nt['deadline'] = pd.to_datetime(df_nt['deadline'], format="%Y/%m/%d")
df_nt['launched_at'] = pd.to_datetime(df_nt['launched_at'], format="%Y/%m/%d")

df['deadline'] = pd.to_datetime(df['deadline'], format="%Y/%m/%d")
df['launched_at'] = pd.to_datetime(df['launched_at'], format="%Y/%m/%d")
#create new column
df_nt['cam_length'] = df_nt['deadline'] - df_nt['launched_at']
df['cam_length'] = df['deadline'] - df['launched_at']

In [92]:
df

Unnamed: 0,name,goal,blurb,launched_at,deadline,category,state,country,cam_length
0,Auntie Di's Music Time Sign ASL for Hearing an...,1500.0,MTS ASL Curriculum Workbook is a reproducible ...,2014-12-17,2015-01-23,Academic,0,US,37 days
1,Jump Start Kindergarten Toolkit,500.0,"This kit teaches how to print, correct an ugly...",2015-03-02,2015-05-01,Academic,0,US,60 days
2,Ojukwu Balewa Awolowo (O.B.A.) Public Library ...,100000.0,"Establishing a free, world-class, public libra...",2015-01-25,2015-03-26,Academic,0,US,60 days
3,"MASTIZE - [mas-TAHYZ, MAS-tahyz] - to spread",5000.0,Goal: Introducing a new word into the English ...,2014-09-06,2014-10-06,Academic,0,US,30 days
5,Shadow School Board - Reforming Texas School B...,13000.0,Shadow School Board will provide parents & tax...,2015-10-21,2015-11-20,Academic,0,US,30 days
...,...,...,...,...,...,...,...,...,...
20625,Private.Center - Private Social Network,20000.0,Private.Center - the Private Social Network is...,2015-12-17,2016-02-15,Apps,0,GB,60 days
20626,Pathfinder Adventure Card Game Character Track...,5000.0,A companion app for the Pathfinder Adventure C...,2016-05-26,2016-07-01,Apps,0,AU,36 days
20629,Sloth Face App,10000.0,"We all love fatbooth and Ugly booth, why not s...",2015-02-13,2015-04-14,Apps,0,US,60 days
20630,DriverTools,2500.0,Production android app for cab drivers and tru...,2015-04-20,2015-05-20,Apps,0,US,30 days


In [93]:
lb_make = LabelEncoder()
#df_nt["cam_length"] = lb_make.fit_transform(df_nt["cam_length"])
df["cam_length"] = lb_make.fit_transform(df["cam_length"])

In [66]:
# df_nt = df_nt[['goal', 'cam_length', 'state']]

In [94]:
df

Unnamed: 0,name,goal,blurb,launched_at,deadline,category,state,country,cam_length
0,Auntie Di's Music Time Sign ASL for Hearing an...,1500.0,MTS ASL Curriculum Workbook is a reproducible ...,2014-12-17,2015-01-23,Academic,0,US,36
1,Jump Start Kindergarten Toolkit,500.0,"This kit teaches how to print, correct an ugly...",2015-03-02,2015-05-01,Academic,0,US,59
2,Ojukwu Balewa Awolowo (O.B.A.) Public Library ...,100000.0,"Establishing a free, world-class, public libra...",2015-01-25,2015-03-26,Academic,0,US,59
3,"MASTIZE - [mas-TAHYZ, MAS-tahyz] - to spread",5000.0,Goal: Introducing a new word into the English ...,2014-09-06,2014-10-06,Academic,0,US,29
5,Shadow School Board - Reforming Texas School B...,13000.0,Shadow School Board will provide parents & tax...,2015-10-21,2015-11-20,Academic,0,US,29
...,...,...,...,...,...,...,...,...,...
20625,Private.Center - Private Social Network,20000.0,Private.Center - the Private Social Network is...,2015-12-17,2016-02-15,Apps,0,GB,59
20626,Pathfinder Adventure Card Game Character Track...,5000.0,A companion app for the Pathfinder Adventure C...,2016-05-26,2016-07-01,Apps,0,AU,35
20629,Sloth Face App,10000.0,"We all love fatbooth and Ugly booth, why not s...",2015-02-13,2015-04-14,Apps,0,US,59
20630,DriverTools,2500.0,Production android app for cab drivers and tru...,2015-04-20,2015-05-20,Apps,0,US,29


In [96]:
df = df[['name', 'goal', 'blurb', 'state', 'cam_length']]
NUMERIC_COLUMNS = ['goal', "cam_length", ]
LABELS = ['state']
numeric_data_only = df[NUMERIC_COLUMNS]
NON_LABELS = [c for c in df.columns if c not in LABELS]
TEXT_COLUMNS = [c for c in NON_LABELS if c not in NUMERIC_COLUMNS]

In [97]:
def declare(df):
    NUMERIC_COLUMNS = ['goal', "cam_length"]
    LABELS = ['state']
    numeric_data_only = df[NUMERIC_COLUMNS]
    NON_LABELS = [c for c in df.columns if c not in LABELS]
    TEXT_COLUMNS = [c for c in NON_LABELS if c not in NUMERIC_COLUMNS]

    return df

In [98]:
declare(df)

Unnamed: 0,name,goal,blurb,state,cam_length
0,Auntie Di's Music Time Sign ASL for Hearing an...,1500.0,MTS ASL Curriculum Workbook is a reproducible ...,0,36
1,Jump Start Kindergarten Toolkit,500.0,"This kit teaches how to print, correct an ugly...",0,59
2,Ojukwu Balewa Awolowo (O.B.A.) Public Library ...,100000.0,"Establishing a free, world-class, public libra...",0,59
3,"MASTIZE - [mas-TAHYZ, MAS-tahyz] - to spread",5000.0,Goal: Introducing a new word into the English ...,0,29
5,Shadow School Board - Reforming Texas School B...,13000.0,Shadow School Board will provide parents & tax...,0,29
...,...,...,...,...,...
20625,Private.Center - Private Social Network,20000.0,Private.Center - the Private Social Network is...,0,59
20626,Pathfinder Adventure Card Game Character Track...,5000.0,A companion app for the Pathfinder Adventure C...,0,35
20629,Sloth Face App,10000.0,"We all love fatbooth and Ugly booth, why not s...",0,59
20630,DriverTools,2500.0,Production android app for cab drivers and tru...,0,29


In [99]:
df

Unnamed: 0,name,goal,blurb,state,cam_length
0,Auntie Di's Music Time Sign ASL for Hearing an...,1500.0,MTS ASL Curriculum Workbook is a reproducible ...,0,36
1,Jump Start Kindergarten Toolkit,500.0,"This kit teaches how to print, correct an ugly...",0,59
2,Ojukwu Balewa Awolowo (O.B.A.) Public Library ...,100000.0,"Establishing a free, world-class, public libra...",0,59
3,"MASTIZE - [mas-TAHYZ, MAS-tahyz] - to spread",5000.0,Goal: Introducing a new word into the English ...,0,29
5,Shadow School Board - Reforming Texas School B...,13000.0,Shadow School Board will provide parents & tax...,0,29
...,...,...,...,...,...
20625,Private.Center - Private Social Network,20000.0,Private.Center - the Private Social Network is...,0,59
20626,Pathfinder Adventure Card Game Character Track...,5000.0,A companion app for the Pathfinder Adventure C...,0,35
20629,Sloth Face App,10000.0,"We all love fatbooth and Ugly booth, why not s...",0,59
20630,DriverTools,2500.0,Production android app for cab drivers and tru...,0,29


In [100]:
# split into train test
dummy_labels = pd.get_dummies(df[LABELS])

X_train, X_test, y_train, y_test = train_test_split(df[['goal', 'cam_length']], df['state'],train_size=0.80, test_size=0.20, stratify= df['state'], random_state=3)

In [101]:
pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBClassifier()
)

In [102]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [103]:
pipe_pred = pipe.predict(X_test)

In [104]:
accuracy_score(pipe_pred,y_test)

0.7120622568093385

In [105]:
%%time
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(
        n_jobs=1,
        random_state=42,
        n_estimators=9,
    )
)

param_distributions = {
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestclassifier__criterion': ('gini', 'entropy'),
    'randomforestclassifier__max_depth': (8,10,12,14,15,16,20,25),
    'randomforestclassifier__min_samples_split': (2,4,6,8,10)
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=240,
    cv=7,
    scoring='accuracy',
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
)

search.fit(X_train,y_train)

print('Best Score:',search.best_score_)
print('Best param:',search.best_params_)
print('Best estimaator:', search.best_estimator_)

Fitting 7 folds for each of 160 candidates, totalling 1120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   13.7s
Best Score: 0.702570805083657
Best param: {'simpleimputer__strategy': 'mean', 'randomforestclassifier__min_samples_split': 8, 'randomforestclassifier__max_depth': 8, 'randomforestclassifier__criterion': 'entropy'}
Best estimaator: Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=8,
                                        min_samples_split=8, n_estimators=9,
                                        n_jobs=1, random_state=42))])
Wall time:

In [106]:
%%time
pipe2 = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(criterion='entropy', max_depth=8,
                                        min_samples_split=8, n_estimators=999,
                                        n_jobs=-1, random_state=42)
    )

pipe2.fit(X_train,y_train)

Wall time: 4.56 s


Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=8,
                                        min_samples_split=8, n_estimators=999,
                                        n_jobs=1, random_state=42))])

In [107]:
pipe2_pred = pipe2.predict(X_test)

In [108]:
accuracy_score(pipe2_pred,y_test)

0.717065036131184

In [132]:
import pickle

In [116]:
dill = pickle.dump(pipe2, open( "kickstarter.pkl", "wb" ) )

In [128]:
file_pickle = open("kickstarter.pkl",'rb')

In [129]:
clf2 = pickle.load(file_pickle)

In [130]:
clf_pred = clf2.predict(X_test)

In [131]:
accuracy_score(clf_pred,y_test)

0.717065036131184