In [1]:
#Imports
import category_encoders as ce
import pandas as pd
import joblib
from joblib import dump, load
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import  f1_score, roc_auc_score, roc_curve, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier

In [2]:
# Load the cleaned Kickstarter campaign table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; point it at a shared
# data directory before running the notebook on another machine.
clean_data_path = '/Users/charliemay/Desktop/kickstarter_project/DS/clean_kickstart_data.csv'
df1 = pd.read_csv(clean_data_path)
df1.head()

Unnamed: 0,blurb,country,goal,launched_at,campaign_success,category,subcategory,campaign_length
0,Support great art! Join us as we re-stage the ...,the United States,3000.0,1580433192,1,Dance,Performances,30.0
1,JinBucha is a new kind of Brewery in North Par...,the United States,20000.0,1447526057,0,Food,Drinks,30.0
2,"""Taste The Scent of Tea"" Rose and Blanc Tea Ro...",the United States,15000.0,1518208887,0,Food,Drinks,23.430243
3,Bringing Philly cheesesteaks (and other delici...,the United States,2000.0,1454705444,0,Food,Food Trucks,59.958333
4,a frog plushie keychain and frog butt pin for ...,the United States,800.0,1573236000,1,Design,Product Design,14.0


In [3]:
# Assign explicit dtypes: the text columns become str and the target becomes int
column_dtypes = {'blurb': str, 'country': str, 'campaign_success': int}
for column_name, column_dtype in column_dtypes.items():
    df1[column_name] = df1[column_name].astype(column_dtype)

In [4]:
# Chronological split on the unix launch timestamp:
#   train: launched_at <= cutoff1
#   val:   cutoff1 < launched_at < cutoff2
#   test:  launched_at >= cutoff2
cutoff1 = 1466003000  # ~ 2016-06-15 (UTC)
cutoff2 = 1530120000  # ~ 2018-06-27 (UTC)

launched = df1['launched_at']
train = df1[launched <= cutoff1]
val = df1[(launched > cutoff1) & (launched < cutoff2)]
test = df1[launched >= cutoff2]

# launched_at only defines the split; it is not a model feature
train, val, test = (
    part.drop(columns=['launched_at']) for part in (train, val, test)
)

for part in (train, val, test):
    print(part.shape)

(86636, 7)
(45201, 7)
(47619, 7)


In [5]:
# Target column: the campaign_success flag stored as int
target = 'campaign_success'

# Features: every remaining column of the training split
features = [column for column in train.columns if column != target]

In [6]:
# Separate each partition into its feature matrix (X) and target vector (y)
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [7]:
# Create the transformation and fit it on the training features only.
# The categorical columns are ordinal-encoded.
categorical_cols = ['country', 'category', 'subcategory']
encoder = ce.OrdinalEncoder(cols=categorical_cols)
encoder.fit(X_train)
print('')  # ending on print() avoids echoing the fitted encoder as cell output




In [8]:
def wrangle(X, encoder):
    """Encode X and replace its raw 'blurb' text with numeric text features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a 'blurb' column of strings.
    encoder : object
        Fitted transformer exposing ``transform(DataFrame)``.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of X with a fresh 0..n-1 index, the 'blurb' column
        removed and five derived columns appended: blurb_length, blurb_words,
        blurb_uppers, blurb_qmarks and blub_exclamation.
    """
    # Work on a re-indexed copy so the caller's frame is never modified
    frame = encoder.transform(X.reset_index(drop=True).copy())

    blurb = frame['blurb']
    frame['blurb_length'] = blurb.apply(len)
    frame['blurb_words'] = blurb.apply(lambda text: len(text.split()))
    # Number of whitespace-separated tokens whose cased letters are all upper case
    frame['blurb_uppers'] = blurb.apply(
        lambda text: sum(token.isupper() for token in text.split())
    )
    frame['blurb_qmarks'] = blurb.apply(lambda text: text.count("?"))
    frame['blub_exclamation'] = blurb.apply(lambda text: text.count("!"))

    return frame.drop(columns=['blurb'])

In [9]:
# Run every split through the same feature pipeline, then report the shapes
X_train_e, X_val_e, X_test_e = (
    wrangle(split, encoder) for split in (X_train, X_val, X_test)
)
for encoded_split in (X_train_e, X_val_e, X_test_e):
    print(encoded_split.shape)

(86636, 10)
(45201, 10)
(47619, 10)


In [10]:
""" Define Model"""
# First XGBoost model: 100 estimators with a fixed random_state
booster = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# XGB simple: fit on the training split. fit() returns the estimator itself,
# so xgb_simple refers to the same fitted object as booster.
booster.fit(X_train_e, y_train)
xgb_simple = booster

In [11]:
# Empty results table; each evaluated model adds one row to it
columns = [
    'Model',
    'True -', 'False +', 'False -', 'True +',
    'Accuracy', 'Precision', 'Recall',
]
tracker = pd.DataFrame(columns=columns)
tracker

Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall


In [12]:
# Function to add rows to tracking dataframe
# m_name = Model Name

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 2 places, or NaN if the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return round((numerator / denominator), 2)


def row_maker(m_name, y, pred):
    """Build one row for the tracking dataframe from binary predictions.

    Parameters
    ----------
    m_name : str
        Model name stored in the 'Model' column.
    y : array-like
        True binary labels.
    pred : array-like
        Predicted binary labels, aligned with ``y``.

    Returns
    -------
    dict
        Keys 'Model', 'True -', 'False +', 'False -', 'True +', 'Accuracy',
        'Precision' and 'Recall'. Each metric is rounded to 2 decimals and is
        NaN when its denominator is zero.

    Raises
    ------
    ValueError
        If the labels do not form a binary (2x2) confusion matrix.
    """
    matrix = confusion_matrix(y, pred)
    if matrix.shape == (1, 1):
        # Only one class occurs in y and pred, so sklearn returns a 1x1 matrix
        # that cannot be unpacked into four cells. Request the 0/1 layout
        # explicitly (assumes 0/1 labels, as used for campaign_success).
        matrix = confusion_matrix(y, pred, labels=[0, 1])
    if matrix.shape != (2, 2):
        raise ValueError(
            'row_maker expects binary labels; got a confusion matrix of shape '
            + str(matrix.shape)
        )

    tn, fp, fn, tp = matrix.ravel()
    recall = _safe_ratio(tp, tp + fn)  # true positive rate
    precision = _safe_ratio(tp, tp + fp)  # positive predictive value
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

    return {
        'Model': m_name,
        'True -': tn,
        'False +': fp,
        'False -': fn,
        'True +': tp,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
    }

In [13]:
# Run model: XGBoost Simple, scored on the validation split
xs_y_pred_v = booster.predict(X_val_e)

m_name = 'XGBoost Simple'
y = y_val
pred = xs_y_pred_v

print('Classification Report:\n\n', classification_report(y, pred))

# Add this run to the tracking dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row and yields the same table.
new_row = row_maker(m_name, y, pred)
tracker = pd.concat([tracker, pd.DataFrame([new_row])], ignore_index=True)
tracker

Classification Report:

               precision    recall  f1-score   support

           0       0.73      0.73      0.73     21196
           1       0.76      0.76      0.76     24005

    accuracy                           0.75     45201
   macro avg       0.74      0.75      0.74     45201
weighted avg       0.75      0.75      0.75     45201



Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall
0,XGBoost Simple,15507,5689,5798,18207,0.75,0.76,0.76


In [14]:
# Run model: XGBoost Simple, scored on the held-out test split
xs_y_pred_test = booster.predict(X_test_e)

m_name = 'XGBoost Simple TEST'
# booster is already fitted (it produced the predictions above), so alias it
# rather than fitting again: a second fit cannot change predictions already made.
model = booster
X = X_test_e
y = y_test
pred = xs_y_pred_test

print('Classification Report:\n\n', classification_report(y, pred))

# Add this run to the tracking dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row and yields the same table.
new_row = row_maker(m_name, y, pred)
tracker = pd.concat([tracker, pd.DataFrame([new_row])], ignore_index=True)
tracker

Classification Report:

               precision    recall  f1-score   support

           0       0.65      0.64      0.64     15914
           1       0.82      0.83      0.82     31705

    accuracy                           0.76     47619
   macro avg       0.73      0.73      0.73     47619
weighted avg       0.76      0.76      0.76     47619



Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall
0,XGBoost Simple,15507,5689,5798,18207,0.75,0.76,0.76
1,XGBoost Simple TEST,10133,5781,5482,26223,0.76,0.82,0.83


In [15]:
# Persist the fitted encoder, then read it back from the same file
encoder_path = 'encoder1.joblib'
dump(encoder, encoder_path)
encoder1 = load(encoder_path)

In [16]:
# Wrangle Function to 'Pickle'

def wrangler(X, encoder):
    """Turn one raw request payload into the feature frame the model expects.

    Parameters
    ----------
    X : dict
        Payload keyed 'x1'..'x6', mapped to goal, campaign_length, country,
        category, subcategory and blurb respectively.
    encoder : object
        Fitted transformer exposing ``transform(DataFrame)``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: country, goal, category, subcategory,
        campaign_length, blurb_length, blurb_words, blurb_uppers,
        blurb_qmarks, blub_exclamation. 'goal' and 'campaign_length' are
        float; every other column is int.
    """
    X = pd.DataFrame.from_dict(X, orient='index').T.reset_index(drop=True)
    X = X.rename(columns={'x1': 'goal',
                          'x2': 'campaign_length',
                          'x3': 'country',
                          'x4': 'category',
                          'x5': 'subcategory',
                          'x6': 'blurb'})

    X = encoder.transform(X)
    # .copy() gives an independent frame, so the column assignments below do
    # not act on a selection of the encoder output (SettingWithCopyWarning).
    X = X[['country', 'goal', 'category', 'subcategory',
           'campaign_length', 'blurb']].copy()

    # Coerce to str as the training data does before deriving text features
    blurb = X['blurb'].astype(str)
    X['blurb_length'] = blurb.apply(len)
    X['blurb_words'] = blurb.apply(lambda text: len(text.split()))
    X['blurb_uppers'] = blurb.apply(lambda text: sum(map(str.isupper, text.split())))
    X['blurb_qmarks'] = blurb.apply(lambda text: text.count("?"))
    X['blub_exclamation'] = blurb.apply(lambda text: text.count("!"))
    X = X.drop(columns=['blurb'])

    # The dict -> frame transpose leaves object-dtype columns, so numeric
    # dtypes must be restored. goal and campaign_length stay float: casting
    # them to int would truncate fractional values (e.g. 23.43 -> 23) that the
    # model saw as floats during training.
    float_cols = ('goal', 'campaign_length')
    dtypes = {col: (float if col in float_cols else int) for col in X.columns}
    X = X.astype(dtypes)
    return X

In [17]:
# Persist the wrangler function, then read it back from the same file.
# NOTE(review): pickle stores plain functions by reference (module + name), not
# their code, so loading this file in another process presumably needs
# `wrangler` to be defined/importable there - confirm against the consumer.
wrangler_path = 'wrangler1.joblib'
dump(wrangler, wrangler_path)
wrangler1 = load(wrangler_path)

In [18]:
# XGBoost Model to Pickle
def kick_boost(X, model=None):
    """Return the predicted probability for the first row of X.

    Parameters
    ----------
    X : feature matrix
        Passed unchanged to ``model.predict_proba``; only row 0 is used.
    model : object, optional
        Classifier exposing ``predict_proba``. Defaults to the notebook-level
        ``booster`` so existing single-argument calls keep working.

    Returns
    -------
    float
        Column 1 of ``predict_proba`` for row 0 (presumably the
        campaign_success == 1 class - verify against ``model.classes_``),
        rounded to 3 decimals.
    """
    if model is None:
        model = booster
    probability = model.predict_proba(X)[0][1]
    # Cast to a built-in float so the result does not carry a NumPy scalar
    # type, which the json module cannot serialise.
    return round(float(probability), 3)

In [19]:
# Persist the prediction function, then read it back from the same file.
# NOTE(review): pickle stores plain functions by reference (module + name), not
# their code, and the fitted `booster` itself is not written by this cell -
# confirm the consumer has both available before relying on this file.
kick_boost_path = 'kick_boost1.joblib'
dump(kick_boost, kick_boost_path)
kick_boost1 = load(kick_boost_path)

In [20]:
# Dummy payload in the x1..x6 request format that wrangler() renames:
# goal, campaign_length, country, category, subcategory, blurb
test_1 = dict(
    x1=10000,
    x2=8,
    x3="Canada",
    x4="Science",
    x5="Material Thread Science",
    x6="I am making somthing that will do something awesome!!! Answer the question 'will it be amazing?'",
)

In [21]:
# End-to-end check of the reloaded artifacts: build the feature row from the
# dummy payload with the reloaded wrangler and encoder, then score it with the
# reloaded prediction function. The last line displays the probability.
result = wrangler1(test_1, encoder1)
prediction = kick_boost1(result)
prediction

0.37