### Initial Imports

In [2]:
#Imports
import category_encoders as ce
import pandas as pd
import joblib
from joblib import dump, load
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import  f1_score, roc_auc_score, roc_curve, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier

In [32]:
# Read the two pre-cleaned CSV shards (already reduced to the necessary columns)
# and stack them into a single frame.
df1a = pd.read_csv('data/Clean_Data_1.csv')
df1b = pd.read_csv('data/Clean_Data_2.csv')
# ignore_index=True already gives the result a fresh 0..n-1 index, so no
# separate in-place reset_index is needed afterwards.
df1 = pd.concat([df1a, df1b], ignore_index=True)
print(df1.shape)
df1.head()

(179456, 8)


Unnamed: 0,blurb,country,goal,launched_at,campaign_success,category,subcategory,campaign_length
0,Support great art! Join us as we re-stage the ...,the United States,3000.0,1580433192,1,Dance,Performances,30.0
1,JinBucha is a new kind of Brewery in North Par...,the United States,20000.0,1447526057,0,Food,Drinks,30.0
2,"""Taste The Scent of Tea"" Rose and Blanc Tea Ro...",the United States,15000.0,1518208887,0,Food,Drinks,23.430243
3,Bringing Philly cheesesteaks (and other delici...,the United States,2000.0,1454705444,0,Food,Food Trucks,59.958333
4,a frog plushie keychain and frog butt pin for ...,the United States,800.0,1573236000,1,Design,Product Design,14.0


In [33]:
# Number of campaigns per top-level category
df1.loc[:, 'category'].value_counts()

Music           25651
Film & video    25205
Art             19287
Technology      18724
Publishing      18201
Food            13999
Games           11089
Fashion          9735
Comics           6816
Design           6313
Photography      6135
Crafts           6016
Theater          5182
Journalism       4027
Dance            3076
Name: category, dtype: int64

In [34]:
# Cast the text columns to str and the target to int so later processing
# sees consistent dtypes.
for column, dtype in {'blurb': str, 'country': str, 'campaign_success': int}.items():
    df1[column] = df1[column].astype(dtype)

## Data Splits

### Train, Validate, Test Split

In [35]:
# Summary statistics of the unix launch timestamps (reference for the split cutoffs)
df1.loc[:, 'launched_at'].describe()

count    1.794560e+05
mean     1.472947e+09
std      7.402606e+07
min      1.240674e+09
25%      1.421997e+09
50%      1.470489e+09
75%      1.534352e+09
max      1.599797e+09
Name: launched_at, dtype: float64

In [36]:
# Time-based split on the unix launch timestamp; the two cutoffs sit close to
# the 50th and 75th percentiles of launched_at.
cutoff1 = 1466003000
cutoff2 = 1530120000

launched = df1['launched_at']

# Oldest campaigns form the training set, the window between the cutoffs is
# used for validation, and everything from cutoff2 onwards is the test set.
train = df1[launched <= cutoff1]
val = df1[(launched > cutoff1) & (launched < cutoff2)]
test = df1[launched >= cutoff2]

# launched_at only drives the split; it is not used as a model feature
train, val, test = (part.drop(columns=['launched_at']) for part in (train, val, test))

for part in (train, val, test):
    print(part.shape)

(86636, 7)
(45201, 7)
(47619, 7)


In [37]:
# Class balance of the target within the training split (as proportions)
train.loc[:, 'campaign_success'].value_counts(normalize=True)

1    0.55979
0    0.44021
Name: campaign_success, dtype: float64

### Target Vector / Features Matrix Split

In [38]:
# Target: the campaign outcome flag
target = 'campaign_success'

# Features: every remaining column of the training frame, in its original order
features = [column for column in train.columns if column != target]

In [39]:
# Break each partition into its feature matrix (X_*) and target vector (y_*)
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

## Wrangle Data

### Create Encoder

In [40]:
# Ordinal-encode the categorical features; the encoder is fitted on X_train only
encoder = ce.OrdinalEncoder(cols=['country', 'category', 'subcategory']).fit(X_train)
print('')




In [41]:
def wrangle(X, encoder):
    """Encode the categorical columns and replace ``blurb`` with count features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a string ``blurb`` column.
    encoder : object
        Fitted encoder whose ``transform`` is applied to the frame.

    Returns
    -------
    pandas.DataFrame
        Re-indexed, encoded copy of ``X`` without ``blurb`` and with
        ``blurb_length``, ``blurb_words``, ``blurb_uppers``, ``blurb_qmarks``
        and ``blub_exclamation`` appended.
    """
    # Work on a re-indexed copy so the caller's frame is left untouched
    frame = encoder.transform(X.reset_index(drop=True).copy())

    blurbs = frame['blurb']
    frame['blurb_length'] = blurbs.apply(len)
    frame['blurb_words'] = blurbs.apply(lambda text: len(text.split()))
    # number of whitespace-separated words that are entirely upper-case
    frame['blurb_uppers'] = blurbs.apply(
        lambda text: sum(word.isupper() for word in text.split())
    )
    frame['blurb_qmarks'] = blurbs.apply(lambda text: text.count("?"))
    # column name spelling ('blub') is kept as is: it is the name the model is trained on
    frame['blub_exclamation'] = blurbs.apply(lambda text: text.count("!"))

    return frame.drop(columns=['blurb'])

In [42]:
# Push every split through the same fitted encoder and blurb feature engineering
X_train_e, X_val_e, X_test_e = (
    wrangle(split, encoder) for split in (X_train, X_val, X_test)
)
for encoded in (X_train_e, X_val_e, X_test_e):
    print(encoded.shape)

(86636, 10)
(45201, 10)
(47619, 10)


## XGBoost Model

In [43]:
""" Define Model"""
# Baseline XGBoost classifier: 100 trees, fixed seed, all available cores
booster = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Fit on the training split; xgb_simple holds whatever fit() returns
xgb_simple = booster.fit(X_train_e, y_train)

In [44]:
""" Run Model XGBoost Simple """
# Predict on the validation split with the fitted baseline model
xs_y_pred_v = booster.predict(X_val_e)

m_name = 'XGBoost Simple'
y, pred = y_val, xs_y_pred_v

val_report = classification_report(y, pred)
print('Classification Report:\n\n', val_report)

Classification Report:

               precision    recall  f1-score   support

           0       0.73      0.73      0.73     21196
           1       0.76      0.76      0.76     24005

    accuracy                           0.75     45201
   macro avg       0.74      0.75      0.74     45201
weighted avg       0.75      0.75      0.75     45201



In [45]:
""" Run Model"""
# Predict on the held-out test split with the already-fitted baseline model
xs_y_pred_test = booster.predict(X_test_e)

m_name = 'XGBoost Simple TEST'
# No refit here: `booster` was fitted on the training split before this cell,
# and fitting it again after the predictions are made cannot affect the report.
# `model` is kept as an alias of the fitted estimator.
model = booster
X = X_test_e
y = y_test
pred = xs_y_pred_test

print('Classification Report:\n\n', classification_report(y, pred))

Classification Report:

               precision    recall  f1-score   support

           0       0.65      0.64      0.64     15914
           1       0.82      0.83      0.82     31705

    accuracy                           0.76     47619
   macro avg       0.73      0.73      0.73     47619
weighted avg       0.76      0.76      0.76     47619



## Creating Pickles via joblib

In [46]:
# Persist the fitted encoder unchanged, then load it back from disk
joblib.dump(encoder, 'encoder_m.joblib')
encoder_m = joblib.load('encoder_m.joblib')

In [47]:
# Wrangle Function to pickle
# allows for state of incoming data
def wrangler(X, encoder):
  """Turn one raw campaign payload into a model-ready feature row.

  Parameters
  ----------
  X : dict
      Payload keyed 'x1'..'x6', renamed here to goal, campaign_length,
      country, category, subcategory and blurb respectively.
  encoder : object
      Fitted encoder; its ``transform`` is applied to the renamed frame and
      must leave country/category/subcategory as integer codes.

  Returns
  -------
  pandas.DataFrame
      Columns country, goal, category, subcategory, campaign_length followed
      by blurb_length, blurb_words, blurb_uppers, blurb_qmarks and
      blub_exclamation. ``goal`` and ``campaign_length`` are float; every
      other column is int.
  """
  # The payload keys become the index, so transpose to get them as columns
  X = pd.DataFrame.from_dict(X, orient='index').T.reset_index(drop=True)
  X = X.rename(columns={'x1':'goal',
                        'x2':'campaign_length',
                        'x3':'country',
                        'x4':'category',
                        'x5':'subcategory',
                        'x6':'blurb'})

  X = encoder.transform(X)
  # Fix the column order; .copy() so the assignments below write to an
  # independent frame rather than to a selection of the encoder output
  X = X[['country', 'goal', 'category', 'subcategory',
         'campaign_length', 'blurb']].copy()

  # Same str cast that is applied to the blurb column of the training data
  blurb = X['blurb'].astype(str)
  X['blurb_length'] = blurb.apply(len)
  X['blurb_words'] = blurb.apply(lambda x: len(x.split()))
  X['blurb_uppers'] = blurb.apply(lambda x: sum(map(str.isupper, x.split())))
  X['blurb_qmarks'] = blurb.apply(lambda x: x.count("?"))
  X['blub_exclamation'] = blurb.apply(lambda x: x.count("!"))
  X = X.drop(columns=['blurb'])

  # The transposed payload is object-typed, so numeric dtypes must be restored.
  # goal and campaign_length are fractional in the training data
  # (e.g. campaign_length 23.430243); casting them to int would truncate them
  # and feed the model values it was not trained on, so they stay float.
  float_cols = ('goal', 'campaign_length')
  dtypes = {col: (float if col in float_cols else int) for col in X.columns}
  return X.astype(dtypes)

In [48]:
# Persist the wrangler function and load it back.
# NOTE(review): pickle stores a plain function as a reference (module + name),
# not its code, so loading this file elsewhere presumably needs `wrangler` to be
# defined under the same name there -- confirm against the consuming app.
joblib.dump(wrangler, 'wrangler_m.joblib')
wrangler_m = joblib.load('wrangler_m.joblib')

In [49]:
# XGBoost Model to Pickle
def kick_boost(X, model=None):
  """Return the predicted success probability for the first row of ``X``.

  Parameters
  ----------
  X : feature rows accepted by ``model.predict_proba``.
  model : optional
      Fitted classifier exposing ``predict_proba``. Defaults to the
      notebook-level ``booster``; passing a model explicitly lets the
      function be used where that global does not exist (for example with a
      model loaded from disk).

  Returns
  -------
  The class-1 probability of the first row, rounded to 3 decimals.
  """
  if model is None:
    # fall back to the classifier fitted earlier in the notebook
    model = booster

  # [0] -> first (only) row, [1] -> probability of class 1
  prediction = model.predict_proba(X)[0][1]
  return round(prediction, 3)

In [50]:
# Persist the prediction function and load it back.
# pickle stores a function as a reference to its module and name; neither its
# code nor the global `booster` it reads ends up in this file.
dump(kick_boost, 'kick_boost_m.joblib' )
kick_boost_m = load('kick_boost_m.joblib')

# Persist the fitted classifier itself as well, so the trained model actually
# exists on disk and can be loaded without re-running the training cells.
dump(booster, 'booster_m.joblib')
booster_m = load('booster_m.joblib')

### Testing data through pickled model

In [51]:
# Dummy campaign payload using the x1..x6 keys that wrangler() renames
test_1 = dict(
  x1=10000,                      # goal
  x2=8,                          # campaign_length
  x3="Canada",                   # country
  x4="Science",                  # category
  x5="Material Thread Science",  # subcategory
  # blurb
  x6="I am making somthing that will do something awesome!!! Answer the question 'will it be amazing?'",
)

In [52]:
# End-to-end check: wrangle the dummy payload with the reloaded objects,
# then score it; the bare last line displays the probability.
result = wrangler_m(X=test_1, encoder=encoder_m)
prediction = kick_boost_m(result)
prediction

0.37