## Reading in the Data and Cleaning

In [16]:
# Import the necessary Libraries 
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn import preprocessing
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import time 
import matplotlib.pyplot as plt 

# Read in the data 
data = pd.read_csv('kickstarter_projects_data/ks-projects-201612.csv', 
                   delimiter = ',', encoding='ISO-8859-1')

# Delete unnecessary columns 
del data['Unnamed: 13']
del data['Unnamed: 14']
del data['Unnamed: 15']
del data['Unnamed: 16']
del data['currency ']
del data['category ']
del data['name ']
del data['ID ']
del data['usd pledged ']

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
# Segement the Date string from the Launch and Deadline columns 
diff = []
launch = []
dead = []
for i in range(len(data)):
    launch.append(data['launched '][i].split(' ')[0])
    dead.append(data['deadline '][i].split(' ')[0])
launch = pd.Series(launch)
dead = pd.Series(dead)

# Convert the the Launch and deadline columns into datetime objects 
launch = pd.to_datetime(launch, format='%m/%d/%y', errors='coerce')
dead = pd.to_datetime(dead, format='%m/%d/%y', errors='coerce')
data['launch'] = launch
data['dead'] = dead

# Loop through and get the total number of days that the project was live on Kickstarter 
for i in range(len(data)):
    diff.append((data['dead'][i] - data['launch'][i]).days)
    
# Create a new column of the number of days 
diff = pd.Series(diff)
data['Length'] = diff

In [18]:
# Check if the row is not aligned with the columns and drop those indexes 
check = ['failed', 'successful', 'canceled', 'live', 'undefined', 'suspended']
count = []
for i in range(len(data)):
    if data['state '][i].isdigit() == True:
        count.append(i)
    elif data['state '][i] not in check:
        count.append(i)
data = data.drop(data.index[count])
data = data.reset_index(drop=True)

# Check if the amount donated is larger than the amount requested and then label it 
# as either successful or failed
data['goal '] = data['goal '].astype('float32')
data['pledged '] = data['pledged '].astype('float32')
data['Diff'] = data['goal ']-data['pledged ']
data['backers '] = data['backers '].astype('int64')

data['goal '] = data['goal '].astype('int64')
data['pledged '] = data['pledged '].astype('int64')
print("Preparing to re-Format")

prep=[]
for i in range(len(data)):
  if data['Diff'][i] <= 0:
    prep.append('successful')
  else: 
    prep.append('failure')

data['result'] = pd.Series(prep)

# Remove severly extreme outliers 
data = data[data['Length'] < 3000.0]

Preparing to re-Format


In [19]:
# Create a list of the columns and remove all non-predictor columns
lst = list(data.columns)
lst.remove('launched ')
lst.remove('deadline ')
lst.remove('state ')
lst.remove('dead')
lst.remove('launch')
lst.remove('Diff')
lst.remove('result')
lst.remove('pledged ')

# Function to determine which of our predictors are categorical 
def cat_features(dataframe):
    td = pd.DataFrame({'a': [1,2,3], 'b': [1.0, 2.0, 3.0]})
    return filter(lambda x: not(dataframe[x].dtype in [td['a'].dtype, 
                                          td['b'].dtype]), list(dataframe))

# Specify the x and the y variables and One-hot encode the categorical predictors 
data_x = data[lst]
data_x = pd.get_dummies(data_x, columns=list(cat_features(data_x[lst])))
data_y = data['result']

## Preprocessing and Train Test Splits

In [20]:
# Build the Label Encoding Pipeline for the Target Variable 
le = preprocessing.LabelEncoder()

# Build the Imputer Pipeline to fill NaN values 
imp = preprocessing.Imputer(missing_values = 'NaN', strategy = 'mean', axis=0)

# Build the Scaler Pipeline for the Predictors 
scaler = preprocessing.MinMaxScaler()
#scaler = preprocessing.StandardScaler()

# Pass the Target Variable through the Label Encoding Pipeline 
data_y = le.fit_transform(data_y)

# Split the data into Training and Testing Sets 
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, 
                                                    test_size=0.2, random_state=4)

# Pass the training Predictors through the Preprocessing Pipeline 
train_x_pp = imp.fit_transform(x_train)
train_x_pp = scaler.fit_transform(train_x_pp)

# Pass the testing Predictors through the Preprocessing Pipeline 
test_x_pp = imp.fit_transform(x_test)
test_x_pp = scaler.fit_transform(test_x_pp)




## Fit the Random Forest with Mulitiple Combinations of Hyper-parameters

## Fit the Random Forest Model with Multiple Combinations of Hyper-Parameters

In [21]:
# Check error metrics for a Random Forest Model with different sets of hyper-parameters 
n_ests = [5,10,50,100]
depths = [2,4,6,8]
for n in n_ests:
    for dp in depths:
        mod = ensemble.RandomForestClassifier(n_estimators=n, max_depth=dp)
        mod.fit(train_x_pp, y_train)
        y_hat = mod.predict(test_x_pp)
        print('-----Evaluating Model: n_estimators = ' + str(n) + ', max_depth = ' + str(dp) + ' ------')
        print(accuracy_score(y_hat, y_test))
        print(f1_score(y_hat, y_test))

-----Evaluating Model: n_estimators = 5, max_depth = 2 ------
0.64342416786593
0.03517146087174978
-----Evaluating Model: n_estimators = 5, max_depth = 4 ------
0.6645312040604738
0.2001844678103671
-----Evaluating Model: n_estimators = 5, max_depth = 6 ------
0.8260371694288411
0.7483322140138795
-----Evaluating Model: n_estimators = 5, max_depth = 8 ------
0.8616127384986769
0.8253422651016542
-----Evaluating Model: n_estimators = 10, max_depth = 2 ------
0.6785974034012658
0.2586908416018274
-----Evaluating Model: n_estimators = 10, max_depth = 4 ------
0.8113055723194528
0.723880259046239
-----Evaluating Model: n_estimators = 10, max_depth = 6 ------
0.8469275644894233
0.8063202412186239
-----Evaluating Model: n_estimators = 10, max_depth = 8 ------
0.8584869164229454
0.8286073055081806
-----Evaluating Model: n_estimators = 50, max_depth = 2 ------
0.795103910372468
0.6776384662203285
-----Evaluating Model: n_estimators = 50, max_depth = 4 ------
0.8431053959116723
0.79744281290580

## Build the best Random Forest Model

In [22]:
mod = ensemble.RandomForestClassifier(n_estimators=100, max_depth=8)
start = time.time()
mod.fit(train_x_pp, y_train)
y_hat = mod.predict(test_x_pp)
end = time.time()
print("Training and Predicting took approximatly: " + str(end-start))
print("------Random Forest Model With n_estimators = 50, and max_depth = 8---------")
print("Accuracy Score: " + str(accuracy_score(y_hat, y_test)))
print("F1 Score: " + str(f1_score(y_hat, y_test)))

Training and Predicting took approximatly: 23.649368047714233
------Random Forest Model With n_estimators = 50, and max_depth = 8---------
Accuracy Score: 0.8472525261903656
F1 Score: 0.8187643440741761


## Test for a better Score by using K-Fold Cross Validation

In [23]:
start = time.time()
k_fold = KFold(n_splits=5, shuffle=True, random_state=4) # This is 5-fold CV
k_fold_scores = cross_val_score(mod, data_x, data_y, scoring='f1_macro', cv=k_fold)
print('CV Score (F1 Macro for K-Fold): ' + str(k_fold_scores))
print("Mean Score of The K-Fold CV: " + str(k_fold_scores.mean()))
end = time.time()
print("K-Fold Training and Predicting took approximatly: " + str(end - start))

CV Score (F1 Macro for K-Fold): [0.89311409 0.89669826 0.89503929 0.89953726 0.89743909]
Mean Score of The K-Fold CV: 0.8963655981259914
K-Fold Training and Predicting took approximatly: 105.10018515586853


## Read in and Clean the Validation Data

In [24]:
val_data = pd.read_csv('kickstarter_projects_data/ks-projects-201801.csv', 
                   delimiter = ',', encoding='ISO-8859-1')

del val_data['currency']
del val_data['category']
del val_data['name']
del val_data['ID']
del val_data['usd_pledged_real']
del val_data['usd_goal_real']
del val_data['usd pledged']

In [25]:
# Segement the Date string from the Launch and Deadline columns 
diff = []
launch = []
dead = []
for i in range(len(val_data)):
    launch.append(val_data['launched'][i].split(' ')[0])
    dead.append(val_data['deadline'][i])
launch = pd.Series(launch)
dead = pd.Series(dead)

# Convert the the Launch and deadline columns into datetime objects 
launch = pd.to_datetime(launch, format='%Y/%m/%d', errors='coerce')
dead = pd.to_datetime(dead, format='%Y/%m/%d', errors='coerce')
val_data['launch'] = launch
val_data['dead'] = dead

# Loop through and get the total number of days that the project was live on Kickstarter 
for i in range(len(val_data)):
    diff.append((val_data['dead'][i] - val_data['launch'][i]).days)
    
# Create a new column of the number of days 
diff = pd.Series(diff)
val_data['Length'] = diff

In [26]:
# Check if the row is not aligned with the columns and drop those indexes 
check = ['failed', 'successful', 'canceled', 'live', 'undefined', 'suspended']
count = []
for i in range(len(val_data)):
    if val_data['state'][i].isdigit() == True:
        count.append(i)
    elif val_data['state'][i] not in check:
        count.append(i)
if len(count) > 0:
    val_data = val_data.drop(val_data.index[count])
    val_data = val_data.reset_index(drop=True)

# Check if the amount donated is larger than the amount requested and then label it 
# as either successful or failed
val_data['goal '] = val_data['goal'].astype('float32')
val_data['pledged '] = val_data['pledged'].astype('float32')
val_data['Diff'] = val_data['goal']-val_data['pledged']
val_data['backers '] = val_data['backers'].astype('int64')

val_data['goal '] = val_data['goal '].astype('int64')
val_data['pledged '] = val_data['pledged '].astype('int64')
print("Preparing to re-Format")

prep=[]
for i in range(len(val_data)):
  if val_data['Diff'][i] <= 0:
    prep.append('successful')
  else: 
    prep.append('failure')

val_data['result'] = pd.Series(prep)
val_data['main_category '] = val_data['main_category']
val_data['country '] = val_data['country']
del val_data['main_category']
del val_data['country']

# Remove severly extreme outliers 
val_data = val_data[val_data['Length'] < 3000.0]

Preparing to re-Format


In [27]:
# Create a list of the columns and remove all non-predictor columns
val_lst = list(val_data.columns)
val_lst.remove('launched')
val_lst.remove('deadline')
val_lst.remove('state')
val_lst.remove('dead')
val_lst.remove('launch')
val_lst.remove('Diff')
val_lst.remove('result')
val_lst.remove('pledged')
val_lst.remove('goal')
val_lst.remove('pledged ')
val_lst.remove('backers')

# Function to determine which of our predictors are categorical 
def cat_features(dataframe):
    td = pd.DataFrame({'a': [1,2,3], 'b': [1.0, 2.0, 3.0]})
    return filter(lambda x: not(dataframe[x].dtype in [td['a'].dtype, 
                                          td['b'].dtype]), list(dataframe))

# Specify the x and the y variables and One-hot encode the categorical predictors 
val_data_x = val_data[val_lst]
val_data_x = pd.get_dummies(val_data_x, columns=list(cat_features(val_data_x[val_lst])))
val_data_y = val_data['result']

# Change names that do not match up with the 
val_data_x['country _N,"0'] = val_data_x['country _N,0"'] 
del val_data_x['country _N,0"']
del val_data_x['country _JP']


## Preprocess the Validation Data 

In [28]:
# Pass the Target Variable through the Label Encoding Pipeline 
val_data_y = le.fit_transform(val_data_y)

# Pass the training Predictors through the Preprocessing Pipeline 
val_x_pp = imp.transform(val_data_x)
val_x_pp = scaler.transform(val_x_pp)


## Make predictions on the Validation Data 

In [29]:
# Make Predictions with the Random Forest Model 
val_y_hat = mod.predict(val_x_pp)

# View Random Forest Predictions 
print("------Random Forest Model With n_estimators = 50, and max_depth = 8---------")
print("Accuracy Score: " + str(accuracy_score(val_y_hat, val_data_y)))
print("F1 Score: " + str(f1_score(val_y_hat, val_data_y)))

------Random Forest Model With n_estimators = 50, and max_depth = 8---------
Accuracy Score: 0.36102087921955134
F1 Score: 0.530358085654834


In [30]:
# Make and View Predictions with the K-fold Model 
val_k_fold_scores = cross_val_score(mod, val_x_pp, val_data_y, scoring='f1_macro', cv=k_fold)
print('CV Score (F1 Macro for K-Fold): ' + str(val_k_fold_scores))
print("Mean Score of The K-Fold CV: " + str(val_k_fold_scores.mean()))

CV Score (F1 Macro for K-Fold): [0.89276626 0.88849037 0.89084706 0.89205283 0.8921951 ]
Mean Score of The K-Fold CV: 0.8912703213019665
