In [1]:
# Import necessary modules and functions

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 16, 8

from scipy.stats import norm
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

In [4]:
# Import the training set file into a Pandas DataFrame.

db = pd.read_csv("KS_train_data.csv",delimiter=',')

In [5]:
db.head()

Unnamed: 0,project_id,backers_count,blurb,category,converted_pledged_amount,country,created_at,currency,deadline,fx_rate,...,launched_at,name,pledged,staff_pick,usd_pledged,location,funded,subcategory,project_url,reward_url
0,KS_000000,80,I will be an artist-in-residence at Elsewhere ...,art,3596,US,1325980505,USD,1334866560,1.0,...,1332969260,Elsewhere Studios artist-in-residency program!,3596.0,False,3596.0,"Paonia, CO",True,mixed media,https://www.kickstarter.com/projects/hilaryeme...,https://www.kickstarter.com/projects/hilaryeme...
1,KS_000001,82,1000 Artists is a public art-making installati...,art,4586,US,1330926084,USD,1336440145,1.0,...,1332984145,1000 Artists: Presidential Inauguration 2013,4586.0,False,4586.0,"Washington, DC",True,public art,https://www.kickstarter.com/projects/17146650/...,https://www.kickstarter.com/projects/17146650/...
2,KS_000002,21,"The Sequel to ‘My Favorite Machine”, ""MyPhoneH...",art,5217,US,1332382894,USD,1338175739,1.0,...,1332991739,MyPhoneHenge,5217.0,False,5217.0,"Frisco, TX",True,sculpture,https://www.kickstarter.com/projects/belveal/m...,https://www.kickstarter.com/projects/belveal/m...
3,KS_000003,37,A film exploring the role and value of art edu...,art,7160,GB,1332342451,USD,1334806096,1.0,...,1332991696,Walk-Through,7160.0,False,7160.0,"Glasgow, UK",True,art,https://www.kickstarter.com/projects/107813091...,https://www.kickstarter.com/projects/107813091...
4,KS_000004,153,We need to build a kitchen for Habitable Space...,art,15445,US,1328562617,USD,1335584240,1.0,...,1332992240,A kitchen for Habitable Spaces,15445.0,False,15445.0,"Kingsbury, TX",True,public art,https://www.kickstarter.com/projects/104409276...,https://www.kickstarter.com/projects/104409276...


In [6]:
db.isnull().sum()

project_id                  0
backers_count               0
blurb                       2
category                    0
converted_pledged_amount    0
country                     3
created_at                  0
currency                    0
deadline                    0
fx_rate                     0
goal                        0
launched_at                 0
name                        1
pledged                     0
staff_pick                  0
usd_pledged                 0
location                    0
funded                      0
subcategory                 0
project_url                 0
reward_url                  0
dtype: int64

In [7]:
# replace NaaN values in country with "?"
db.country.fillna('?',inplace = True)

In [8]:
dbc1 = db.copy()

In [9]:
#Choosing the columns for the new database
dbc1=dbc1[['project_id','country','currency','category','goal','staff_pick','location','subcategory','funded']]

In [10]:
dbc1.head()

Unnamed: 0,project_id,country,currency,category,goal,staff_pick,location,subcategory,funded
0,KS_000000,US,USD,art,2800.0,False,"Paonia, CO",mixed media,True
1,KS_000001,US,USD,art,4500.0,False,"Washington, DC",public art,True
2,KS_000002,US,USD,art,5000.0,False,"Frisco, TX",sculpture,True
3,KS_000003,GB,USD,art,6500.0,False,"Glasgow, UK",art,True
4,KS_000004,US,USD,art,15000.0,False,"Kingsbury, TX",public art,True


In [11]:
dbc1.isnull().sum()

project_id     0
country        0
currency       0
category       0
goal           0
staff_pick     0
location       0
subcategory    0
funded         0
dtype: int64

In [12]:
# converting goal into intervals
dbc1['goal'] = pd.cut(dbc1.goal,
                      bins=[0,5000,10000,25000,50000,75000,100000,250000,500000,750000,1000000,2500000,5000000,7500000,10000000,25000000,50000000,75000000,100000000],
                      labels=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18'])

In [13]:
dbc1['goal']

0        1
1        1
2        1
3        2
4        3
        ..
99995    2
99996    3
99997    2
99998    3
99999    1
Name: goal, Length: 100000, dtype: category
Categories (18, object): ['1' < '2' < '3' < '4' ... '15' < '16' < '17' < '18']

In [14]:
#One-hot encoding the categorical data
dbc1_hot = pd.get_dummies(dbc1, drop_first=True, columns=['country','category', 'currency','staff_pick','location','subcategory',])

In [15]:
dbc1_hot.head()

Unnamed: 0,project_id,goal,funded,country_AE,country_AF,country_AG,country_AM,country_AQ,country_AR,country_AT,...,subcategory_wearables,subcategory_weaving,subcategory_web,subcategory_webcomics,subcategory_webseries,subcategory_woodworking,subcategory_workshops,subcategory_world music,subcategory_young adult,subcategory_zines
0,KS_000000,1,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,KS_000001,1,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,KS_000002,1,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,KS_000003,2,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,KS_000004,3,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Get Target data 
y = dbc1_hot['funded']

#Load X Variables into a Pandas Dataframe with columns 
X = dbc1_hot.drop(['project_id','funded'], axis = 1)

In [17]:
print(f'X : {X.shape}')

X : (100000, 10631)


In [18]:
# Divide Data into Train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

In [19]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (80000, 10631)
y_train : (80000,)
X_test : (20000, 10631)
y_test : (20000,)


In [20]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [2,6]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [21]:
# Create the param grid
param_grid = {#'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,}
               #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
              # 'bootstrap': bootstrap}
print(param_grid)


{'max_features': ['auto', 'sqrt'], 'max_depth': [2, 6]}


In [34]:
#The max_depth parameter is adjusted by hand since the grid search for deeper levels took a long time to compute.
rf_Model = RandomForestClassifier(max_depth = 70, max_features = 'auto')

In [23]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 2, verbose=True, n_jobs = 2)

In [24]:
#rf_Grid.fit(X_train, y_train)

In [25]:
#rf_Grid.best_params_ 

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std

In [27]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [35]:
scores = cross_val_score(rf_Model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)

In [36]:
scores

array([0.757 , 0.7485, 0.765 , 0.763 , 0.761 , 0.758 , 0.7505, 0.7745,
       0.761 , 0.773 , 0.7615, 0.748 , 0.7655, 0.7675, 0.7525, 0.752 ,
       0.7655, 0.769 , 0.7605, 0.7715, 0.7685, 0.7515, 0.7715, 0.7545,
       0.7565, 0.746 , 0.773 , 0.7635, 0.7625, 0.759 ])