# Kickstarter - Ensemble Method (Random Forest)

In [1]:
import pandas as pd
import numpy as np

# Modelling and evaluation libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor #Regressor! NOT Classifier!
from sklearn import metrics
from sklearn.metrics import r2_score

# Scaling
from sklearn.preprocessing import RobustScaler

# Preprocessing
from sklearn import preprocessing
from sklearn import utils

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Read data

In [2]:
# Load the cleaned Kickstarter data and discard the index column that was
# written out as 'Unnamed: 0' when the CSV was saved
df = pd.read_csv('data/Kickstarter_cleaned.csv').drop(columns=['Unnamed: 0'])

## Dummy creation

In [3]:
# One-hot encode the campaign outcome. drop_first=True omits the first
# category so the remaining indicator column(s) are not redundant.
dummies = pd.get_dummies(df['state'], prefix='state', drop_first=True)

# Append the dummies to the data frame. Dummy columns left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# create duplicate 'state_*' columns.
df = pd.concat([df.drop(columns=dummies.columns, errors='ignore'), dummies], axis=1)

df.head(2)

Unnamed: 0,backers_count,blurb,country,id,name,slug,state,usd_pledged,category_name,creator_name,location_name,location_state,created_at_rd,deadline_rd,launched_at_rd,state_changed_at_rd,usd_goal,duration_days,d_pledged_goal_usd,state_successful
0,47,A colorful Dia de los Muertos themed oracle de...,US,928751314,The Ofrenda Oracle Deck,the-ofrenda-oracle-deck,successful,1950.0,Playing Cards,Lisa Vollrath,Euless,TX,Wed Aug 2 16:28:13 2017,Sat Sep 9 19:00:59 2017,Thu Aug 10 19:00:59 2017,Sat Sep 9 19:00:59 2017,1000.0,30.0,950.0,1
1,271,"Electra's long awaited, eclectic Debut Pop/Roc...",US,928014092,"Record Electra's Debut Album (Pop, Rock, Class...",record-electras-debut-album-pop-rock-classical,successful,22404.0,Rock,Electra,Hollywood,CA,Sun Sep 30 08:45:33 2012,Wed Jun 12 07:03:15 2013,Mon May 13 07:03:15 2013,Wed Jun 12 07:03:15 2013,15000.0,30.0,7404.0,1


## Visualisation

In [4]:
# List the column names currently in the data frame
df.columns

Index(['backers_count', 'blurb', 'country', 'id', 'name', 'slug', 'state',
       'usd_pledged', 'category_name', 'creator_name', 'location_name',
       'location_state', 'created_at_rd', 'deadline_rd', 'launched_at_rd',
       'state_changed_at_rd', 'usd_goal', 'duration_days',
       'd_pledged_goal_usd', 'state_successful'],
      dtype='object')

## Modelling

In [5]:
# Seed reused by the train/test split and the forest so runs are reproducible
rand = 1

# Feature matrix: funding goal, number of backers and the outcome dummy
# NOTE(review): backers_count and state_successful look like values that are
# only known once a campaign has ended - confirm they are meant to be features
X = df.loc[:, ['usd_goal', 'backers_count', 'state_successful']]

# Regression target: amount pledged in USD
y = df.loc[:, 'usd_pledged']

In [6]:
# Hold out 25 % of the rows as test data; the remaining rows are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=rand,
)

### Scaling x-values

In [7]:
# Only these continuous columns are scaled; every other column is passed through
col_scale = ['backers_count', 'usd_goal']

# Fit the scaler on the training split only, then apply it to both splits
scaler = RobustScaler().fit(X_train[col_scale])
X_train_scaled = scaler.transform(X_train[col_scale])
X_test_scaled = scaler.transform(X_test[col_scale])

# Scaled columns first, followed by the remaining (unscaled) columns
X_train_preprocessed = np.hstack([X_train_scaled, X_train.drop(columns=col_scale).to_numpy()])
X_test_preprocessed = np.hstack([X_test_scaled, X_test.drop(columns=col_scale).to_numpy()])

In [8]:
# Disabled experiment: scaling the target. Everything in this cell is commented out.
# NOTE(review): as written this would not run - the new RobustScaler is never
# fitted before transform(), and y_train is a Series, so y_train[col_scale]
# does not select a column. Fix or delete before re-enabling.

# Define columns to scale
#col_scale = ['usd_pledged']

# Scale columns
#scaler = RobustScaler()
#y_train_scaled = scaler.transform(y_train[col_scale])
#y_test_scaled = scaler.transform(y_test[col_scale])

# Concatenating scaled and dummy columns 
#y_train_preprocessed = np.concatenate([y_train_scaled, y_train.drop(col_scale, axis=1)], axis=1)
#y_test_preprocessed = np.concatenate([y_test_scaled, y_test.drop(col_scale, axis=1)], axis=1)

In [9]:
# Disabled workaround for "ValueError: Unknown label type: 'continuous'".
# Presumably the error came from fitting a classifier on the continuous
# usd_pledged target; the notebook imports RandomForestRegressor instead, so
# label-encoding the target should not be needed - confirm and delete.

#lab_enc = preprocessing.LabelEncoder()
#y_train_encoded = lab_enc.fit_transform(y_train)
#print(y_train_encoded)
#print(utils.multiclass.type_of_target(y_train))
#print(utils.multiclass.type_of_target(y_train.astype('int')))
#print(utils.multiclass.type_of_target(y_train_encoded))

In [10]:
# Train the model
rforest = RandomForestRegressor(max_features='sqrt', n_estimators=200, n_jobs=1,
                                 verbose=1, random_state=rand)
rforest.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   17.5s finished


RandomForestRegressor(max_features='sqrt', n_estimators=200, n_jobs=1,
                      random_state=1, verbose=1)

In [11]:
y_pred = rforest.predict(X_test_scaled)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.5s finished


## Model Evaluation

In [12]:
# Error metrics on the held-out test split; the MSE is computed once and
# reused for the RMSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# Mean of the target over the whole data set, as a scale reference for the errors
print('Mean of usd_pledged:', df['usd_pledged'].mean())

Mean Absolute Error: 6090.130405353872
Mean Squared Error: 2578174633.9328074
Root Mean Squared Error: 50775.72878780577
Mean of usd_pledged: 12297.144432358278


## Model Optimization

In [13]:
# Coefficient of determination on the test split
# ('uniform_average' is r2_score's default multioutput setting)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.647510376918945

R-squared (R²) measures how much of the variability in the dependent variable is explained by the model.