# Kickstarter - Ensemlbe Method (Random Forest)

In [49]:
import pandas as pd
import numpy as np

# Modelling and evaluation libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor #Regressor! NOT Classifier!
from sklearn import metrics
from sklearn.metrics import r2_score

# Scaling
from sklearn.preprocessing import RobustScaler

# Preprocessing
from sklearn import preprocessing
from sklearn import utils

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Read data

In [50]:
df = pd.read_csv('data/Kickstarter_cleaned2.csv')
df = df.drop(['Unnamed: 0'], axis=1)

In [51]:
df.columns.sort_values

<bound method Index.sort_values of Index(['backers_count', 'id', 'usd_pledged', 'category_name', 'category_id',
       'category_parent_id', 'usd_goal', 'duration_days', 'duration_days_prep',
       'year_deadline', 'month_deadline', 'weekday_deadline',
       'weekday_launched_at', 'winter_deadline_True', 'spring_deadline_True',
       'summer_deadline_True', 'deadline_weekend_True',
       'launched_weekend_True', 'country_US_True', 'eastcoast_True',
       'long_blurb_True', 'long_name_True', 'state_b_True',
       'long_creator_name_True'],
      dtype='object')>

## Modelling & Predicting

In [52]:
# Global random_state
rand = 1

# Define variables

X = df[['category_parent_id', 'usd_goal', 'duration_days', 'duration_days_prep',
       'year_deadline', 'winter_deadline_True', 'spring_deadline_True',
       'summer_deadline_True', 'deadline_weekend_True',
       'launched_weekend_True', 'eastcoast_True', 'long_blurb_True',
       'long_name_True', 'long_creator_name_True']]

y = df['usd_pledged']

In [53]:
# Splitting the data set into test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rand)

In [54]:
# Train the model
rforest = RandomForestRegressor(max_features='sqrt', n_estimators=200, n_jobs=-1,
                                 verbose=1, random_state=rand)
rforest.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    6.4s finished


RandomForestRegressor(max_features='sqrt', n_estimators=200, n_jobs=-1,
                      random_state=1, verbose=1)

In [55]:
y_pred = rforest.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.3s finished


## Results

In [58]:
# Get the average number of nodes and the depth

n_nodes = []
max_depths = []

# Stats about the trees in random forest
for ind_tree in rforest.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes:\n {int(np.mean(n_nodes))}')
print(f'Average maximum depth:\n {int(np.mean(max_depths))}')

Average number of nodes:
 155581
Average maximum depth:
 36


In [59]:
# Calculate r^2
r2 = r2_score(y_test, y_pred, multioutput='uniform_average')

# Return evaluation
print('Mean of usd_pledged:\n', round(df.usd_pledged.mean()))
print('Mean Absolute Error:\n', round(metrics.mean_absolute_error(y_test, y_pred)))
print('Mean Squared Error:\n', round(metrics.mean_squared_error(y_test, y_pred)))
print('Root Mean Squared Error:\n', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))
print('R^2:\n', round(r2,ndigits=2))

Mean of usd_pledged:
 12297
Mean Absolute Error:
 13056
Mean Squared Error:
 5921640561
Root Mean Squared Error:
 76952
R^2:
 0.19


**Description:**<br/>
* In general **'R Squared'** tells you how related two things are, like correlation. 'R Squared' is the percentage of variation (i.e. varies from 0 to 1) explained by the relationship between two or more variables. So, it measures how much of variability in the dependent variable can be explained by the model.
* The **'R Squared' of 19%** for this model - with its maximum numeric feature combination - is very low. But higher than with fewer features.

**Conclusion:**<br/>
* With a **'R Squared' of 19%**, only 19% of the variability of the dependet variables can be explained by the model.
* Therefore, the model (Random Forest) with its features is not capable of predicting the pledged amount in USD. **The model is underfitting**
* Using other models suitable for regression issues might increase the quality predicting the pledged amount.