# Kickstarter - Multi Regression

In [10]:
import pandas as pd
import numpy as np

# Modelling and evaluation libraries
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
import statsmodels.formula.api as smf

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Read data

In [11]:
# Load the cleaned Kickstarter CSV and discard its 'Unnamed: 0' column
df = pd.read_csv('data/Kickstarter_cleaned.csv').drop(columns=['Unnamed: 0'])

In [12]:
# Show the first row of the loaded frame
df.head(n=1)

Unnamed: 0,backers_count,blurb,country,id,name,slug,state,usd_pledged,category_name,creator_name,location_name,location_state,created_at_rd,deadline_rd,launched_at_rd,state_changed_at_rd,usd_goal,duration_days,d_pledged_goal_usd
0,47,A colorful Dia de los Muertos themed oracle de...,US,928751314,The Ofrenda Oracle Deck,the-ofrenda-oracle-deck,successful,1950.0,Playing Cards,Lisa Vollrath,Euless,TX,Wed Aug 2 16:28:13 2017,Sat Sep 9 19:00:59 2017,Thu Aug 10 19:00:59 2017,Sat Sep 9 19:00:59 2017,1000.0,30.0,950.0


## Dummy creation

In [13]:
# Create 0/1 indicator column(s) for the campaign state.
# drop_first=True removes the first category level, so k states yield k-1
# columns; dtype=int keeps the indicators numeric (0/1) instead of the
# boolean dtype that newer pandas versions produce by default.
dummies = pd.get_dummies(df.state, prefix='state', drop_first=True, dtype=int)

# Concatenate the dummies to the data frame. Indicator columns left over from
# an earlier run of this cell are dropped first, so re-running the cell does
# not append duplicate `state_*` columns.
df = pd.concat(
    [df.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)

In [14]:
# Show the first row again to check the appended indicator column
df.head(n=1)

Unnamed: 0,backers_count,blurb,country,id,name,slug,state,usd_pledged,category_name,creator_name,location_name,location_state,created_at_rd,deadline_rd,launched_at_rd,state_changed_at_rd,usd_goal,duration_days,d_pledged_goal_usd,state_successful
0,47,A colorful Dia de los Muertos themed oracle de...,US,928751314,The Ofrenda Oracle Deck,the-ofrenda-oracle-deck,successful,1950.0,Playing Cards,Lisa Vollrath,Euless,TX,Wed Aug 2 16:28:13 2017,Sat Sep 9 19:00:59 2017,Thu Aug 10 19:00:59 2017,Sat Sep 9 19:00:59 2017,1000.0,30.0,950.0,1


In [15]:
#df.columns

## Visualisation

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the selected
# columns, coloured by campaign state; corner=True draws only the lower
# triangle of the grid.
pairplot_columns = ['usd_pledged', 'usd_goal', 'backers_count', 'state_successful', 'state']
features = df[pairplot_columns]
sns.pairplot(
    features,
    hue='state',
    diag_kind='hist',
    palette='rainbow',
    height=1.5,
    corner=True,
);

## Modelling

In [None]:
# Predictors (X) and regression target (y)
X = df.loc[:, ['usd_goal', 'backers_count', 'state_successful']]
y = df.loc[:, 'usd_pledged']

In [None]:
# Split the data into train and test sets: 25 % of the rows are held out
# for testing, and random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Fit a linear regression on the training split
multi_lin_reg_mod = linear_model.LinearRegression()
multi_lin_reg_mod.fit(X=X_train, y=y_train)

In [None]:
# Predict usd_pledged for the held-out test rows
y_pred = multi_lin_reg_mod.predict(X=X_test)

## Model Evaluation

In [None]:
# Statsmodels OLS with the same predictors as the scikit-learn model, shown
# for its regression summary table.
# Requires `import statsmodels.formula.api as smf` in the import cell;
# without it this cell fails with a NameError on a fresh kernel.
# NOTE: the model is fitted on the full `df`, not only on the training split,
# so its R^2 is in-sample and not directly comparable to the test-set metrics.
ols_results = smf.ols(
    formula='usd_pledged ~ usd_goal + backers_count + state_successful',
    data=df,
).fit()
ols_results.summary()

In [None]:
# Test-set error metrics for the scikit-learn model; each is computed once
# and then reported.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred, multioutput='uniform_average')

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
# Mean of the target over the whole data frame, as a scale reference
print('Mean of usd_pledged:', df.usd_pledged.mean())
print('R^2:', r2)

## Model Optimization

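R² (R-squared) measures how much of the variability in the dependent variable can be explained by the model.
Adding the feature 'd_pledged_goal_usd' as a trial to optimize the model's results leads to an R² of 1, which is not realistic. The reason is target leakage: 'd_pledged_goal_usd' is the difference between 'usd_pledged' and 'usd_goal' (e.g. 1950 − 1000 = 950 in the first row), so together with 'usd_goal' it reconstructs the target 'usd_pledged' exactly. It must therefore not be used as a predictor.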