# Kickstarter - Multi Regression

In [7]:
import pandas as pd
import numpy as np

# Modelling and evaluation libraries
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Read data

In [8]:
# Load the cleaned Kickstarter data and discard the 'Unnamed: 0' column in one step
df = pd.read_csv('data/Kickstarter_cleaned2.csv').drop(columns=['Unnamed: 0'])

## Visualisation

In [11]:
# Pairplot for selected features
# NOTE(review): this cell is disabled -- everything below is commented out.
# As written, `features` is a plain list of column names and that list itself is
# passed to sns.pairplot. pairplot expects a DataFrame, so re-enabling this cell
# would presumably need `df[features]` instead -- confirm before uncommenting.
# 'state_b_True' is listed here but is not among the columns selected for X.

#features = ['category_parent_id', 'usd_goal', 'duration_days', 'duration_days_prep',
#       'year_deadline', 'winter_deadline_True', 'spring_deadline_True',
#       'summer_deadline_True', 'deadline_weekend_True',
#       'launched_weekend_True', 'eastcoast_True', 'long_blurb_True',
#       'long_name_True', 'state_b_True', 'long_creator_name_True']

#sns.pairplot(features, diag_kind='hist', palette='rainbow', height=1.5, corner=True);

## Modelling & Predicting

In [13]:
# Predictor columns fed to the regression model
feature_columns = [
    'category_parent_id',
    'usd_goal',
    'duration_days',
    'duration_days_prep',
    'year_deadline',
    'winter_deadline_True',
    'spring_deadline_True',
    'summer_deadline_True',
    'deadline_weekend_True',
    'launched_weekend_True',
    'eastcoast_True',
    'long_blurb_True',
    'long_name_True',
    'long_creator_name_True',
]
X = df[feature_columns]

# Target: the 'usd_pledged' column
y = df['usd_pledged']

In [14]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [15]:
# Create the linear regression estimator, then fit it on the training split.
# The bare .fit(...) call stays as the last expression so the cell still shows the estimator.
multi_lin_reg_mod = linear_model.LinearRegression()
multi_lin_reg_mod.fit(X=X_train, y=y_train)

LinearRegression()

In [16]:
# Apply the fitted model to the held-out features to get predicted target values
y_pred = multi_lin_reg_mod.predict(X=X_test)

## Results

In [23]:
# Fit an OLS model on the full data frame through the statsmodels formula
# interface and display its summary table as the cell output.
ols_formula = (
    'usd_pledged ~ usd_goal + category_parent_id + duration_days'
    ' + duration_days_prep + year_deadline + country_US_True'
)
ols_fit = smf.ols(formula=ols_formula, data=df).fit()
ols_fit.summary()

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.026
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,726.3
Date:,"Wed, 04 Nov 2020",Prob (F-statistic):,0.0
Time:,13:18:18,Log-Likelihood:,-2118800.0
No. Observations:,166192,AIC:,4238000.0
Df Residuals:,166185,BIC:,4238000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.338e+06,2.17e+05,-10.781,0.000,-2.76e+06,-1.91e+06
usd_goal,0.1633,0.003,62.375,0.000,0.158,0.168
category_parent_id,242.8708,32.578,7.455,0.000,179.019,306.723
duration_days,32.7748,17.594,1.863,0.062,-1.710,67.260
duration_days_prep,14.1472,1.604,8.819,0.000,11.003,17.291
year_deadline,1160.3945,107.551,10.789,0.000,949.597,1371.192
country_US_True,4912.8317,472.774,10.392,0.000,3986.205,5839.458

0,1,2,3
Omnibus:,463678.072,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32095947405.191
Skew:,36.068,Prob(JB):,0.0
Kurtosis:,2154.7,Cond. No.,85800000.0


In [24]:
# Score the test-set predictions; each metric is computed once and reused below
r2 = r2_score(y_test, y_pred, multioutput='uniform_average')
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

# (label, value) pairs printed in order; the mean of the target is shown first
# so the error magnitudes can be read against it
evaluation = [
    ('Mean of usd_pledged:\n', round(df.usd_pledged.mean())),
    ('Mean Absolute Error:\n', round(mae)),
    ('Mean Squared Error:\n', round(mse)),
    ('Root Mean Squared Error:\n', round(np.sqrt(mse))),
    ('R^2:\n', round(r2, ndigits=2)),
]
for label, value in evaluation:
    print(label, value)

Mean of usd_pledged:
 12297
Mean Absolute Error:
 16666
Mean Squared Error:
 7057384247
Root Mean Squared Error:
 84008
R^2:
 0.04


**Description:**<br/>
* In general **'R Squared'** tells you how related two things are, like correlation. 'R Squared' is the percentage of variation (i.e. varies from 0 to 1) explained by the relationship between two or more variables. So, it measures how much of variability in the dependent variable can be explained by the model.
* The 'R Squared' for this model - with its maximum numeric feature combination - is very low.
* With a decreasing number of features, the 'R Squared' decreases. E.g. X = df[['usd_goal', 'duration_days', 'duration_days_prep']]

**Conclusion:**<br/>
* With an **'R Squared' of 4%**, only 4% of the variability of the dependent variable can be explained by the model.
* Therefore, the model (Multi-Linear_Regression) with its features is not suitable for predicting the pledged amount in USD. **The model is underfitting.**
* Probably because the numeric variables are mainly categorical in nature.
* Using other regression models like **polynomial regression** might improve the quality of predicting the pledged amount.