In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sql_functions as sf
import capstone_functions as cf
import itertools

# this so called "line magic" command, amongst other things, stores the plots in the notebook document.
%matplotlib inline

# warnings suppression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

### Variables

In [2]:
# Names of the database schema and of the two tables that the queries below read from.
schema = "bgg_data"
ks_table = "clean_ks_with_bgg_id"
main_stats_table = "unfiltered_main_stats_cleaned"
# NOTE(review): `engine` is not referenced by any other cell visible here
# (sf.get_dataframe is called with the query string only) — confirm it is still needed.
engine = sf.get_engine()

### Fetch the data from DB and merge into one Dataframe

In [3]:
# Select every column of both tables; schema and table names come from the variables cell.
ks_query = f"SELECT * from {schema}.{ks_table};"
main_query = f"SELECT * FROM {schema}.{main_stats_table};"

In [4]:
# Load both tables. The Kickstarter frame's 'bgg_id' column is renamed to 'id',
# which is the key the merge in the next cell joins on.
df_ks = sf.get_dataframe(ks_query).rename(columns={'bgg_id': 'id'})
df_main = sf.get_dataframe(main_query)

In [5]:
# Keep only the games that appear in both tables (inner join on the shared 'id' column).
df = df_ks.merge(df_main, on='id', how='inner')

- Drop all non-numerical columns (including timestamps)  
- Afterwards use the z-score to remove outliers in specific columns (e.g. min_playtime)

In [6]:
# Columns excluded from the analysis.
dropped_columns = [
    'yearpublished', 'deadline', 'created_at', 'playtime', 'launched_at',
    'state_changed_at', 'successful', 'country', 'currency', 'averageweight',
]
df = df.drop(columns=dropped_columns)

In [7]:
# Discard every row that still contains at least one missing value.
df = df.dropna(axis='index', how='any')

#### Choose the independent variable (Predictor)

In [8]:

# Per-column |z-score| limits, applied one after another: each z-score is computed
# on the rows that survived the previous filters, so the order matters.
zscore_limits = {
    'min_playtime': 2,
    'max_playtime': 2,
    'max_players': 2,
    'min_players': 3,
}
for column, limit in zscore_limits.items():
    df = df[np.abs(stats.zscore(df[column])) < limit]

# Hard cap on the minimum recommended age.
df = df.loc[df["min_age"] < 19]

In [9]:
# Candidate predictors, selected purely by position:
# remove the column at position 4, then discard the first three columns.
# NOTE(review): this depends on the column order of the merged frame — confirm
# which columns sit at positions 0-2 and 4.
columns_list = list(df.columns)
del columns_list[4]
del columns_list[:3]

In [10]:
# Growing prefixes of columns_list: the first column, the first two columns, ...,
# up to the complete list. Every entry is an independent list.
columns_combinations = [
    columns_list[:size] for size in range(1, len(columns_list) + 1)
]
print(columns_combinations)

[['backers_count'], ['backers_count', 'min_players'], ['backers_count', 'min_players', 'max_players'], ['backers_count', 'min_players', 'max_players', 'min_playtime'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age', 'average'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated', 'num_owned'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated', 'num_owned', 'trading'], ['backers_count', 'min_players', 'max_players', 'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated', 'num_owned', 'trading', 'wanting'], ['backers_count'

In [11]:
# Fit one OLS model per predictor set (always together with `goal`) and print its R².
for comb in columns_combinations:
    formula = " + ".join(["usd_pledged ~ goal", *comb])
    rs = smf.ols(formula=formula, data=df).fit().rsquared
    print(f'{formula}:   {rs:.3}')


usd_pledged ~ goal + backers_count:   0.697
usd_pledged ~ goal + backers_count + min_players:   0.699
usd_pledged ~ goal + backers_count + min_players + max_players:   0.699
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime:   0.701
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime:   0.702
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime + min_age:   0.703
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime + min_age + average:   0.703
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime + min_age + average + user_rated:   0.703
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime + min_age + average + user_rated + num_owned:   0.706
usd_pledged ~ goal + backers_count + min_players + max_players + min_playtime + max_playtime + min_age + average + user_r

In [36]:
# Fit a reduced model with two predictors. Note that `rsqu` holds the full summary
# table (not a single R² value); it is displayed by the next cell.
formul = "usd_pledged ~ min_players + min_playtime"
rsqu = smf.ols(formula=formul, data=df).fit().summary()

In [37]:
# Display the OLS summary stored in `rsqu` by the previous cell.
rsqu

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.034
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,72.48
Date:,"Mon, 11 Jul 2022",Prob (F-statistic):,1.17e-31
Time:,22:59:41,Log-Likelihood:,-59505.0
No. Observations:,4094,AIC:,119000.0
Df Residuals:,4091,BIC:,119000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.985e+05,2.61e+04,7.596,0.000,1.47e+05,2.5e+05
min_players,-8.623e+04,1.13e+04,-7.616,0.000,-1.08e+05,-6.4e+04
min_playtime,2300.4956,285.405,8.060,0.000,1740.947,2860.044

0,1,2,3
Omnibus:,6919.19,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6320443.388
Skew:,11.447,Prob(JB):,0.0
Kurtosis:,194.122,Cond. No.,167.0


In [None]:
# Predictor matrix for the matrix-notation OLS cells below. It uses the same two
# predictors as the formula-based model fitted above; adjust the list to explore others.
X = df[["min_players", "min_playtime"]]

# Pairwise scatter plots / distributions of the chosen predictors.
sns.pairplot(X);

- Define dependent variable (since it's always a single variable, we can use dot notation here)
- Since our statsmodels OLS model needs an adjustable intercept, we add a column of 1s to `X`:

In [None]:
# Design matrix with the constant column added, and the target taken from the frame.
X = sm.add_constant(X)
y = df.usd_pledged

display(X.head())

Now, our data is prepared for modelling!  

The following steps are:
- creating a model based on the module's notation,
- fitting the model to our data (this is the step where the sum of squared residuals is minimized) and storing the fitted model in a result variable
- printing a summary with the model's statistics

In [None]:
# Prepare data for modeling
# X is passed through add_constant again here; the previous cell already did this.
# NOTE(review): the second call looks redundant — confirm and drop it if so.
X = sm.add_constant(X)

# fit the model and get the model summary in one step
sm.OLS(y, X).fit().summary()

In [None]:
# Keep the fitted model in a variable so its parameters can be inspected.
model_results = sm.OLS(y, X).fit()

# Print the intercept and the slope(s) of the model by addressing the parameters via
# ".params" (the same values appear in the "coef" column of the summary table).
# Label-based access works for any number of predictors.
# Assumes the constant column is labelled "const" — TODO confirm against X.columns.
params = model_results.params
intercept = params["const"]
slopes = params.drop("const")

print(f'intercept: {intercept}')
for name, slope in slopes.items():
    print(f'slope ({name}): {slope}')