In [55]:
from sql_functions import get_dataframe
import pandas as pd
import capstone_functions as cf
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy import interpolate
from sklearn import linear_model

# Variables

In [56]:
top_XX_mechanic = 10

schema = "bgg_data"
main = "ml_boardgame_stats"
subdomain = "subdomain"
unique_subdomain = "unique_subdomain"
kickstarter = "kickstarter_unique_campaigns"
slug = "unique_slug_bgg_id"
mechanic = "mechanics"
unique_mechanics = "unique_mechanics"


#honor = "honor_clean"



## Create Dataframes for Kickstarter and Mechanic

In [57]:

df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")

#df_marketplace = cf.avg_price_from_marketplace()
#df_honor = get_dataframe(f"SELECT * FROM {schema}.{honor}")

Merge kickstarter table with bgg_ids and rename column bgg_id to id

In [58]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)

create dataframe with all mechanics and merge it with the kickstarter dataframe

In [59]:
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
df_mech = pd.merge(df_ks,df_mech,on='id')

create a list with the top XX mechanics

In [60]:
top_mechanics_list = list(df_mech.groupby(['mechanic']).count().sort_values(ascending=False,axis=0,by='id').reset_index()["mechanic"].head(top_XX_mechanic))

reduce the dataframe such that IDs are unique

In [61]:
df_mech["is_in_top_XX_mechanics"] = df_mech["mechanic"].isin(top_mechanics_list)
df_mech = df_mech[['id','is_in_top_XX_mechanics']]
df_mech = df_mech.groupby('id').sum().reset_index()

Because there are multiple mechanics for one ID, it is possible that for the same ID there are some mechanics in the top and others are not.
- create new column with True/False if ID is in top XX categories (XX will be set in the top code field: "Variables" -> "top_XX_mechanic")

In [62]:
df_mech[f"top_{top_XX_mechanic}_mechanic"] = df_mech.is_in_top_XX_mechanics > 0

In [63]:
df_mech = df_mech[['id',f"top_{top_XX_mechanic}_mechanic"]]

## Create Dataframes for main and subdomain

In [64]:
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

- merge Kickstarter with subdomains => 1021 non-null

In [65]:
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

- merge Kickstarter_subdomains with main

In [66]:
df_ks_sub_main = pd.merge(df_ks_sub,df_main,on='id')

- Build new columns with goal in USD

In [67]:
df_ks_sub_main["usd_goal"] = df_ks_sub_main['goal']*(df_ks_sub_main.pledged/df_ks_sub_main.usd_pledged)

# Machine Learning
- Extract only necessary columns for our ML and drop all null

In [68]:
df_ML = df_ks_sub_main[['country','usd_pledged','subdomain_name','min_players','max_players','min_playtime','max_playtime','min_age','averageweight','usd_goal']]
#df_ML.dropna(inplace=True);

In [69]:
df_ML["min_playtime"].fillna(0,inplace=True);
df_ML["max_playtime"].fillna(df_ML["max_playtime"].max(),inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ML["min_playtime"].fillna(0,inplace=True);
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ML["max_playtime"].fillna(df_ML["max_playtime"].max(),inplace=True);


In [70]:
df_ML = df_ML[(np.abs(stats.zscore(df_ML['min_playtime'])) < 2.5)]
df_ML = df_ML[(np.abs(stats.zscore(df_ML['max_playtime'])) < 3)]

In [71]:
df_ML.describe()

Unnamed: 0,usd_pledged,min_players,max_players,min_playtime,max_playtime,min_age,averageweight,usd_goal
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,137399.6,1.786834,4.711599,52.047022,88.573668,10.880878,2.37685,36890.98
std,277044.0,0.717054,5.88182,36.361752,74.854702,3.26477,0.776606,202487.0
min,40.49,1.0,1.0,5.0,10.0,0.0,1.0,4.569029
25%,18529.0,1.0,2.0,30.0,40.0,10.0,1.9,5000.0
50%,43668.01,2.0,4.0,45.0,60.0,12.0,2.3,12400.45
75%,103764.0,2.0,6.0,60.0,120.0,13.0,2.8572,25000.0
max,2559458.0,7.0,100.0,180.0,480.0,17.0,4.6163,2513552.0


In [72]:
df_ML.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319 entries, 0 to 335
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         319 non-null    object 
 1   usd_pledged     319 non-null    float64
 2   subdomain_name  319 non-null    object 
 3   min_players     319 non-null    float64
 4   max_players     319 non-null    float64
 5   min_playtime    319 non-null    float64
 6   max_playtime    319 non-null    float64
 7   min_age         319 non-null    int64  
 8   averageweight   319 non-null    float64
 9   usd_goal        319 non-null    float64
dtypes: float64(7), int64(1), object(2)
memory usage: 27.4+ KB


In [73]:
subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).sum(level=0)
subdomain_dummy.columns = subdomain_dummy.columns.str.strip()

  subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).sum(level=0)


In [74]:
country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).sum(level=0)
country_dummy.columns = country_dummy.columns.str.strip()

  country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).sum(level=0)


In [75]:
df = pd.concat([df_ML,subdomain_dummy], axis=1)
df = df.drop(["subdomain_name"], axis=1)
df.columns

Index(['country', 'usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal', 'Children's',
       'Customizable', 'Family', 'Party', 'Strategy', 'Thematic', 'Wargames'],
      dtype='object')

In [76]:
df = pd.concat([df,country_dummy], axis=1)
df = df.drop(["country"], axis=1)
df.columns

Index(['usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal', 'Children's',
       'Customizable', 'Family', 'Party', 'Strategy', 'Thematic', 'Wargames',
       'AU', 'BE', 'CA', 'CH', 'DE', 'ES', 'FR', 'GB', 'HK', 'IT', 'NL', 'NO',
       'SG', 'US'],
      dtype='object')

In [77]:
X = df[['AU', 'BE', 'CA', 'CH', 'DE', 'ES', 'FR', 'GB', 'HK', 'IT', 'NL', 'NO', 'SG', 'US',
        "Children's", "Customizable", "Family", "Party", "Strategy", "Thematic", "Wargames", 
        "min_players", "max_players", "min_playtime", "min_age", "averageweight", 'usd_goal']]

In [78]:
y = df.usd_pledged

In [79]:
#X = sm.add_constant(X)

In [80]:
# create an OLS model
our_model = sm.OLS(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared (uncentered):,0.376
Model:,OLS,Adj. R-squared (uncentered):,0.318
Method:,Least Squares,F-statistic:,6.514
Date:,"Fri, 15 Jul 2022",Prob (F-statistic):,8.44e-18
Time:,11:08:08,Log-Likelihood:,-4409.8
No. Observations:,319,AIC:,8874.0
Df Residuals:,292,BIC:,8975.0
Df Model:,27,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AU,-2.451e+04,1.34e+05,-0.183,0.855,-2.89e+05,2.4e+05
BE,1.949e+05,1.55e+05,1.261,0.208,-1.09e+05,4.99e+05
CA,1.599e+05,1.23e+05,1.296,0.196,-8.29e+04,4.03e+05
CH,-8.799e+04,1.68e+05,-0.523,0.602,-4.19e+05,2.43e+05
DE,3.833e+05,1.6e+05,2.393,0.017,6.8e+04,6.98e+05
ES,-1.952e+04,1.45e+05,-0.135,0.893,-3.05e+05,2.66e+05
FR,-2.104e+04,1.18e+05,-0.179,0.858,-2.52e+05,2.1e+05
GB,1.756e+04,9.32e+04,0.188,0.851,-1.66e+05,2.01e+05
HK,-1.89e+05,2.12e+05,-0.890,0.374,-6.07e+05,2.29e+05

0,1,2,3
Omnibus:,282.612,Durbin-Watson:,1.553
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6774.228
Skew:,3.598,Prob(JB):,0.0
Kurtosis:,24.398,Cond. No.,5320000.0


# Pick the most significant input parameters

In [81]:
X = df[['averageweight','usd_goal','min_playtime','Party','DE','min_players','min_age']]
X = sm.add_constant(X)
X.describe()

Unnamed: 0,const,averageweight,usd_goal,min_playtime,Party,DE
count,319.0,319.0,319.0,319.0,319.0,319.0
mean,1.0,2.37685,36890.98,52.047022,0.053292,0.012539
std,0.0,0.776606,202487.0,36.361752,0.224967,0.111449
min,1.0,1.0,4.569029,5.0,0.0,0.0
25%,1.0,1.9,5000.0,30.0,0.0,0.0
50%,1.0,2.3,12400.45,45.0,0.0,0.0
75%,1.0,2.8572,25000.0,60.0,0.0,0.0
max,1.0,4.6163,2513552.0,180.0,1.0,1.0


In [84]:
# create an OLS model
our_model = sm.OLS(y, X)
#our_model = sm.MANOVA(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.173
Model:,OLS,Adj. R-squared:,0.16
Method:,Least Squares,F-statistic:,13.13
Date:,"Tue, 12 Jul 2022",Prob (F-statistic):,1.28e-11
Time:,19:52:24,Log-Likelihood:,-4419.5
No. Observations:,319,AIC:,8851.0
Df Residuals:,313,BIC:,8873.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.058e+04,4.92e+04,-1.637,0.103,-1.77e+05,1.63e+04
averageweight,1.052e+05,2.24e+04,4.693,0.000,6.11e+04,1.49e+05
usd_goal,0.1524,0.072,2.120,0.035,0.011,0.294
min_playtime,-1208.8459,459.565,-2.630,0.009,-2113.073,-304.619
Party,3.985e+05,6.61e+04,6.025,0.000,2.68e+05,5.29e+05
DE,3.158e+05,1.28e+05,2.464,0.014,6.36e+04,5.68e+05

0,1,2,3
Omnibus:,278.83,Durbin-Watson:,1.526
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6227.891
Skew:,3.555,Prob(JB):,0.0
Kurtosis:,23.445,Cond. No.,1850000.0


In [78]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [80]:
# X = df[['averageweight','usd_goal','min_playtime','Party', 'DE']]
predicted_pledged_amount = regr.predict([[4, 50000, 20, 1, 0]])
predicted_pledged_amount



array([722197.26737772])