In [220]:
from sql_functions import get_dataframe
import pandas as pd
import capstone_functions as cf
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy import interpolate
from sklearn import linear_model
import seaborn as sns

pd.options.mode.chained_assignment = None  # default='warn'

# Variables

In [221]:
# How many mechanics are in the top mechanics
top_XX_mechanic = 25

#schema and table names, we are using in this notebook
schema = 'bgg_data'
main = 'ml_boardgame_stats'
subdomain = 'subdomain'
unique_subdomain = 'unique_subdomain'
kickstarter = 'kickstarter_unique_campaigns'
slug = 'unique_slug_bgg_id'
mechanic = 'mechanics'
unique_mechanics = 'unique_mechanics'
honor = 'honor_clean'

## Create Dataframes for Kickstarter and Mechanic and build df for the top XX mechanics

In [222]:

df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")

#df_marketplace = cf.avg_price_from_marketplace()
#df_honor = get_dataframe(f"SELECT * FROM {schema}.{honor}")

Merge kickstarter table with bgg_ids and rename column bgg_id to id

In [223]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)

create dataframe with all mechanics and merge it with the kickstarter dataframe

In [224]:
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
df_mech = pd.merge(df_ks,df_mech,on='id')

create a list with the top XX mechanics

In [225]:
top_mechanics_list = list(df_mech.groupby(['mechanic']).count().sort_values(ascending=False,axis=0,by='id').reset_index()["mechanic"].head(top_XX_mechanic))

reduce the dataframe such that IDs are unique

In [226]:
df_mech["is_in_top_XX_mechanics"] = df_mech["mechanic"].isin(top_mechanics_list)
df_mech = df_mech[['id','is_in_top_XX_mechanics']]
df_mech = df_mech.groupby('id').sum().reset_index()

Because there are multiple mechanics for one ID, it is possible that for the same ID there are some mechanics in the top and others are not.
- create new column with True/False if ID is in top XX categories (XX will be set in the top code field: "Variables" -> "top_XX_mechanic")

In [227]:
df_mech[f"top_{top_XX_mechanic}_mechanic"] = df_mech.is_in_top_XX_mechanics > 0

In [228]:
df_mech = df_mech[['id',f"top_{top_XX_mechanic}_mechanic"]]

## Create Dataframes for main and subdomain

In [229]:
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

- merge Kickstarter with subdomains => 1021 non-null

In [230]:
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

- merge Kickstarter_subdomains with main => ~330 entries

In [231]:
df_ks_sub_main = pd.merge(df_ks_sub,df_main,on='id')

- Build new columns with goal in USD

In [232]:
df_ks_sub_main["usd_goal"] = df_ks_sub_main['goal']*(df_ks_sub_main.pledged/df_ks_sub_main.usd_pledged)

merge with mechanics dataframe => 326 non-null entries

In [233]:
df_ks_sub_main_mech = pd.merge(df_ks_sub_main,df_mech,on='id')

In [234]:
#df_ks_sub_main_mech.columns

## Machine Learning
- Extract only necessary columns for our ML and drop all null

In [235]:
df_ML = df_ks_sub_main_mech[['country','usd_pledged','subdomain_name','min_players','max_players','min_playtime','max_playtime','min_age','averageweight','usd_goal',f"top_{top_XX_mechanic}_mechanic"]]
df_ML.dropna(inplace=True);

In [236]:
df_ML[f"top_{top_XX_mechanic}_mechanic"] = df_ML[f"top_{top_XX_mechanic}_mechanic"].astype(int)

In [237]:
df_ML["min_playtime"].fillna(df_ML["max_playtime"].median(),inplace=True);
df_ML["max_playtime"].fillna(df_ML["max_playtime"].median(),inplace=True);


In [238]:
df_ML = df_ML[(np.abs(stats.zscore(df_ML['min_playtime'])) < 2.5)]
df_ML = df_ML[(np.abs(stats.zscore(df_ML['max_playtime'])) < 3)]

In [239]:
#df_ML.describe()

## Build dummies for subdomain and countries

In [240]:
subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).groupby(level=0).sum()
subdomain_dummy.columns = subdomain_dummy.columns.str.strip()

In [241]:
country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).groupby(level=0).sum()
country_dummy.columns = country_dummy.columns.str.strip()

In [242]:
df = pd.concat([df_ML,subdomain_dummy], axis=1)
df = df.drop(["subdomain_name"], axis=1)
#df.columns

In [243]:
df = pd.concat([df,country_dummy], axis=1)
df = df.drop(["country"], axis=1)
df.columns

Index(['usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal',
       'top_25_mechanic', 'Children's', 'Customizable', 'Family', 'Party',
       'Strategy', 'Thematic', 'Wargames', 'AU', 'BE', 'CA', 'CH', 'DE', 'ES',
       'FR', 'GB', 'HK', 'IT', 'NL', 'NO', 'SG', 'US'],
      dtype='object')

In [244]:
X = df[['DE', 'GB', 'US',
        "Children's", "Customizable", "Family", "Party", "Strategy", "Thematic", "Wargames", 
        "min_players", "min_playtime", "min_age", "averageweight", 'usd_goal',f"top_{top_XX_mechanic}_mechanic"]]

In [245]:
X = df[["min_players", "min_playtime", "min_age", "averageweight", 'usd_goal',f"top_{top_XX_mechanic}_mechanic"]]
#sns.pairplot(X)

In [246]:
y = df.usd_pledged

In [247]:
X = sm.add_constant(X)
#X.dropna(inplace=True)

#### Investigate the whole dataframe with all columns of interest

In [248]:
# create an OLS model
our_model = sm.OLS(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.068
Model:,OLS,Adj. R-squared:,0.05
Method:,Least Squares,F-statistic:,3.734
Date:,"Thu, 14 Jul 2022",Prob (F-statistic):,0.00133
Time:,12:12:18,Log-Likelihood:,-4384.4
No. Observations:,315,AIC:,8783.0
Df Residuals:,308,BIC:,8809.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.833e+04,1.17e+05,-0.840,0.402,-3.29e+05,1.32e+05
min_players,1.889e+04,2.2e+04,0.859,0.391,-2.44e+04,6.22e+04
min_playtime,-1393.6573,495.177,-2.814,0.005,-2368.014,-419.300
min_age,-863.3791,5199.704,-0.166,0.868,-1.11e+04,9368.058
averageweight,8.363e+04,2.56e+04,3.270,0.001,3.33e+04,1.34e+05
usd_goal,0.1721,0.077,2.234,0.026,0.021,0.324
top_25_mechanic,8.163e+04,9.29e+04,0.879,0.380,-1.01e+05,2.64e+05

0,1,2,3
Omnibus:,329.466,Durbin-Watson:,1.442
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11558.992
Skew:,4.505,Prob(JB):,0.0
Kurtosis:,31.275,Cond. No.,1890000.0


#### ...and pick columns accordingly

In [249]:
#X = df[['averageweight','usd_goal','min_playtime','Party','DE']]
X = df[['averageweight','min_playtime','Party','DE']]
X = sm.add_constant(X)
X.describe()

Unnamed: 0,const,averageweight,min_playtime,Party,DE
count,315.0,315.0,315.0,315.0,315.0
mean,1.0,2.382807,52.326984,0.050794,0.012698
std,0.0,0.778498,36.500298,0.219925,0.112148
min,1.0,1.0,5.0,0.0,0.0
25%,1.0,1.90615,30.0,0.0,0.0
50%,1.0,2.3077,45.0,0.0,0.0
75%,1.0,2.87365,60.0,0.0,0.0
max,1.0,4.6163,180.0,1.0,1.0


In [250]:
# create an OLS model
our_model = sm.OLS(y, X)
#our_model = sm.MANOVA(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.15
Method:,Least Squares,F-statistic:,14.8
Date:,"Thu, 14 Jul 2022",Prob (F-statistic):,4.45e-11
Time:,12:12:19,Log-Likelihood:,-4368.0
No. Observations:,315,AIC:,8746.0
Df Residuals:,310,BIC:,8765.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.397e+04,4.94e+04,-1.902,0.058,-1.91e+05,3249.247
averageweight,1.142e+05,2.23e+04,5.132,0.000,7.04e+04,1.58e+05
min_playtime,-1243.0401,465.123,-2.672,0.008,-2158.238,-327.842
Party,4.089e+05,6.87e+04,5.955,0.000,2.74e+05,5.44e+05
DE,3.113e+05,1.3e+05,2.403,0.017,5.64e+04,5.66e+05

0,1,2,3
Omnibus:,267.727,Durbin-Watson:,1.528
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5559.808
Skew:,3.427,Prob(JB):,0.0
Kurtosis:,22.407,Cond. No.,572.0


In [251]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [252]:
#X = df[['averageweight','min_playtime','Party','DE']]
predicted_pledged_amount = regr.predict([[1, 4, 50, 1, 0 ]])
predicted_pledged_amount



array([709790.90826722])

# Test

In [253]:
main = 'unfiltered_main_stats_cleaned'

In [254]:
df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

In [255]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
#df_mech = pd.merge(df_ks,df_mech,on='id')
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

In [256]:
upcoming_games = [300195, 360676, 366797, 342081, 341870,
                  355113, 358809, 359962, 349779, 312682]

In [257]:
df_upcoming_games = df_main.loc[df_main["id"].isin(upcoming_games), :]
df_upcoming_games.drop(['yearpublished','averageweight'],inplace=True,axis=1)

In [258]:
df_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights
27556,359962,1.0,4.0,60.0,45.0,60.0,8,8.0,1,1,0,2,16,0,0
50534,312682,1.0,5.0,240.0,60.0,240.0,14,8.5625,16,2,0,9,140,9,2
52076,341870,,,,,,0,9.33333,3,2,0,68,454,7,0
52516,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3
67473,342081,1.0,4.0,180.0,60.0,180.0,14,8.0,2,2,0,4,24,1,2
69367,300195,1.0,6.0,180.0,90.0,180.0,14,0.0,0,4,0,3,70,1,0
76040,355113,2.0,4.0,120.0,45.0,120.0,10,8.75,4,1,0,0,4,4,1
79263,358809,2.0,4.0,80.0,40.0,80.0,14,9.33333,3,2,0,0,5,2,0
129886,360676,2.0,2.0,20.0,20.0,20.0,12,0.0,0,1,0,6,55,1,0


In [259]:
df_upcoming_games = pd.merge(df_upcoming_games, df_mech, on='id')

In [260]:
df_upcoming_games = pd.merge(df_upcoming_games, df_sub, on='id')

In [261]:
df_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,mechanic_id,mechanic,subdomain_id,subdomain_name
0,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2040,hand management,4664,Wargames
1,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2046,area movement,4664,Wargames
2,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2857,card play conflict resolution,4664,Wargames
3,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2902,income,4664,Wargames


In [262]:
family_query = f"SELECT * FROM {schema}.family_bgg;"
df_family = get_dataframe(family_query)

df_upcoming = pd.merge(df_main,df_family,on='id')

In [263]:
print(df_upcoming.family_value.unique())

['food / cooking' 'aliens / extraterrestrials' 'anima tactics' ... 'sonix'
 '50 clues – the maria trilogy' 'hero immortal king']


In [282]:
all_upcoming_games = df_upcoming.query("family_type == 'admin' and family_value == 'upcoming releases'")

In [283]:
all_upcoming_games.columns

Index(['id', 'yearpublished', 'min_players', 'max_players', 'playtime',
       'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated',
       'num_owned', 'trading', 'wanting', 'wishing', 'numcomments',
       'numweights', 'averageweight', 'family_type', 'family_value',
       'family_id'],
      dtype='object')

In [284]:
all_upcoming_games.drop(["yearpublished", 'trading', 'numcomments','family_type', 'family_value',
       'family_id','average', 'user_rated'],axis=1,inplace=True)

In [285]:
all_upcoming_games.nunique()

id               5499
min_players         8
max_players        29
playtime           84
min_playtime       43
max_playtime       84
min_age            22
num_owned         399
wanting           106
wishing           346
numweights         32
averageweight      87
dtype: int64

In [274]:
all_upcoming_games.dropna(inplace=True)

In [275]:
all_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,wanting,wishing,numweights
278,344235,3.0,8.0,25.0,15.0,25.0,11,10.00000,1,1,0,0,0
281,344240,3.0,5.0,45.0,20.0,45.0,9,5.05000,10,18,0,1,1
291,344249,2.0,6.0,20.0,20.0,20.0,8,6.66667,3,9,1,1,0
303,344259,1.0,1.0,20.0,5.0,20.0,8,0.00000,0,1,0,1,0
314,344268,2.0,4.0,40.0,20.0,40.0,9,5.43471,66,7,8,31,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208691,218280,1.0,6.0,120.0,30.0,120.0,13,0.00000,0,0,0,4,0
208841,250992,2.0,8.0,30.0,15.0,30.0,11,9.00000,1,1,0,1,0
208894,298086,1.0,6.0,120.0,45.0,120.0,10,7.94552,32,11,5,57,2
208941,298133,1.0,4.0,90.0,60.0,90.0,14,0.00000,0,3,0,0,0
