In [33]:
from sql_functions import get_dataframe
import pandas as pd
import Capstone_functions as cf
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy import interpolate
from sklearn import linear_model
import seaborn as sns

pd.options.mode.chained_assignment = None  # default='warn'

# Variables

In [34]:
# How many mechanics are in the top mechanics
top_XX_mechanic = 25

#schema and table names, we are using in this notebook
schema = 'bgg_data'
main = 'ml_boardgame_stats'
subdomain = 'subdomain'
unique_subdomain = 'unique_subdomain'
kickstarter = 'kickstarter_unique_campaigns'
slug = 'unique_slug_bgg_id'
mechanic = 'mechanics'
unique_mechanics = 'unique_mechanics'
honor = 'honor_clean'

## Create Dataframes for Kickstarter and Mechanic and build df for the top XX mechanics

In [35]:

df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

#df_marketplace = cf.avg_price_from_marketplace()
#df_honor = get_dataframe(f"SELECT * FROM {schema}.{honor}")

Merge kickstarter table with bgg_ids and rename column bgg_id to id

In [36]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)

## Create a list with the top XX mechanics

create dataframe with all mechanics and merge it with the kickstarter dataframe

In [37]:
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
df_mech = pd.merge(df_ks,df_mech,on='id')

In [38]:
top_mechanics_list = list(df_mech.groupby(['mechanic']).count().sort_values(ascending=False,axis=0,by='id').reset_index()["mechanic"].head(top_XX_mechanic))

#### Alternative for top XX mechanics calculation

In [39]:
# top mechanics by bgg rating
df_mech_02 = pd.merge(df_mech,df_u_mech,on='mechanic_id')
df_mech_02 = pd.merge(df_main,df_mech,on='id')

In [40]:
top_rated_mechanics_list = list(df_mech_02.groupby('mechanic').mean('average').sort_values('average',ascending=False).reset_index().mechanic.head(top_XX_mechanic))

### reduce the dataframe such that IDs are unique

choose between two different ways of defining "top mechanics"

In [41]:
#mechanics_list = top_mechanics_list
mechanics_list = top_rated_mechanics_list
print(mechanics_list)

df_mech["is_in_top_XX_mechanics"] = df_mech["mechanic"].isin(mechanics_list)
df_mech = df_mech[['id','is_in_top_XX_mechanics']]
df_mech = df_mech.groupby('id').sum().reset_index()

['predictive bid', 'automatic resource growth', 'pattern movement', 'line of sight', 'increase value of unchosen resources', 'impulse movement', 'hidden movement', 'zone of control', 're-rolling and locking', 'cube tower', 'follow', 'programmed movement', 'worker placement, different worker types', 'investment', 'roles with asymmetric information', 'deck construction', 'command cards', 'worker placement with dice workers', 'flicking', 'movement template', 'scenario / mission / campaign game', 'three dimensional movement', 'measurement movement', 'income', 'bribery']


Because there are multiple mechanics for one ID, it is possible that for the same ID there are some mechanics in the top and others are not.
- create new column with True/False if ID is in top XX categories (XX will be set in the top code field: "Variables" -> "top_XX_mechanic")

In [42]:
df_mech[f"top_{top_XX_mechanic}_mechanic"] = df_mech.is_in_top_XX_mechanics > 0

In [43]:
df_mech = df_mech[['id',f"top_{top_XX_mechanic}_mechanic"]]

- merge Kickstarter with subdomains => 1021 non-null

In [44]:
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

- merge Kickstarter_subdomains with main => ~330 entries

In [45]:
df_ks_sub_main = pd.merge(df_ks_sub,df_main,on='id')

- Build new columns with goal in USD

In [46]:
df_ks_sub_main["usd_goal"] = df_ks_sub_main['goal']*(df_ks_sub_main.pledged/df_ks_sub_main.usd_pledged)

merge with mechanics dataframe => 326 non-null entries

In [47]:
df_ks_sub_main_mech = pd.merge(df_ks_sub_main,df_mech,on='id')

In [48]:
#df_ks_sub_main_mech.columns

## Machine Learning
- Extract only necessary columns for our ML and drop all null

In [49]:
df_ML = df_ks_sub_main_mech[['country','usd_pledged','subdomain_name','min_players','max_players','min_playtime','max_playtime','min_age','averageweight','usd_goal',f"top_{top_XX_mechanic}_mechanic"]]
df_ML.dropna(inplace=True);

In [50]:
df_ML[f"top_{top_XX_mechanic}_mechanic"] = df_ML[f"top_{top_XX_mechanic}_mechanic"].astype(int)

In [51]:
df_ML["min_playtime"].fillna(df_ML["max_playtime"].median(),inplace=True);
df_ML["max_playtime"].fillna(df_ML["max_playtime"].median(),inplace=True);


In [52]:
df_ML = df_ML[(np.abs(stats.zscore(df_ML['min_playtime'])) < 2.5)]
df_ML = df_ML[(np.abs(stats.zscore(df_ML['max_playtime'])) < 3)]

In [53]:
#df_ML.describe()

## Build dummies for subdomain and countries

In [54]:
subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).groupby(level=0).sum()
subdomain_dummy.columns = subdomain_dummy.columns.str.strip()

In [55]:
country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).groupby(level=0).sum()
country_dummy.columns = country_dummy.columns.str.strip()

In [56]:
df = pd.concat([df_ML,subdomain_dummy], axis=1)
df = df.drop(["subdomain_name"], axis=1)
#df.columns

In [57]:
df = pd.concat([df,country_dummy], axis=1)
df = df.drop(["country"], axis=1)
df.columns

Index(['usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal',
       'top_25_mechanic', 'Children's', 'Customizable', 'Family', 'Party',
       'Strategy', 'Thematic', 'Wargames', 'AU', 'BE', 'CA', 'CH', 'DE', 'ES',
       'FR', 'GB', 'HK', 'IT', 'NL', 'NO', 'SG', 'US'],
      dtype='object')

In [58]:
X = df[['DE', 'GB', 'US',
        "Children's", "Customizable", "Family", "Party", "Strategy", "Thematic", "Wargames", 
        "min_players", "min_playtime", "min_age", "averageweight", 'usd_goal',f"top_{top_XX_mechanic}_mechanic", "usd_pledged"]]

In [29]:
#X = df[["min_players", "min_playtime", "min_age", "averageweight", 'usd_goal',f"top_{top_XX_mechanic}_mechanic"]]
#sns.pairplot(X)

In [30]:
y = df.usd_pledged

In [62]:
df2 = X
df2

Unnamed: 0,DE,GB,US,Children's,Customizable,Family,Party,Strategy,Thematic,Wargames,min_players,min_playtime,min_age,averageweight,usd_goal,top_25_mechanic,usd_pledged
0,0,0,0,0,0,0,0,0,0,1,2.0,90.0,10,2.6250,786.797836,0,10105.01
1,0,0,0,0,0,1,0,0,0,0,2.0,15.0,6,1.0385,44415.559001,0,210531.26
2,0,0,1,0,0,0,0,0,0,1,1.0,120.0,12,2.6667,27663.000000,0,63244.00
3,0,0,1,0,0,1,0,0,0,0,2.0,5.0,10,1.6750,4000.000000,1,12058.00
4,0,0,1,0,0,0,0,1,0,0,2.0,60.0,10,2.2534,18446.000000,1,76254.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,0,0,1,0,0,0,0,0,1,0,2.0,20.0,13,1.8500,25000.000000,0,587340.00
325,0,0,1,0,0,0,1,0,0,0,2.0,20.0,13,1.8500,25000.000000,0,587340.00
326,0,0,0,0,0,0,0,1,0,0,2.0,20.0,14,2.2471,31756.549936,0,478870.73
327,0,0,0,0,0,0,0,0,1,0,1.0,45.0,12,3.1111,30876.930287,0,550446.72


In [63]:
col = df2.columns.to_list()
col

['DE',
 'GB',
 'US',
 "Children's",
 'Customizable',
 'Family',
 'Party',
 'Strategy',
 'Thematic',
 'Wargames',
 'min_players',
 'min_playtime',
 'min_age',
 'averageweight',
 'usd_goal',
 'top_25_mechanic',
 'usd_pledged']

In [64]:
DE = [df2["DE"].to_list()]
GB = [df2["GB"].to_list()]
US = [df2["US"].to_list()]
chi = [df2["Children's"].to_list()]
cust = [df2["Customizable"].to_list()]
fam = [df2["Family"].to_list()]
party = [df2["Party"].to_list()]
strat = [df2["Strategy"].to_list()]
them = [df2["Thematic"].to_list()]
war = [df2["Wargames"].to_list()]
min_players = [df2["min_players"].to_list()]
min_playtime = [df2["min_playtime"].to_list()]
min_age = [df2["min_age"].to_list()]
avgwei = [df2["averageweight"].to_list()]
usd_goal = [df2["usd_goal"].to_list()]
mech = [df2["top_25_mechanic"].to_list()]
usd_pledged = [df2["usd_pledged"].to_list()]

In [65]:
dic = {
col[0]: DE,
col[1]: GB,
col[2]: US,
col[3]: chi,
col[4]: cust,
col[5]: fam,
col[6]: party,
col[7]: strat,
col[8]: them,
col[9]: war,
col[10]: min_players,
col[11]: min_playtime,
col[12]: min_age,
col[13]: avgwei,
col[14]: usd_goal,
col[15]: mech,
col[16]: usd_pledged,
}

In [66]:
df_lin_re = pd.DataFrame(dic)
df_lin_re

Unnamed: 0,DE,GB,US,Children's,Customizable,Family,Party,Strategy,Thematic,Wargames,min_players,min_playtime,min_age,averageweight,usd_goal,top_25_mechanic,usd_pledged
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ...","[0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, ...","[2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, ...","[90.0, 15.0, 120.0, 5.0, 60.0, 10.0, 60.0, 30....","[10, 6, 12, 10, 10, 10, 8, 0, 10, 12, 10, 14, ...","[2.625, 1.0385, 2.6667, 1.675, 2.2534, 2.0714,...","[786.7978359249521, 44415.55900059686, 27663.0...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[10105.01, 210531.26, 63244.0, 12058.0, 76254...."


In [67]:
df_lin_re.to_csv("/Users/jannikduda/neuefische/Daily-Lama-Capstone/data/bgg_lin_re2.csv")

In [None]:
#X = sm.add_constant(X)
#X.dropna(inplace=True)

#### Investigate the whole dataframe with all columns of interest

In [None]:
# create an OLS model
our_model = sm.OLS(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared (uncentered):,0.384
Model:,OLS,Adj. R-squared (uncentered):,0.351
Method:,Least Squares,F-statistic:,11.66
Date:,"Fri, 15 Jul 2022",Prob (F-statistic):,1.7199999999999998e-23
Time:,14:22:59,Log-Likelihood:,-4353.8
No. Observations:,315,AIC:,8740.0
Df Residuals:,299,BIC:,8800.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DE,3.544e+05,1.33e+05,2.663,0.008,9.25e+04,6.16e+05
GB,1326.1065,5.32e+04,0.025,0.980,-1.03e+05,1.06e+05
US,2.136e+04,3.63e+04,0.589,0.556,-5e+04,9.27e+04
Children's,-7.769e+04,1.03e+05,-0.753,0.452,-2.81e+05,1.25e+05
Customizable,-8.066e+04,2.56e+05,-0.315,0.753,-5.84e+05,4.23e+05
Family,1.31e+04,5.15e+04,0.254,0.800,-8.83e+04,1.14e+05
Party,4.153e+05,8.2e+04,5.062,0.000,2.54e+05,5.77e+05
Strategy,-9862.6789,6.03e+04,-0.164,0.870,-1.29e+05,1.09e+05
Thematic,5.522e+04,6.27e+04,0.880,0.379,-6.82e+04,1.79e+05

0,1,2,3
Omnibus:,278.352,Durbin-Watson:,1.547
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6807.499
Skew:,3.564,Prob(JB):,0.0
Kurtosis:,24.63,Cond. No.,3770000.0


## Pick the most significant input parameters

In [None]:
params = ['averageweight', 
        'min_playtime', 
        'min_players', 
        'min_age', 
        f"top_{top_XX_mechanic}_mechanic", 
        'Party', 
        'Thematic', 
        "Children's", 
        'DE',
        'usd_goal']

In [None]:
X = df[params]

In [686]:
#X = sm.add_constant(X)
#X.describe()

In [687]:
# create an OLS model
our_model = sm.OLS(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary()

0,1,2,3
Dep. Variable:,usd_pledged,R-squared (uncentered):,0.374
Model:,OLS,Adj. R-squared (uncentered):,0.353
Method:,Least Squares,F-statistic:,18.19
Date:,"Fri, 15 Jul 2022",Prob (F-statistic):,5.21e-26
Time:,14:23:01,Log-Likelihood:,-4356.5
No. Observations:,315,AIC:,8733.0
Df Residuals:,305,BIC:,8771.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
averageweight,8.185e+04,2.16e+04,3.793,0.000,3.94e+04,1.24e+05
min_playtime,-1025.0151,456.851,-2.244,0.026,-1923.993,-126.037
min_players,-1.456e+04,1.77e+04,-0.823,0.411,-4.93e+04,2.02e+04
min_age,-3303.7160,4369.513,-0.756,0.450,-1.19e+04,5294.492
top_25_mechanic,1.667e+05,4.81e+04,3.469,0.001,7.22e+04,2.61e+05
Party,4.207e+05,7.21e+04,5.836,0.000,2.79e+05,5.63e+05
Thematic,8.116e+04,4.13e+04,1.965,0.050,-127.306,1.62e+05
Children's,-7.205e+04,9.9e+04,-0.728,0.467,-2.67e+05,1.23e+05
DE,3.3e+05,1.29e+05,2.561,0.011,7.65e+04,5.83e+05

0,1,2,3
Omnibus:,275.707,Durbin-Watson:,1.548
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6611.64
Skew:,3.519,Prob(JB):,0.0
Kurtosis:,24.312,Cond. No.,1940000.0


In [688]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [689]:
params

['averageweight',
 'min_playtime',
 'min_players',
 'min_age',
 'top_25_mechanic',
 'Party',
 'Thematic',
 "Children's",
 'DE',
 'usd_goal']

In [694]:
test_params = [1,20,2,8,1,1,1,1,1,100000]
predicted_pledged_amount = regr.predict([test_params])
predicted_pledged_amount



array([944635.73480226])

# Test

In [253]:
main = 'unfiltered_main_stats_cleaned'

In [254]:
df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

In [255]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
#df_mech = pd.merge(df_ks,df_mech,on='id')
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

In [256]:
upcoming_games = [300195, 360676, 366797, 342081, 341870,
                  355113, 358809, 359962, 349779, 312682]

In [257]:
df_upcoming_games = df_main.loc[df_main["id"].isin(upcoming_games), :]
df_upcoming_games.drop(['yearpublished','averageweight'],inplace=True,axis=1)

In [258]:
df_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights
27556,359962,1.0,4.0,60.0,45.0,60.0,8,8.0,1,1,0,2,16,0,0
50534,312682,1.0,5.0,240.0,60.0,240.0,14,8.5625,16,2,0,9,140,9,2
52076,341870,,,,,,0,9.33333,3,2,0,68,454,7,0
52516,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3
67473,342081,1.0,4.0,180.0,60.0,180.0,14,8.0,2,2,0,4,24,1,2
69367,300195,1.0,6.0,180.0,90.0,180.0,14,0.0,0,4,0,3,70,1,0
76040,355113,2.0,4.0,120.0,45.0,120.0,10,8.75,4,1,0,0,4,4,1
79263,358809,2.0,4.0,80.0,40.0,80.0,14,9.33333,3,2,0,0,5,2,0
129886,360676,2.0,2.0,20.0,20.0,20.0,12,0.0,0,1,0,6,55,1,0


In [259]:
df_upcoming_games = pd.merge(df_upcoming_games, df_mech, on='id')

In [260]:
df_upcoming_games = pd.merge(df_upcoming_games, df_sub, on='id')

In [261]:
df_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,mechanic_id,mechanic,subdomain_id,subdomain_name
0,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2040,hand management,4664,Wargames
1,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2046,area movement,4664,Wargames
2,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2857,card play conflict resolution,4664,Wargames
3,349779,1.0,2.0,180.0,150.0,180.0,14,7.35,4,0,0,2,19,2,3,2902,income,4664,Wargames


In [262]:
family_query = f"SELECT * FROM {schema}.family_bgg;"
df_family = get_dataframe(family_query)

df_upcoming = pd.merge(df_main,df_family,on='id')

In [263]:
print(df_upcoming.family_value.unique())

['food / cooking' 'aliens / extraterrestrials' 'anima tactics' ... 'sonix'
 '50 clues – the maria trilogy' 'hero immortal king']


In [282]:
all_upcoming_games = df_upcoming.query("family_type == 'admin' and family_value == 'upcoming releases'")

In [283]:
all_upcoming_games.columns

Index(['id', 'yearpublished', 'min_players', 'max_players', 'playtime',
       'min_playtime', 'max_playtime', 'min_age', 'average', 'user_rated',
       'num_owned', 'trading', 'wanting', 'wishing', 'numcomments',
       'numweights', 'averageweight', 'family_type', 'family_value',
       'family_id'],
      dtype='object')

In [284]:
all_upcoming_games.drop(["yearpublished", 'trading', 'numcomments','family_type', 'family_value',
       'family_id','average', 'user_rated'],axis=1,inplace=True)

In [285]:
all_upcoming_games.nunique()

id               5499
min_players         8
max_players        29
playtime           84
min_playtime       43
max_playtime       84
min_age            22
num_owned         399
wanting           106
wishing           346
numweights         32
averageweight      87
dtype: int64

In [274]:
all_upcoming_games.dropna(inplace=True)

In [275]:
all_upcoming_games

Unnamed: 0,id,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,wanting,wishing,numweights
278,344235,3.0,8.0,25.0,15.0,25.0,11,10.00000,1,1,0,0,0
281,344240,3.0,5.0,45.0,20.0,45.0,9,5.05000,10,18,0,1,1
291,344249,2.0,6.0,20.0,20.0,20.0,8,6.66667,3,9,1,1,0
303,344259,1.0,1.0,20.0,5.0,20.0,8,0.00000,0,1,0,1,0
314,344268,2.0,4.0,40.0,20.0,40.0,9,5.43471,66,7,8,31,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208691,218280,1.0,6.0,120.0,30.0,120.0,13,0.00000,0,0,0,4,0
208841,250992,2.0,8.0,30.0,15.0,30.0,11,9.00000,1,1,0,1,0
208894,298086,1.0,6.0,120.0,45.0,120.0,10,7.94552,32,11,5,57,2
208941,298133,1.0,4.0,90.0,60.0,90.0,14,0.00000,0,3,0,0,0
