In [4]:
from sql_functions import get_dataframe
import pandas as pd
import Capstone_functions as cf
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy import interpolate
from sklearn import linear_model

# Variables

In [5]:
top_XX_mechanic = 25

schema = "bgg_data"
main = "ml_boardgame_stats"
subdomain = "subdomain"
unique_subdomain = "unique_subdomain"
kickstarter = "kickstarter_unique_campaigns"
slug = "unique_slug_bgg_id"
mechanic = "mechanics"
unique_mechanics = "unique_mechanics"


#honor = "honor_clean"



## Create Dataframes for Kickstarter and Mechanic

In [6]:

df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{unique_mechanics}")

#df_marketplace = cf.avg_price_from_marketplace()
#df_honor = get_dataframe(f"SELECT * FROM {schema}.{honor}")

Merge kickstarter table with bgg_ids and rename column bgg_id to id

In [7]:
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)

create dataframe with all mechanics and merge it with the kickstarter dataframe

In [8]:
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')
df_mech = pd.merge(df_ks,df_mech,on='id')

create a list with the top XX mechanics

In [9]:
top_mechanics_list = list(df_mech.groupby(['mechanic']).count().sort_values(ascending=False,axis=0,by='id').reset_index()["mechanic"].head(top_XX_mechanic))

reduce the dataframe such that IDs are unique

In [10]:
df_mech["is_in_top_XX_mechanics"] = df_mech["mechanic"].isin(top_mechanics_list)
df_mech = df_mech[['id','is_in_top_XX_mechanics']]
df_mech = df_mech.groupby('id').sum().reset_index()

Because there are multiple mechanics for one ID, it is possible that for the same ID there are some mechanics in the top and others are not.
- create new column with True/False if ID is in top XX categories (XX will be set in the top code field: "Variables" -> "top_XX_mechanic")

In [11]:
df_mech[f"top_{top_XX_mechanic}_mechanic"] = df_mech.is_in_top_XX_mechanics > 0

In [12]:
df_mech = df_mech[['id',f"top_{top_XX_mechanic}_mechanic"]]

## Create Dataframes for main and subdomain

In [13]:
df_main = get_dataframe(f"SELECT * FROM {schema}.{main}")
df_sub = get_dataframe(f"SELECT * FROM {schema}.{subdomain}")
df_u_sub = get_dataframe(f"SELECT * FROM {schema}.{unique_subdomain}")

- merge Kickstarter with subdomains => 1021 non-null

In [14]:
df_sub = pd.merge(df_sub,df_u_sub,on="subdomain_id")
df_ks_sub = pd.merge(df_ks,df_sub,on='id')

- merge Kickstarter_subdomains with main

In [15]:
df_ks_sub_main = pd.merge(df_ks_sub,df_main,on='id')

- Build new columns with goal in USD

In [16]:
df_ks_sub_main["usd_goal"] = df_ks_sub_main['goal']*(df_ks_sub_main.pledged/df_ks_sub_main.usd_pledged)

# Machine Learning
- Extract only necessary columns for our ML and drop all null

In [17]:
df_ML = df_ks_sub_main[['country','usd_pledged','subdomain_name','min_players','max_players','min_playtime','max_playtime','min_age','averageweight','usd_goal']]
#df_ML.dropna(inplace=True);

In [18]:
df_ML["min_playtime"].fillna(0,inplace=True);
df_ML["max_playtime"].fillna(df_ML["max_playtime"].max(),inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ML["min_playtime"].fillna(0,inplace=True);
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ML["max_playtime"].fillna(df_ML["max_playtime"].max(),inplace=True);


In [19]:
df_ML = df_ML[(np.abs(stats.zscore(df_ML['min_playtime'])) < 2.5)]
df_ML = df_ML[(np.abs(stats.zscore(df_ML['max_playtime'])) < 3)]

In [20]:
df_ML.describe()

Unnamed: 0,usd_pledged,min_players,max_players,min_playtime,max_playtime,min_age,averageweight,usd_goal
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,137399.6,1.786834,4.711599,52.047022,88.573668,10.880878,2.37685,36890.98
std,277044.0,0.717054,5.88182,36.361752,74.854702,3.26477,0.776606,202487.0
min,40.49,1.0,1.0,5.0,10.0,0.0,1.0,4.569029
25%,18529.0,1.0,2.0,30.0,40.0,10.0,1.9,5000.0
50%,43668.01,2.0,4.0,45.0,60.0,12.0,2.3,12400.45
75%,103764.0,2.0,6.0,60.0,120.0,13.0,2.8572,25000.0
max,2559458.0,7.0,100.0,180.0,480.0,17.0,4.6163,2513552.0


In [21]:
df_ML.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319 entries, 0 to 335
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         319 non-null    object 
 1   usd_pledged     319 non-null    float64
 2   subdomain_name  319 non-null    object 
 3   min_players     319 non-null    float64
 4   max_players     319 non-null    float64
 5   min_playtime    319 non-null    float64
 6   max_playtime    319 non-null    float64
 7   min_age         319 non-null    int64  
 8   averageweight   319 non-null    float64
 9   usd_goal        319 non-null    float64
dtypes: float64(7), int64(1), object(2)
memory usage: 27.4+ KB


In [22]:
subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).sum(level=0)
subdomain_dummy.columns = subdomain_dummy.columns.str.strip()

  subdomain_dummy = pd.get_dummies(df_ML.subdomain_name.apply(pd.Series).stack(), drop_first=True).sum(level=0)


In [23]:
country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).sum(level=0)
country_dummy.columns = country_dummy.columns.str.strip()

  country_dummy = pd.get_dummies(df_ML.country.apply(pd.Series).stack(), drop_first=True).sum(level=0)


In [24]:
df = pd.concat([df_ML,subdomain_dummy], axis=1)
df = df.drop(["subdomain_name"], axis=1)
df.columns

Index(['country', 'usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal', 'Children's',
       'Customizable', 'Family', 'Party', 'Strategy', 'Thematic', 'Wargames'],
      dtype='object')

In [25]:
df = pd.concat([df,country_dummy], axis=1)
df = df.drop(["country"], axis=1)
df.columns

Index(['usd_pledged', 'min_players', 'max_players', 'min_playtime',
       'max_playtime', 'min_age', 'averageweight', 'usd_goal', 'Children's',
       'Customizable', 'Family', 'Party', 'Strategy', 'Thematic', 'Wargames',
       'AU', 'BE', 'CA', 'CH', 'DE', 'ES', 'FR', 'GB', 'HK', 'IT', 'NL', 'NO',
       'SG', 'US'],
      dtype='object')

In [26]:
X = df[['AU', 'BE', 'CA', 'CH', 'DE', 'ES', 'FR', 'GB', 'HK', 'IT', 'NL', 'NO', 'SG', 'US',
        "Children's", "Customizable", "Family", "Party", "Strategy", "Thematic", "Wargames", 
        "min_players", "max_players", "min_playtime", "min_age", "averageweight", 'usd_goal']]

In [27]:
y = df.usd_pledged

In [36]:
df2 = df[['DE', "Party", "min_playtime", "averageweight", 'usd_goal', "usd_pledged"]]

In [28]:
X = sm.add_constant(X)

In [29]:
# create an OLS model
our_model = sm.OLS(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.222
Model:,OLS,Adj. R-squared:,0.15
Method:,Least Squares,F-statistic:,3.084
Date:,"Wed, 13 Jul 2022",Prob (F-statistic):,1.36e-06
Time:,12:01:12,Log-Likelihood:,-4409.7
No. Observations:,319,AIC:,8875.0
Df Residuals:,291,BIC:,8981.0
Df Model:,27,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.687e+04,2.06e+05,-0.471,0.638,-5.01e+05,3.08e+05
AU,5.406e+04,2.14e+05,0.252,0.801,-3.67e+05,4.75e+05
BE,2.713e+05,2.24e+05,1.211,0.227,-1.7e+05,7.12e+05
CA,2.382e+05,2.07e+05,1.150,0.251,-1.69e+05,6.46e+05
CH,-9731.5544,2.37e+05,-0.041,0.967,-4.75e+05,4.56e+05
DE,4.584e+05,2.26e+05,2.027,0.044,1.33e+04,9.04e+05
ES,5.702e+04,2.18e+05,0.262,0.794,-3.72e+05,4.86e+05
FR,5.663e+04,2.03e+05,0.280,0.780,-3.42e+05,4.55e+05
GB,9.539e+04,1.9e+05,0.503,0.615,-2.78e+05,4.69e+05

0,1,2,3
Omnibus:,282.973,Durbin-Watson:,1.555
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6835.612
Skew:,3.601,Prob(JB):,0.0
Kurtosis:,24.504,Cond. No.,10500000.0


In [54]:
X = df[['averageweight','usd_goal','min_playtime','Party','DE']]
X = sm.add_constant(X)
X.describe()

Unnamed: 0,const,averageweight,usd_goal,min_playtime,Party,DE
count,319.0,319.0,319.0,319.0,319.0,319.0
mean,1.0,2.37685,36890.98,52.047022,0.053292,0.012539
std,0.0,0.776606,202487.0,36.361752,0.224967,0.111449
min,1.0,1.0,4.569029,5.0,0.0,0.0
25%,1.0,1.9,5000.0,30.0,0.0,0.0
50%,1.0,2.3,12400.45,45.0,0.0,0.0
75%,1.0,2.8572,25000.0,60.0,0.0,0.0
max,1.0,4.6163,2513552.0,180.0,1.0,1.0


In [40]:
df2 = sm.add_constant(df2)

In [49]:
col = df2.columns.to_list()
col

['const',
 'DE',
 'Party',
 'min_playtime',
 'averageweight',
 'usd_goal',
 'usd_pledged']

In [46]:
const = [df2["const"].to_list()]
avgwei = [df2["averageweight"].to_list()]
usd_goal = [df2["usd_goal"].to_list()]
min_playtime = [df2["min_playtime"].to_list()]
party = [df2["Party"].to_list()]
DE = [df2["DE"].to_list()]
usd_pledged = [df2["usd_pledged"].to_list()]

const

## Für Christopher


[[1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,

In [50]:
dic = {
col[0]: const,
col[1]: DE,
col[2]: party,
col[3]: min_playtime,
col[4]: avgwei,
col[5]: usd_goal,
col[6]: usd_pledged
}

In [51]:
df_lin_re = pd.DataFrame(dic)
df_lin_re

Unnamed: 0,const,DE,Party,min_playtime,averageweight,usd_goal,usd_pledged
0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[90.0, 15.0, 120.0, 5.0, 60.0, 10.0, 60.0, 30....","[2.625, 1.0385, 2.6667, 1.675, 2.2534, 2.0714,...","[786.7978359249521, 44415.55900059686, 27663.0...","[10105.01, 210531.26, 63244.0, 12058.0, 76254...."


In [52]:
df_lin_re.to_csv("/Users/jannikduda/neuefische/Daily-Lama-Capstone/data/bgg_lin_re.csv")

In [56]:
# create an OLS model
our_model = sm.OLS(y, X)
#our_model = sm.MANOVA(y, X)

# use the data to calculate the intercept and slope
model_results = our_model.fit()

# return the output of the model
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared:,0.173
Model:,OLS,Adj. R-squared:,0.16
Method:,Least Squares,F-statistic:,13.13
Date:,"Wed, 13 Jul 2022",Prob (F-statistic):,1.28e-11
Time:,14:58:04,Log-Likelihood:,-4419.5
No. Observations:,319,AIC:,8851.0
Df Residuals:,313,BIC:,8873.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.058e+04,4.92e+04,-1.637,0.103,-1.77e+05,1.63e+04
averageweight,1.052e+05,2.24e+04,4.693,0.000,6.11e+04,1.49e+05
usd_goal,0.1524,0.072,2.120,0.035,0.011,0.294
min_playtime,-1208.8459,459.565,-2.630,0.009,-2113.073,-304.619
Party,3.985e+05,6.61e+04,6.025,0.000,2.68e+05,5.29e+05
DE,3.158e+05,1.28e+05,2.464,0.014,6.36e+04,5.68e+05

0,1,2,3
Omnibus:,278.83,Durbin-Watson:,1.526
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6227.891
Skew:,3.555,Prob(JB):,0.0
Kurtosis:,23.445,Cond. No.,1850000.0


In [63]:
model_results.rsquared

0.17340417146148845

In [60]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [61]:
# X = df[['averageweight','usd_goal','min_playtime','Party', 'DE']]
predicted_pledged_amount = regr.predict([[4, 50000, 20, 1, 0]])
predicted_pledged_amount



ValueError: X has 5 features, but LinearRegression is expecting 6 features as input.