In [427]:
from sql_functions import get_dataframe, get_engine, build_table
import pandas as pd
import capstone_functions as cf
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy import interpolate
from sklearn import linear_model
import seaborn as sns

pd.options.mode.chained_assignment = None  # default='warn'

############################
# VARIBALES
############################
# How many mechanics are in the top mechanics
top_XX_mechanic = 25

#schema and table names
schema = 'bgg_data'

table_main = 'unfiltered_main_stats_cleaned'
table_subdomain = 'subdomain'
table_unique_subdomain = 'unique_subdomain'
table_kickstarter = 'kickstarter_unique_campaigns'
table_slug = 'unique_slug_bgg_id'
table_mechanic = 'mechanics'
table_unique_mechanics = 'unique_mechanics'
table_family = 'family_bgg'

engine = get_engine()

### Build base dataframes

In [428]:
df_main = get_dataframe(f"SELECT * FROM {schema}.{table_main}")
df_family = get_dataframe(f"SELECT * FROM {schema}.{table_family};")
df_mech = get_dataframe(f"SELECT * FROM {schema}.{table_mechanic}")
df_u_mech = get_dataframe(f"SELECT * FROM {schema}.{table_unique_mechanics}")
df_mech = pd.merge(df_mech,df_u_mech,on='mechanic_id')

### CREATE Dataframe for upcoming games

In [429]:
df_main_family = pd.merge(df_main,df_family,on='id')
df_upcoming_games = df_main_family.query("family_type == 'admin' and family_value == 'upcoming releases'")
df_upcoming_games.drop(["yearpublished", 'trading', 'numcomments','family_type', 'family_value',
       'family_id','average', 'user_rated'],axis=1,inplace=True)

### CREATE Dataframe for mechanics and list for top mechanics

In [430]:
df_temp = pd.merge(df_main,df_mech,on='id')
top_rated_mechanics_list = list(df_temp.groupby('mechanic').mean('average').sort_values('average',ascending=False).reset_index().mechanic.head(top_XX_mechanic))

df_mech["is_in_top_XX_mechanics"] = df_mech["mechanic"].isin(top_rated_mechanics_list)
df_mech = df_mech[['id','is_in_top_XX_mechanics']]
df_mech = df_mech.groupby('id').sum().reset_index()
df_mech[f"top_{top_XX_mechanic}_mechanic"] = df_mech.is_in_top_XX_mechanics > 0
df_mech = df_mech[['id',f"top_{top_XX_mechanic}_mechanic"]]

### MERGE upcoming_games and mech

In [431]:
df_upcoming_games_mech = pd.merge(df_upcoming_games,df_mech, on='id')

### Filter for only upcoming games on kickstarter

In [432]:
df_upcoming_games_mech_only_ks_and_topXX = df_upcoming_games_mech.loc[df_upcoming_games_mech.kickstarter,:]

In [433]:
df_upcoming_games_mech_only_ks_and_topXX.dropna(inplace=True)
df_upcoming = df_upcoming_games_mech_only_ks_and_topXX.sort_values('wishing',ascending=False)
df_upcoming.drop(['playtime','kickstarter'],axis=1,inplace=True)
df_upcoming[f"top_{top_XX_mechanic}_mechanic"] = df_upcoming[f"top_{top_XX_mechanic}_mechanic"].astype(int)

In [434]:
df_upcoming

Unnamed: 0,id,min_players,max_players,min_playtime,max_playtime,min_age,num_owned,wanting,wishing,numweights,averageweight,top_25_mechanic
3787,295770,1.0,4.0,30.0,120.0,14,1365,152,3028,48,3.5625,0
3079,219650,1.0,4.0,60.0,120.0,14,196,161,1758,1,4.0000,0
4271,337627,1.0,4.0,90.0,240.0,15,260,129,1700,45,4.3556,0
4919,322289,1.0,4.0,60.0,120.0,14,522,176,1687,62,3.7419,1
4228,240980,5.0,20.0,30.0,120.0,15,917,201,1673,65,3.0923,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3475,317640,2.0,6.0,10.0,20.0,12,6,0,0,1,1.0000,0
1579,240110,5.0,10.0,30.0,60.0,4,0,0,0,1,2.0000,0
1559,340069,3.0,8.0,15.0,30.0,10,3,0,0,1,1.0000,0
1552,340023,2.0,4.0,15.0,30.0,8,0,0,0,1,1.0000,0


### CREATE dataframe for existing games

In [435]:
df_main_family = pd.merge(df_main,df_family,on='id')
df_existing_games = df_main_family.query("family_value != 'upcoming releases'")
df_existing_games.drop(["yearpublished", 'trading', 'numcomments','family_type', 'family_value',
       'family_id','average', 'playtime', 'user_rated'],axis=1,inplace=True)

### CREATE dataframe for kickstarter games

In [436]:

df_ks = get_dataframe(f"SELECT * FROM {schema}.{table_kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{table_slug}")
df_ks = pd.merge(df_slug,df_ks,on='slug')
df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)
df_ks['usd_goal'] = df_ks['goal']*(df_ks.pledged/df_ks.usd_pledged)
df_ks = df_ks[['id','usd_pledged','usd_goal']]

In [437]:
df_existing = pd.merge(df_existing_games,df_ks,on='id')
df_existing = pd.merge(df_existing,df_mech,on='id')
df_existing.drop('kickstarter',axis=1,inplace=True)
df_existing.drop_duplicates(inplace=True)
df_existing.dropna(inplace=True)
df_existing.top_25_mechanic = df_existing.top_25_mechanic.astype(int)

In [438]:
df_existing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759 entries, 0 to 10157
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2759 non-null   int64  
 1   min_players      2759 non-null   float64
 2   max_players      2759 non-null   float64
 3   min_playtime     2759 non-null   float64
 4   max_playtime     2759 non-null   float64
 5   min_age          2759 non-null   int64  
 6   num_owned        2759 non-null   int64  
 7   wanting          2759 non-null   int64  
 8   wishing          2759 non-null   int64  
 9   numweights       2759 non-null   int64  
 10  averageweight    2759 non-null   float64
 11  usd_pledged      2759 non-null   float64
 12  usd_goal         2759 non-null   float64
 13  top_25_mechanic  2759 non-null   int64  
dtypes: float64(7), int64(7)
memory usage: 323.3 KB


In [439]:
df_existing = df_existing[(np.abs(stats.zscore(df_existing['min_playtime'])) < 2.5)]
df_existing = df_existing[(np.abs(stats.zscore(df_existing['max_playtime'])) < 2.5)]
df_existing = df_existing[df_existing.min_age < 19]
df_existing = df_existing[(np.abs(stats.zscore(df_existing['wishing'])) < 3)]
df_existing = df_existing[(np.abs(stats.zscore(df_existing['usd_goal'])) < 3)]

In [440]:
# df_existing.min_players = df_existing.min_players.astype(int)
# df_existing.max_players = df_existing.max_players.astype(int)
# df_existing.min_playtime = df_existing.min_playtime.astype(int)
# df_existing.max_playtime = df_existing.max_playtime.astype(int)
df_existing.top_25_mechanic = df_existing.top_25_mechanic.astype(int)
#df_existing.averageweight = df_existing.averageweight.round(2)
#df_existing.usd_pledged = df_existing.usd_pledged.round(2)

In [441]:
params = [#'id',
        'min_players', 
        #'max_players', 
        'min_playtime', 
        #'max_playtime',
        'min_age', 
        #'num_owned', 
        'wanting', 
        'wishing', 
        #'numweights',
        'averageweight', 
        #'usd_pledged',  
        #'top_25_mechanic',
        #'usd_goal'
        ]
y_train = df_existing.usd_pledged
X_train = df_existing[params]
df_upcoming["usd_goal"] = df_existing['usd_goal'].mean()
#X = sm.add_constant(X)

In [442]:
# create an OLS model
our_model = sm.OLS(y_train, X_train)
model_results = our_model.fit()
model_results.summary() # summary contains eg. 'const' (intercept) and 'slope' of the regression equation.

0,1,2,3
Dep. Variable:,usd_pledged,R-squared (uncentered):,0.304
Model:,OLS,Adj. R-squared (uncentered):,0.302
Method:,Least Squares,F-statistic:,193.8
Date:,"Sat, 16 Jul 2022",Prob (F-statistic):,2.05e-205
Time:,10:10:07,Log-Likelihood:,-39027.0
No. Observations:,2674,AIC:,78070.0
Df Residuals:,2668,BIC:,78100.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
min_players,-5.225e+04,1.12e+04,-4.653,0.000,-7.43e+04,-3.02e+04
min_playtime,1035.0264,403.909,2.563,0.010,243.021,1827.032
min_age,3844.5633,2957.560,1.300,0.194,-1954.779,9643.906
wanting,-6616.5698,386.716,-17.110,0.000,-7374.863,-5858.276
wishing,1437.2669,63.655,22.579,0.000,1312.448,1562.086
averageweight,4.762e+04,1.54e+04,3.088,0.002,1.74e+04,7.79e+04

0,1,2,3
Omnibus:,3521.068,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,871128.757
Skew:,7.203,Prob(JB):,0.0
Kurtosis:,90.242,Cond. No.,862.0


In [443]:
regr = linear_model.LinearRegression()
regr.fit(X_train.values, y_train.values)

LinearRegression()

In [444]:
X_test = df_upcoming[params]
X_IDs = df_upcoming['id']

In [445]:
df_predict = pd.DataFrame(columns = ['id','predicted_usd_pledged'])
#print(df_predict)
for row in X_test.iterrows():
    game_id = df_upcoming.loc[row[0],'id']
    test_params = row[1].to_list()
    predicted_pledged_amount = regr.predict([test_params])
    new_row = [game_id,predicted_pledged_amount[0]]
    df_predict.loc[len(df_predict)] = new_row

In [426]:
df_predict.describe()

Unnamed: 0,id,predicted_usd_pledged
count,804.0,804.0
mean,297921.936567,172702.7
std,48831.165314,254657.4
min,113503.0,-151538.6
25%,270266.0,46654.94
50%,311824.5,118676.1
75%,336176.5,206854.3
max,365602.0,3552950.0


In [310]:
table_game_names = 'game_names'
df_names = get_dataframe(f"SELECT id, game_name from {schema}.{table_game_names} WHERE primary_bool = 1;")

In [312]:
df_OUT = pd.merge(df_names,df_predict,on='id')

In [449]:
df_OUT.sort_values('predicted_usd_pledged',ascending=False).head(25)

Unnamed: 0,id,game_name,predicted_usd_pledged
603,295770,Frosthaven,3539392.0
666,337627,Voidfall,1895253.0
266,331106,The Witcher: Old World,1781887.0
508,219650,Arydia: The Paths We Dare Tread,1713556.0
769,322289,Darwin's Journey,1522397.0
342,350316,Wayfarers of the South Tigris,1447636.0
180,359871,Arcs,1353418.0
768,360692,Septima,1284162.0
517,311988,Frostpunk: The Board Game,1209964.0
798,286063,The 7th Citadel,1151645.0


In [319]:
build_table(engine,'predicted_pledged_amount',df_OUT,schema)

The predicted_pledged_amount table was imported successfully.
