In [5]:
from sql_functions import get_dataframe
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from pylab import rcParams
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
RSEED = 10

In [6]:
schema = "bgg_data"
kickstarter = "kickstarter_unique_campaigns"
slug = "unique_slug_bgg_id"
main_stats = "unfiltered_main_stats_cleaned"
stat = "statistics"

df_ks = get_dataframe(f"SELECT * FROM {schema}.{kickstarter}")
df_slug = get_dataframe(f"SELECT * FROM {schema}.{slug}")
df_main_stats = get_dataframe(f"SELECT * FROM {schema}.{main_stats}")
#df_ks.rename({'bgg_id':'id'},axis=1,inplace=True)

In [7]:
df_ks_slug = pd.merge(df_slug,df_ks,on='slug')
df_ks_slug.rename({'bgg_id':'id'},axis=1,inplace=True)

In [8]:
df = pd.merge(df_ks_slug,df_main_stats,on='id')

In [9]:
df.columns

Index(['slug', 'id', 'deadline', 'created_at', 'launched_at', 'country',
       'currency', 'goal', 'pledged', 'backers_count', 'usd_pledged',
       'state_changed_at', 'successful', 'yearpublished', 'min_players',
       'max_players', 'playtime', 'min_playtime', 'max_playtime', 'min_age',
       'average', 'user_rated', 'num_owned', 'trading', 'wanting', 'wishing',
       'numcomments', 'numweights', 'averageweight'],
      dtype='object')

In [10]:
cols_to_drop = ['id',
                'slug',
                'deadline', 
                'created_at',
                'launched_at',
                'country',
                'pledged',
                'backers_count',
                'currency',
                'state_changed_at',
                'successful',
                'yearpublished',
                'playtime',
                'wishing',
                'numcomments',
                'user_rated',
                'num_owned',
                'trading',
                'numweights']

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4537 entries, 0 to 4536
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   slug              4537 non-null   object        
 1   id                4537 non-null   int64         
 2   deadline          4537 non-null   datetime64[ns]
 3   created_at        4537 non-null   datetime64[ns]
 4   launched_at       4537 non-null   datetime64[ns]
 5   country           4537 non-null   object        
 6   currency          4537 non-null   object        
 7   goal              4537 non-null   int64         
 8   pledged           4537 non-null   float64       
 9   backers_count     4537 non-null   int64         
 10  usd_pledged       4537 non-null   float64       
 11  state_changed_at  4537 non-null   datetime64[ns]
 12  successful        4537 non-null   bool          
 13  yearpublished     4405 non-null   float64       
 14  min_players       4521 n

In [95]:
df.drop(cols_to_drop,axis=1,inplace=True)

In [96]:
df.dropna(axis=0,inplace=True)

In [97]:
stats.zscore(df['min_playtime'])

2      -0.246679
3      -0.246679
4       0.012582
6       0.401475
7       0.401475
          ...   
4530   -0.117049
4531    0.142213
4532    0.012582
4533   -0.203469
4535   -0.289890
Name: min_playtime, Length: 2950, dtype: float64

In [98]:
df = df[(np.abs(stats.zscore(df['min_playtime'])) < 2)]
df = df[(np.abs(stats.zscore(df['max_playtime'])) < 2)]
df = df[(np.abs(stats.zscore(df['max_players'])) < 2)]
df = df[(np.abs(stats.zscore(df['min_players'])) < 3)]
df = df[df["min_age"] < 19]

In [99]:
df.describe()

Unnamed: 0,goal,usd_pledged,min_players,max_players,min_playtime,max_playtime,min_age,average,wanting,averageweight
count,2810.0,2810.0,2810.0,2810.0,2810.0,2810.0,2810.0,2810.0,2810.0,2810.0
mean,25304.47,160783.1,1.73879,4.673665,40.55694,70.376157,11.080427,7.0161,46.670107,2.196895
std,66162.77,576094.3,0.61147,1.687361,29.522215,50.316731,3.195194,1.156285,124.743347,0.767368
min,5.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,5000.0,9106.225,1.0,4.0,20.0,30.0,9.0,6.466095,2.0,1.6667
50%,12000.0,26566.5,2.0,4.0,30.0,60.0,12.0,7.06618,9.0,2.0
75%,25000.0,89845.5,2.0,6.0,60.0,90.0,14.0,7.673322,36.0,2.75
max,2200000.0,12969610.0,3.0,16.0,250.0,360.0,18.0,10.0,2008.0,4.8173


In [100]:
y = df["usd_pledged"]
X = df.drop("usd_pledged", axis=1)

In [102]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2810 entries, 2 to 4535
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   goal           2810 non-null   int64  
 1   min_players    2810 non-null   float64
 2   max_players    2810 non-null   float64
 3   min_playtime   2810 non-null   float64
 4   max_playtime   2810 non-null   float64
 5   min_age        2810 non-null   int64  
 6   average        2810 non-null   float64
 7   wanting        2810 non-null   int64  
 8   averageweight  2810 non-null   float64
dtypes: float64(6), int64(3)
memory usage: 219.5 KB


In [108]:
# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [109]:
classifier = LogisticRegression() # instantiate a sklearn logistic regression class
classifier.fit(X_train, y_train) # fit the classifier/model on our train data 
y_prediction = classifier.predict(X_test) # use the fit model to predict on our test data 

#have a look at the predicitons
#y_prediction[:10]

ValueError: Unknown label type: 'continuous'