In [1]:
# Data handling
import pandas as pd
import numpy as np
import random
import missingno as msn
from sklearn.impute import SimpleImputer
from scipy import stats
import datetime as dt
# Scrapping data
from bs4 import BeautifulSoup
import requests
import csv
import re
# Plotting
import matplotlib.pyplot as plt
from matplotlib import cm 
from pandas.plotting import scatter_matrix
import seaborn as sns
from matplotlib import dates as mpl_dates
# Data Split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# Data Pre-Processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

# Assesing model performance
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_val_score

# Tunning
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

## 1. Data Cleaning and feature enginnering

[Data collection](https://www.kaggle.com/caesuric/bgggamesdata?select=basic_data.csv)

In [2]:
pd.set_option("display.max_columns", 40)
pd.set_option("display.max_rows", 1000)

In [9]:
df2 = pd.read_csv("database.sqlite/board_games.csv")

In [10]:
df2.columns

Index(['name', 'description', 'thumbnail', 'image', 'rating', 'bayes_rating',
       'usersrated', 'bggrank', 'stddev', 'owned', 'trading', 'wanting',
       'wishing', 'numweights', 'averageweight', 'yearpublished', 'minplayers',
       'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'age'],
      dtype='object')

In [12]:
df2 = df2.drop(["thumbnail", "image"], axis=1)

In [13]:
# it may be interesting to have the description length as an independent feature
df2["description_length"] = df2["description"].str.len()

Next, lets look at the nan values

In [7]:
df2.isna().sum()

name                       0
description                0
rating                     0
bayes_rating               0
usersrated                 0
bggrank                    0
stddev                     0
owned                      0
trading                    0
wanting                    0
wishing                    0
numweights                 0
averageweight              0
yearpublished         167543
minplayers            167543
maxplayers            167543
playingtime           167543
minplaytime           167543
maxplaytime           167543
age                   167543
description_length         0
dtype: int64

In [14]:
df2 = df2.dropna()

In [18]:
df2 = df2.reset_index(drop=True)

We can observe that there are some clear outliers such the negative yearpublish. Before removing outliers we need to decide what numerical attributes we want to keep.

Let's remove outliers beased on the **Z-score**

In [19]:
# divinding out data set into numeric and object. 
df2_num = df2.select_dtypes(include=["int64", "float64"])
df2_cat = df2.select_dtypes(include="object")

In [22]:
df2_num = df2_num[(np.abs(stats.zscore(df2_num)) < 3).all(axis=1)]

Let's focus on the categorical values next

We can join now the df_num and df_cat

In [25]:
df2 = pd.concat([df2_num, df2_cat], axis=1, sort=False)

In [28]:
df2 = df2.dropna().reset_index(drop=True)

In [30]:
df2["owned by year"] = df2["owned"]/(2020 - df2["yearpublished"])

In [37]:
df2 = df2.loc[~(df2["owned by year"] < 0)]

In [39]:
df2 = df2.reset_index(drop=True)

In [41]:
df2["yearpublished"] = df2["yearpublished"].astype(int)

In [43]:
df2

Unnamed: 0,rating,bayes_rating,usersrated,stddev,owned,trading,wanting,wishing,numweights,averageweight,yearpublished,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,age,description_length,name,description,bggrank,owned by year
0,6.61412,5.80514,545.0,1.44458,1236.0,68.0,69.0,175.0,54.0,1.9630,1981,3.0,4.0,30.0,30.0,30.0,12.0,937.0,Dragonmaster,Dragonmaster is a trick-taking card game based...,3473,31.692308
1,6.61683,5.70180,334.0,1.23365,616.0,38.0,60.0,121.0,30.0,2.6667,1992,2.0,4.0,60.0,60.0,60.0,12.0,611.0,Tal der Könige,When you see the triangular box and the luxuri...,4583,22.000000
2,6.51062,5.55218,80.0,1.64101,124.0,1.0,33.0,49.0,7.0,3.0000,1989,2.0,6.0,240.0,240.0,240.0,12.0,455.0,Mare Mediterraneum,"In the ancient lands along the Mediterranean, ...",9421,4.000000
3,6.14306,5.57732,196.0,1.28950,527.0,33.0,22.0,52.0,15.0,2.4000,1993,2.0,5.0,120.0,120.0,120.0,12.0,651.0,Lords of Creation,"In this interesting offering from Warfrog, pla...",7803,19.518519
4,6.75753,6.16222,1593.0,1.22475,1820.0,88.0,53.0,184.0,181.0,1.8619,1998,3.0,4.0,25.0,25.0,25.0,10.0,1593.0,Basari,Basari is a game of gem merchants competing in...,1767,82.727273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101766,0.00000,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0000,2021,2.0,4.0,120.0,90.0,120.0,12.0,882.0,Factory 42,"A semi-cooperative game of dwarves, negotiatio...",Not Ranked,-0.000000
101767,2.00000,0.00000,1.0,0.00000,0.0,0.0,0.0,1.0,0.0,0.0000,2019,2.0,6.0,0.0,0.0,0.0,13.0,737.0,Clue: Downton Abbey,The classic mystery board game has an aristocr...,Not Ranked,0.000000
101768,0.00000,0.00000,0.0,0.00000,1.0,0.0,0.0,0.0,0.0,0.0000,2020,3.0,5.0,20.0,5.0,20.0,14.0,418.0,Crossroads,"If you want to improve your guitar, go to the ...",Not Ranked,inf
101769,0.00000,0.00000,0.0,0.00000,0.0,0.0,1.0,2.0,0.0,0.0000,2021,2.0,2.0,480.0,240.0,480.0,12.0,1316.0,2 Minutes to Midnight,2 Minutes to Midnight is a fresh design using ...,Not Ranked,-0.000000
