# Data cleaning BGG Database

### Import Data from SQL

In [71]:
import pandas as pd
import sql_functions as sf
import Capstone_functions as cp
from IPython.display import clear_output

engine = sf.get_engine()
 
schema = "bgg_data"

In [72]:
sql = f'''SELECT * FROM {schema}.main;
      '''
df_main = sf.get_dataframe(sql)

## I start filtering by main table

In [73]:
df_main.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age
0,98401,2010,2,4,30,30,30,10
1,98402,2011,2,5,30,30,30,8


In [74]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137153 entries, 0 to 137152
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   id             137153 non-null  int64
 1   yearpublished  137153 non-null  int64
 2   min_players    137153 non-null  int64
 3   max_players    137153 non-null  int64
 4   playtime       137153 non-null  int64
 5   min_playtime   137153 non-null  int64
 6   max_playtime   137153 non-null  int64
 7   min_age        137153 non-null  int64
dtypes: int64(8)
memory usage: 8.4 MB


### I merge the main table with the statistics table

In [75]:
sql = f'''SELECT * FROM {schema}.statistics;
      '''
df_stats = sf.get_dataframe(sql)

In [76]:
df_stats.head(2)

Unnamed: 0,id,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98400,0.0,0,0,0,0,0,0,0,0.0
1,98401,7.08333,3,15,0,1,3,2,2,1.5


In [120]:
df_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333291 entries, 0 to 333290
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             333291 non-null  int64  
 1   average        333291 non-null  float64
 2   user_rated     333291 non-null  int64  
 3   num_owned      333291 non-null  int64  
 4   trading        333291 non-null  int64  
 5   wanting        333291 non-null  int64  
 6   wishing        333291 non-null  int64  
 7   numcomments    333291 non-null  int64  
 8   numweights     333291 non-null  int64  
 9   averageweight  333291 non-null  float64
dtypes: float64(2), int64(8)
memory usage: 25.4 MB


In [None]:
df_stats.query

In [77]:
df_main_stats = pd.merge(df_main,df_stats,on='id')

In [78]:
df_main_stats.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98401,2010,2,4,30,30,30,10,7.08333,3,15,0,1,3,2,2,1.5
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2


In [121]:
df_main_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137147 entries, 0 to 137146
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             137147 non-null  int64  
 1   yearpublished  137147 non-null  int64  
 2   min_players    137147 non-null  int64  
 3   max_players    137147 non-null  int64  
 4   playtime       137147 non-null  int64  
 5   min_playtime   137147 non-null  int64  
 6   max_playtime   137147 non-null  int64  
 7   min_age        137147 non-null  int64  
 8   average        137147 non-null  float64
 9   user_rated     137147 non-null  int64  
 10  num_owned      137147 non-null  int64  
 11  trading        137147 non-null  int64  
 12  wanting        137147 non-null  int64  
 13  wishing        137147 non-null  int64  
 14  numcomments    137147 non-null  int64  
 15  numweights     137147 non-null  int64  
 16  averageweight  137147 non-null  float64
dtypes: float64(2), int64(15)
memo

### I use min age <= 18 

In [79]:
mask_min_age = df_main_stats['min_age'] < 18

In [80]:
df_main_stats_masked = df_main_stats.loc[mask_min_age,:]

In [81]:
df_main_stats_masked


Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98401,2010,2,4,30,30,30,10,7.08333,3,15,0,1,3,2,2,1.5
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
2,98406,2011,2,4,30,30,30,10,6.73333,9,16,3,0,0,3,1,1.0
3,98416,2007,2,4,50,50,50,8,6.80000,2,16,2,1,0,2,1,3.0
4,98417,2009,2,2,60,60,60,12,0.00000,0,10,1,2,2,0,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137142,298189,2020,3,7,120,30,120,7,0.00000,0,0,0,0,0,0,0,0.0
137143,298190,2020,3,6,30,30,30,12,6.66667,3,47,2,1,5,4,0,0.0
137144,298191,2022,2,5,80,30,80,8,0.00000,0,5,0,0,0,0,0,0.0
137145,298193,2020,2,6,20,10,20,10,7.38077,26,61,1,0,7,7,2,1.5


### I filter all games id out that don't have at least 10 reviews

In [82]:
mask_reviews = df_main_stats_masked['user_rated'] > 9

In [83]:
df_main_stats_masked = df_main_stats_masked.loc[mask_reviews,:]

In [84]:
df_main_stats_masked.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0


In [85]:
df_main_stats_masked.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49270 entries, 1 to 137145
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             49270 non-null  int64  
 1   yearpublished  49270 non-null  int64  
 2   min_players    49270 non-null  int64  
 3   max_players    49270 non-null  int64  
 4   playtime       49270 non-null  int64  
 5   min_playtime   49270 non-null  int64  
 6   max_playtime   49270 non-null  int64  
 7   min_age        49270 non-null  int64  
 8   average        49270 non-null  float64
 9   user_rated     49270 non-null  int64  
 10  num_owned      49270 non-null  int64  
 11  trading        49270 non-null  int64  
 12  wanting        49270 non-null  int64  
 13  wishing        49270 non-null  int64  
 14  numcomments    49270 non-null  int64  
 15  numweights     49270 non-null  int64  
 16  averageweight  49270 non-null  float64
dtypes: float64(2), int64(15)
memory usage: 6.8 MB


### I filter all games out that are owned by less than 10 people

In [86]:
mask_owned = df_main_stats_masked['num_owned'] > 9

In [87]:
df_main_stats_masked = df_main_stats_masked.loc[mask_owned,:]

In [88]:
df_main_stats_masked.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0


In [89]:
df_main_stats_masked.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48795 entries, 1 to 137145
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             48795 non-null  int64  
 1   yearpublished  48795 non-null  int64  
 2   min_players    48795 non-null  int64  
 3   max_players    48795 non-null  int64  
 4   playtime       48795 non-null  int64  
 5   min_playtime   48795 non-null  int64  
 6   max_playtime   48795 non-null  int64  
 7   min_age        48795 non-null  int64  
 8   average        48795 non-null  float64
 9   user_rated     48795 non-null  int64  
 10  num_owned      48795 non-null  int64  
 11  trading        48795 non-null  int64  
 12  wanting        48795 non-null  int64  
 13  wishing        48795 non-null  int64  
 14  numcomments    48795 non-null  int64  
 15  numweights     48795 non-null  int64  
 16  averageweight  48795 non-null  float64
dtypes: float64(2), int64(15)
memory usage: 6.7 MB


### I look for outliers in df_main_stats_masked table

In [90]:
df_main_stats_masked.describe()

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
count,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0
mean,142401.013792,1981.290276,1.923496,5.872917,78.791331,55.727513,77.930321,9.239697,6.640524,476.855006,997.909827,25.459391,26.783523,133.950999,118.423076,25.632462,1.730011
std,109703.640873,236.374177,0.719411,15.282816,384.468289,305.18436,384.547079,4.233991,1.087096,2693.970859,4094.283649,74.2558,83.838609,578.764236,484.439348,144.78574,1.118108
min,1.0,-3500.0,0.0,0.0,0.0,0.0,0.0,0.0,1.16208,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25343.5,2004.0,2.0,3.0,20.0,20.0,20.0,8.0,5.93728,20.0,70.0,2.0,1.0,7.0,9.0,1.0,1.0
50%,148556.0,2013.0,2.0,4.0,45.0,30.0,45.0,10.0,6.71875,49.0,192.0,6.0,6.0,21.0,22.0,3.0,1.801
75%,236840.5,2017.0,2.0,6.0,90.0,60.0,90.0,12.0,7.441495,168.0,595.0,20.0,19.0,69.0,66.0,10.0,2.5
max,365104.0,2024.0,15.0,1000.0,60000.0,60000.0,60000.0,17.0,10.0,112718.0,175618.0,2655.0,2008.0,20091.0,19916.0,7809.0,5.0


In [91]:
mask = df_main_stats_masked['min_playtime'] < 600


In [92]:
df_main_stats_masked.loc[mask,:]

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2000
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0000
17,98443,2012,1,4,90,90,90,10,6.72745,286,528,47,57,177,137,21,2.6667
22,98452,1984,1,0,0,0,0,0,6.30769,13,39,2,0,1,7,1,3.0000
25,98472,2011,3,7,60,60,60,10,5.54551,167,477,34,7,71,55,14,2.2143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137136,298165,2020,2,5,45,30,45,12,7.05882,17,38,3,0,11,4,3,2.0000
137137,298166,2020,2,6,5,5,5,8,6.57194,134,379,3,3,23,29,8,1.1250
137138,298171,2020,2,6,5,5,5,8,6.47783,106,328,2,4,26,26,3,1.0000
137139,298173,2020,2,6,5,5,5,8,6.55657,99,271,2,4,24,22,4,1.0000


# Cleaning Marketplace table

### Clean-out all offers above 250 dollars

I first import the table from our SQL-Database

In [93]:
sql = f'''SELECT * FROM {schema}.marketplace_listings;
      '''
df_marketplace = sf.get_dataframe(sql)

In [94]:
df_marketplace.head(2)

Unnamed: 0,id,listdate,price,currency,condition,conv_currency,price_in_dollars
0,98443,2018-08-27,50.0,GBP,verygood,1.21,60.5
1,98443,2019-03-25,115.0,EUR,likenew,1.03,118.45


In [95]:
df_marketplace.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241557 entries, 0 to 241556
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                241557 non-null  int64         
 1   listdate          241557 non-null  datetime64[ns]
 2   price             241557 non-null  float64       
 3   currency          241557 non-null  object        
 4   condition         241547 non-null  object        
 5   conv_currency     241557 non-null  float64       
 6   price_in_dollars  241557 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 12.9+ MB


In [96]:
mask_too_exp = (df_marketplace['price_in_dollars'] < 251) & (df_marketplace['price_in_dollars'] > 0)

In [97]:
df_marketplace_filt = df_marketplace.loc[mask_too_exp,:]

In [98]:
df_marketplace_filt.head(2)

Unnamed: 0,id,listdate,price,currency,condition,conv_currency,price_in_dollars
0,98443,2018-08-27,50.0,GBP,verygood,1.21,60.5
1,98443,2019-03-25,115.0,EUR,likenew,1.03,118.45


In [99]:
df_marketplace_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238629 entries, 0 to 241556
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                238629 non-null  int64         
 1   listdate          238629 non-null  datetime64[ns]
 2   price             238629 non-null  float64       
 3   currency          238629 non-null  object        
 4   condition         238619 non-null  object        
 5   conv_currency     238629 non-null  float64       
 6   price_in_dollars  238629 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 14.6+ MB


In [100]:
df_marketplace_filt.groupby('condition').count()

Unnamed: 0_level_0,id,listdate,price,currency,conv_currency,price_in_dollars
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
acceptable,3145,3145,3145,3145,3145,3145
good,12767,12767,12767,12767,12767,12767
likenew,56907,56907,56907,56907,56907,56907
new,117682,117682,117682,117682,117682,117682
verygood,48118,48118,48118,48118,48118,48118


In [101]:
sql = f'''SELECT * FROM {schema}.expansions;
      '''
df_expansions = sf.get_dataframe(sql)

In [105]:
expansion_liste = df_expansions['expansion_id']

In [107]:
mask = ~df_main_stats_masked['id'].isin(expansion_liste)

In [108]:
df_main_stats_masked = df_main_stats_masked.loc[mask,:]

In [109]:
df_main_stats_masked

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2000
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0000
25,98472,2011,3,7,60,60,60,10,5.54551,167,477,34,7,71,55,14,2.2143
28,98475,2011,1,4,50,50,50,10,6.76767,43,78,2,8,30,11,4,2.7500
38,98529,2011,2,4,15,15,15,6,6.04040,248,427,24,18,42,118,22,1.0909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137136,298165,2020,2,5,45,30,45,12,7.05882,17,38,3,0,11,4,3,2.0000
137137,298166,2020,2,6,5,5,5,8,6.57194,134,379,3,3,23,29,8,1.1250
137138,298171,2020,2,6,5,5,5,8,6.47783,106,328,2,4,26,26,3,1.0000
137139,298173,2020,2,6,5,5,5,8,6.55657,99,271,2,4,24,22,4,1.0000


In [126]:
mask_year_0 = df_main_stats_masked['yearpublished'] == 0

# df_main_stats_masked = df_main_stats_masked.loc[mask_year_0,'yearpublished'] = None

TypeError: 'int' object is not subscriptable

In [118]:
df_main_stats_masked

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2000
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0000
25,98472,2011,3,7,60,60,60,10,5.54551,167,477,34,7,71,55,14,2.2143
28,98475,2011,1,4,50,50,50,10,6.76767,43,78,2,8,30,11,4,2.7500
38,98529,2011,2,4,15,15,15,6,6.04040,248,427,24,18,42,118,22,1.0909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137136,298165,2020,2,5,45,30,45,12,7.05882,17,38,3,0,11,4,3,2.0000
137137,298166,2020,2,6,5,5,5,8,6.57194,134,379,3,3,23,29,8,1.1250
137138,298171,2020,2,6,5,5,5,8,6.47783,106,328,2,4,26,26,3,1.0000
137139,298173,2020,2,6,5,5,5,8,6.55657,99,271,2,4,24,22,4,1.0000
