# Data cleaning BGG Database

### Import Data from SQL

In [36]:
import pandas as pd
import sql_functions as sf
import Capstone_functions as cp
from IPython.display import clear_output

# Schema that every SQL query in this notebook is issued against.
schema = "bgg_data"

# Database engine obtained from the project's sql_functions helper module.
engine = sf.get_engine()

In [37]:
# Load the complete `main` table of the configured schema into a DataFrame.
# NOTE(review): sf.get_dataframe is a project helper; presumably it runs the
# query through the project's database engine — confirm in sql_functions.
sql = f'''SELECT * FROM {schema}.main;
      '''
df_main = sf.get_dataframe(sql)

## I start by filtering the main table

In [38]:
# Peek at the first two rows to sanity-check the loaded main table.
df_main.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age
0,98401,2010,2,4,30,30,30,10
1,98402,2011,2,5,30,30,30,8


In [39]:
# Column dtypes, non-null counts and memory footprint of the main table.
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137153 entries, 0 to 137152
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   id             137153 non-null  int64
 1   yearpublished  137153 non-null  int64
 2   min_players    137153 non-null  int64
 3   max_players    137153 non-null  int64
 4   playtime       137153 non-null  int64
 5   min_playtime   137153 non-null  int64
 6   max_playtime   137153 non-null  int64
 7   min_age        137153 non-null  int64
dtypes: int64(8)
memory usage: 8.4 MB


### I merge the main table with the statistics table

In [40]:
# Load the complete `statistics` table of the configured schema into a DataFrame.
# NOTE(review): sf.get_dataframe is a project helper; presumably it runs the
# query through the project's database engine — confirm in sql_functions.
sql = f'''SELECT * FROM {schema}.statistics;
      '''
df_stats = sf.get_dataframe(sql)

In [41]:
# Peek at the first two rows of the statistics table.
df_stats.head(2)

Unnamed: 0,id,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98400,0.0,0,0,0,0,0,0,0,0.0
1,98401,7.08333,3,15,0,1,3,2,2,1.5


In [42]:
# Join the main and statistics tables on the shared game id. The default
# join type is "inner", so only ids present in both tables are kept.
df_main_stats = df_main.merge(df_stats, on='id')

In [43]:
# Peek at the first two rows of the merged main + statistics frame.
df_main_stats.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98401,2010,2,4,30,30,30,10,7.08333,3,15,0,1,3,2,2,1.5
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2


### I keep only games with a minimum age below 18 (min_age < 18)

In [44]:
# Boolean mask: True for games whose minimum age is strictly below 18
# (a min_age of exactly 18 is therefore not selected).
mask_min_age = df_main_stats['min_age'].lt(18)

In [45]:
# Keep only the rows selected by the minimum-age mask (all columns).
df_main_stats_masked = df_main_stats.loc[mask_min_age]

In [46]:
# Display the age-filtered frame; pandas truncates the rendering to the
# first and last rows.
df_main_stats_masked


Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,98401,2010,2,4,30,30,30,10,7.08333,3,15,0,1,3,2,2,1.5
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
2,98406,2011,2,4,30,30,30,10,6.73333,9,16,3,0,0,3,1,1.0
3,98416,2007,2,4,50,50,50,8,6.80000,2,16,2,1,0,2,1,3.0
4,98417,2009,2,2,60,60,60,12,0.00000,0,10,1,2,2,0,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137142,298189,2020,3,7,120,30,120,7,0.00000,0,0,0,0,0,0,0,0.0
137143,298190,2020,3,6,30,30,30,12,6.66667,3,47,2,1,5,4,0,0.0
137144,298191,2022,2,5,80,30,80,8,0.00000,0,5,0,0,0,0,0,0.0
137145,298193,2020,2,6,20,10,20,10,7.38077,26,61,1,0,7,7,2,1.5


### I filter out all game ids that don't have at least 10 reviews

In [47]:
# Boolean mask: True for games with at least 10 user ratings.
# user_rated is an integer column, so `>= 10` selects exactly the games
# with ten or more ratings.
mask_reviews = df_main_stats_masked['user_rated'] >= 10

In [48]:
# Narrow the working frame to the games selected by the review-count mask.
df_main_stats_masked = df_main_stats_masked.loc[mask_reviews]

In [49]:
# Peek at the first two rows remaining after the review-count filter.
df_main_stats_masked.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0


In [50]:
# Row count and dtypes after the review-count filter.
df_main_stats_masked.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49270 entries, 1 to 137145
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             49270 non-null  int64  
 1   yearpublished  49270 non-null  int64  
 2   min_players    49270 non-null  int64  
 3   max_players    49270 non-null  int64  
 4   playtime       49270 non-null  int64  
 5   min_playtime   49270 non-null  int64  
 6   max_playtime   49270 non-null  int64  
 7   min_age        49270 non-null  int64  
 8   average        49270 non-null  float64
 9   user_rated     49270 non-null  int64  
 10  num_owned      49270 non-null  int64  
 11  trading        49270 non-null  int64  
 12  wanting        49270 non-null  int64  
 13  wishing        49270 non-null  int64  
 14  numcomments    49270 non-null  int64  
 15  numweights     49270 non-null  int64  
 16  averageweight  49270 non-null  float64
dtypes: float64(2), int64(15)
memory usage: 6.8 MB


### I filter out all games that are owned by fewer than 10 people

In [51]:
# Boolean mask: True for games owned by at least 10 people.
# num_owned is an integer column, so `>= 10` selects exactly the games
# with ten or more owners.
mask_owned = df_main_stats_masked['num_owned'] >= 10

In [52]:
# Narrow the working frame to the games selected by the ownership mask.
df_main_stats_masked = df_main_stats_masked.loc[mask_owned]

In [56]:
# Peek at the first two rows remaining after the ownership filter.
df_main_stats_masked.head(2)

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
1,98402,2011,2,5,30,30,30,8,6.34615,26,20,0,7,13,9,5,2.2
12,98426,0,2,6,45,45,45,0,5.43333,48,61,0,0,5,9,0,0.0


In [54]:
# Row count and dtypes after the ownership filter.
df_main_stats_masked.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48795 entries, 1 to 137145
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             48795 non-null  int64  
 1   yearpublished  48795 non-null  int64  
 2   min_players    48795 non-null  int64  
 3   max_players    48795 non-null  int64  
 4   playtime       48795 non-null  int64  
 5   min_playtime   48795 non-null  int64  
 6   max_playtime   48795 non-null  int64  
 7   min_age        48795 non-null  int64  
 8   average        48795 non-null  float64
 9   user_rated     48795 non-null  int64  
 10  num_owned      48795 non-null  int64  
 11  trading        48795 non-null  int64  
 12  wanting        48795 non-null  int64  
 13  wishing        48795 non-null  int64  
 14  numcomments    48795 non-null  int64  
 15  numweights     48795 non-null  int64  
 16  averageweight  48795 non-null  float64
dtypes: float64(2), int64(15)
memory usage: 6.7 MB


### I look for outliers in the df_main_stats_masked table

In [72]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# the min/max rows are where implausible values show up.
df_main_stats_masked.describe()

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
count,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0,48795.0
mean,142401.013792,1981.290276,1.923496,5.872917,78.791331,55.727513,77.930321,9.239697,6.640524,476.855006,997.909827,25.459391,26.783523,133.950999,118.423076,25.632462,1.730011
std,109703.640873,236.374177,0.719411,15.282816,384.468289,305.18436,384.547079,4.233991,1.087096,2693.970859,4094.283649,74.2558,83.838609,578.764236,484.439348,144.78574,1.118108
min,1.0,-3500.0,0.0,0.0,0.0,0.0,0.0,0.0,1.16208,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25343.5,2004.0,2.0,3.0,20.0,20.0,20.0,8.0,5.93728,20.0,70.0,2.0,1.0,7.0,9.0,1.0,1.0
50%,148556.0,2013.0,2.0,4.0,45.0,30.0,45.0,10.0,6.71875,49.0,192.0,6.0,6.0,21.0,22.0,3.0,1.801
75%,236840.5,2017.0,2.0,6.0,90.0,60.0,90.0,12.0,7.441495,168.0,595.0,20.0,19.0,69.0,66.0,10.0,2.5
max,365104.0,2024.0,15.0,1000.0,60000.0,60000.0,60000.0,17.0,10.0,112718.0,175618.0,2655.0,2008.0,20091.0,19916.0,7809.0,5.0


In [78]:
# Boolean mask: True for games whose minimum playtime exceeds 600.
# NOTE(review): 600 is a hand-picked cut-off; playtimes are presumably in
# minutes (i.e. 10 hours) — confirm against the data source.
mask = df_main_stats_masked['min_playtime'].gt(600)


In [80]:
# Show the games flagged by the long-minimum-playtime mask for inspection.
df_main_stats_masked.loc[mask]

Unnamed: 0,id,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age,average,user_rated,num_owned,trading,wanting,wishing,numcomments,numweights,averageweight
2391,32327,1996,2,6,720,720,720,0,7.09184,49,75,1,3,6,18,11,4.1818
4177,13855,2005,1,2,1800,1800,1800,12,7.42381,312,1220,28,38,164,163,76,3.9868
5954,88190,2012,2,2,2400,1000,2400,12,8.03816,76,429,5,18,83,48,18,4.5000
7101,96026,2014,2,8,720,720,720,14,8.12456,283,596,3,100,288,103,49,4.5510
7540,29691,2007,2,2,810,810,810,0,6.97297,37,279,6,10,21,35,14,3.7857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132041,203752,2017,2,6,1200,900,1200,14,6.35714,14,110,5,2,5,7,0,0.0000
134194,26804,0,2,6,6000,6000,6000,12,3.43750,16,22,0,16,39,31,6,5.0000
135578,283,1992,2,6,2480,2480,2480,12,6.92536,750,1633,101,28,99,376,122,4.5328
135623,329,1985,2,2,1200,1200,1200,12,7.23998,686,1578,56,31,84,293,95,3.3053


# Cleaning Marketplace table

### Clean out all offers above 250 dollars (and offers without a positive price)

I first import the table from our SQL database.

In [55]:
# Load the complete `marketplace_listings` table of the configured schema
# into a DataFrame.
# NOTE(review): sf.get_dataframe is a project helper; presumably it runs the
# query through the project's database engine — confirm in sql_functions.
sql = f'''SELECT * FROM {schema}.marketplace_listings;
      '''
df_marketplace = sf.get_dataframe(sql)

In [57]:
# Peek at the first two marketplace listings.
df_marketplace.head(2)

Unnamed: 0,id,listdate,price,currency,condition,conv_currency,price_in_dollars
0,98443,2018-08-27,50.0,GBP,verygood,1.21,60.5
1,98443,2019-03-25,115.0,EUR,likenew,1.03,118.45


In [59]:
# Column dtypes and non-null counts of the marketplace listings.
df_marketplace.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241557 entries, 0 to 241556
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                241557 non-null  int64         
 1   listdate          241557 non-null  datetime64[ns]
 2   price             241557 non-null  float64       
 3   currency          241557 non-null  object        
 4   condition         241547 non-null  object        
 5   conv_currency     241557 non-null  float64       
 6   price_in_dollars  241557 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 12.9+ MB


In [67]:
# Boolean mask of the offers to KEEP (despite the name): a strictly positive
# price of at most 250 dollars.
# price_in_dollars is a float column, so the upper bound is written as
# `<= 250`; a `< 251` test would still let through offers such as 250.50,
# which are above the 250-dollar limit.
mask_too_exp = (df_marketplace['price_in_dollars'] > 0) & (df_marketplace['price_in_dollars'] <= 250)

In [68]:
# Keep only the listings selected by the price mask (all columns).
df_marketplace_filt = df_marketplace.loc[mask_too_exp]

In [69]:
# Peek at the first two listings remaining after the price filter.
df_marketplace_filt.head(2)

Unnamed: 0,id,listdate,price,currency,condition,conv_currency,price_in_dollars
0,98443,2018-08-27,50.0,GBP,verygood,1.21,60.5
1,98443,2019-03-25,115.0,EUR,likenew,1.03,118.45


In [70]:
# Row count and dtypes after the price filter.
df_marketplace_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238629 entries, 0 to 241556
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                238629 non-null  int64         
 1   listdate          238629 non-null  datetime64[ns]
 2   price             238629 non-null  float64       
 3   currency          238629 non-null  object        
 4   condition         238619 non-null  object        
 5   conv_currency     238629 non-null  float64       
 6   price_in_dollars  238629 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 14.6+ MB


In [71]:
# Non-null entries per column for each item condition. Listings whose
# condition is missing form no group, because groupby drops NaN keys by default.
df_marketplace_filt.groupby(by='condition').count()

Unnamed: 0_level_0,id,listdate,price,currency,conv_currency,price_in_dollars
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
acceptable,3145,3145,3145,3145,3145,3145
good,12767,12767,12767,12767,12767,12767
likenew,56907,56907,56907,56907,56907,56907
new,117682,117682,117682,117682,117682,117682
verygood,48118,48118,48118,48118,48118,48118
