# Early Data-Cleaning BGG Boardgame Data

The first step is to import all modules and set up the XML parser, the database engine, the target schema name and the list of sub-node tables to extract.

In [2]:
import pandas as pd
from lxml import etree
import Capstone_functions as cf
import os
import sql_functions as sf

# Lenient parser: recover=True asks lxml to keep going on broken XML instead of failing.
parser = etree.XMLParser(recover=True)

# Parse one sample dump up front.
# NOTE(review): `xtree` is not referenced by any later cell shown in this notebook —
# presumably a leftover from exploration; confirm before removing.
xtree = etree.parse("data/old_API_id_260200.xml", parser= parser).getroot()

# Database engine supplied by the project's sql_functions helper.
engine = sf.get_engine()

# Schema name passed to every sf.build_table call below.
schema = "BGG_Data"

# Entry points handed one at a time to cf.df_subnodes; each one becomes a key of df_dict.
# The trailing "ok" marks are the author's progress notes (presumably: table checked/loaded).
subnodes = [
    'boardgamecategory', #ok
    'boardgamesubdomain', #ok
    'boardgamemechanic', #ok
    'boardgamefamily', # ok
    'boardgameexpansion', #ok
    'boardgamehonor', #ok
    'boardgamedesigner', #ok
    'boardgameartist', #ok
    'boardgamepublisher', #ok
    'boardgamepodcastepisode',#ok
    'boardgameimplementation', #ok
    'videogamebg',#ok
    'statistics', #ok
    'marketplacelistings' # ok
]

# Progress checklist for the frames that are built outside the subnodes loop:
# df_main ok
# df_poll ok
# df_name
# df_ids ok

In [None]:
# Build df_main by running cf.df_main over every XML dump in data/.
# The per-file frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
directory = "data/"
main_frames = []
# sorted() makes the row order reproducible; os.listdir returns files in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".xml"):
        file_path = os.path.join(directory, filename)
        xtree_temp = etree.parse(file_path, parser=parser).getroot()
        main_frames.append(cf.df_main(xtree_temp))

# An empty data/ folder still yields an empty DataFrame, as before.
df_main = pd.concat(main_frames) if main_frames else pd.DataFrame()

## Data Cleaning df_main

### Convert ids to integers

In [None]:
# Cast the id column to integers through a column -> dtype mapping.
df_main = df_main.astype({"id": 'int'})

In [None]:
# Replace the literal "<br/>" tags in the description text with a single space.
# regex=False pins literal matching: the default of `regex` differs between pandas
# releases, so leaving it implicit makes the call version-dependent.
df_main['description'] = df_main['description'].str.replace("<br/>", " ", regex=False)

## Build the unique-ID table

In [None]:
# df_ids is the "id" column of df_main (a Series, not a DataFrame).
# NOTE(review): nothing de-duplicates the ids here although the target table is
# called "unique_ids" — confirm the ids are already unique in df_main.
df_ids = df_main["id"]

In [None]:
# Show the id column for a quick visual check.
df_ids

In [None]:
# Hand the id column to sf.build_table under the table name "unique_ids".
sf.build_table(
    dataframe=df_ids,
    table_name="unique_ids",
    schema=schema,
    engine=engine,
)

### Drop null values (rows with fewer than 4 non-null fields)

In [None]:
# Keep only the rows of df_main that have at least 4 non-null values (thresh=4).
# NOTE(review): later cells assign into and drop from this derived frame; pandas may
# emit SettingWithCopyWarning there — consider appending .copy() here.
df_main_clean = df_main.dropna(thresh=4)

In [None]:
# Inspect the frame after the sparse rows were dropped.
df_main_clean

In [None]:
# Cast the last seven columns to integers in one astype call.
# astype returns a new frame, so nothing is written column-by-column into the
# dropna-derived frame (which is what triggers pandas' chained-assignment warning).
# NOTE(review): the columns are picked by position (-7:), so this depends on the
# column order produced by cf.df_main — confirm, or list the names explicitly.
int_columns = df_main_clean.columns[-7:]
df_main_clean = df_main_clean.astype({column: 'int' for column in int_columns})


In [None]:
# Remove the free-text description column before the frame is sent to the database.
# Rebinding instead of inplace=True avoids mutating a dropna-derived frame, and
# errors="ignore" lets the cell be re-run after the column is already gone.
df_main_clean = df_main_clean.drop(columns=["description"], errors="ignore")

In [None]:
# Inspect the frame after the integer casts and the description drop.
df_main_clean

In [None]:
# Hand the cleaned main frame to sf.build_table under the table name "main".
sf.build_table(
    dataframe=df_main_clean,
    table_name="main",
    schema=schema,
    engine=engine,
)

# Build a dict with the other tables

In [None]:
# Build one DataFrame per entry point in `subnodes` and store them in df_dict.
# Each XML file is parsed exactly once and then queried for every entry point;
# the previous nesting re-parsed every file once per entry point.
# NOTE(review): this assumes cf.df_subnodes only reads the tree it is given — confirm.
directory = "data/"
frames_by_node = {entrypoint: [] for entrypoint in subnodes}
# sorted() makes the row order reproducible; os.listdir returns files in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".xml"):
        file_path = os.path.join(directory, filename)
        xtree_temp = etree.parse(file_path, parser=parser).getroot()
        for entrypoint in subnodes:
            frames_by_node[entrypoint].append(
                cf.df_subnodes(xtree_temp, entrypoint=entrypoint)
            )

# Concatenate once per entry point; an entry point without any file keeps an
# empty DataFrame, as before.
df_dict = {
    entrypoint: (pd.concat(frames) if frames else pd.DataFrame())
    for entrypoint, frames in frames_by_node.items()
}

In [None]:
# List the entry points for which a frame was built.
df_dict.keys()

# Cleaning Marketplace listings

The time of day (hours:minutes:seconds) is removed from `listdate` in the marketplace listings table, so that only the date (YYYY-MM-DD) is stored.

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
df_market = df_dict['marketplacelistings'].copy()

# Parse listdate and cut it down to the calendar day (time set to 00:00:00).
# astype('datetime64[D]') is avoided: pandas 2.x only supports the s/ms/us/ns
# resolutions and rejects the day unit.
# utc=True gives every value a common timezone before the timezone is dropped, so
# the stored day is the UTC day.
# NOTE(review): assumes listdate strings are either naive or carry a UTC offset — confirm.
df_market['listdate'] = (
    pd.to_datetime(df_market['listdate'], utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)

In [None]:
# Check dtypes and non-null counts after the listdate conversion.
df_market.info()

In [None]:
# Cast the game id to integer and the price to float in one astype call,
# then display the resulting frame.
df_market = df_market.astype({"id": 'int', "price": 'float'})
df_market

In [None]:
# Hand the marketplace frame to sf.build_table under the name "marketplace_listings".
sf.build_table(
    dataframe=df_market,
    table_name="marketplace_listings",
    schema=schema,
    engine=engine,
)

# Cleaning family table

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_family = df_dict['boardgamefamily'].copy()

# Split cat_name at the first ':' into family_type (left part) and family (right part).
# Both parts are stripped: splitting on the bare colon otherwise keeps the
# whitespace that follows it as a leading space in `family`.
# NOTE(review): presumably cat_name looks like "<type>: <name>" — confirm.
family_parts = b_family['cat_name'].str.split(':', n=1, expand=True)
b_family['family_type'] = family_parts[0].str.strip()
b_family['family'] = family_parts[1].str.strip()

In [None]:
# cat_name has been split into family_type / family, so drop the original column.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell re-runnable
# once the column is already gone.
b_family = b_family.drop(columns=["cat_name"], errors="ignore")

In [None]:
# Cast both key columns to integers in a single astype call.
b_family = b_family.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Inspect the family frame after the split and the casts.
b_family

In [None]:
# Check dtypes and non-null counts before the upload.
b_family.info()

In [None]:
# Hand the family frame to sf.build_table under the table name "family".
sf.build_table(
    dataframe=b_family,
    table_name="family",
    schema=schema,
    engine=engine,
)

# Cleaning Honor Table

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_honor = df_dict['boardgamehonor'].copy()

In [None]:
# The first four characters of the honor text become `year`; the text from the
# sixth character onwards stays in cat_name.
# Both slices read from the untouched frame in df_dict, so re-running the cell gives
# the same result; slicing b_honor["cat_name"] in place cut off five more characters
# on every run.
honor_text = df_dict['boardgamehonor']['cat_name']
b_honor["year"] = honor_text.str[0:4]
b_honor["cat_name"] = honor_text.str[5:]


In [None]:
# placement = the last whitespace-separated token of the honor text.
honor_tokens = b_honor["cat_name"].str.split()
b_honor["placement"] = honor_tokens.str[-1]

In [None]:
# Cast the key columns and the extracted year to integers in a single astype call.
b_honor = b_honor.astype({"id": 'int', "cat_id": 'int', "year": 'int'})

In [None]:
# Inspect the honor frame after year / placement extraction and the casts.
b_honor

In [None]:
# Check dtypes and non-null counts before the upload.
b_honor.info()

In [None]:
# Hand the honor frame to sf.build_table under the table name "honor".
sf.build_table(
    dataframe=b_honor,
    table_name="honor",
    schema=schema,
    engine=engine,
)

# Cleaning Boardgame category

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_category = df_dict['boardgamecategory'].copy()

In [None]:
# Inspect the raw category frame before casting.
b_category

In [None]:
# Cast both key columns to integers in a single astype call.
b_category = b_category.astype({"id": 'int', "cat_id": 'int'})


In [None]:
# Check dtypes and non-null counts before the upload.
b_category.info()

In [None]:
# Hand the category frame to sf.build_table under the table name "category".
sf.build_table(
    dataframe=b_category,
    table_name="category",
    schema=schema,
    engine=engine,
)

# Cleaning Boardgame Subdomain

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_subdomain = df_dict['boardgamesubdomain'].copy()

In [None]:
# Remove the word "Games" from the subdomain name.
# regex=False pins literal matching regardless of the pandas version, and the final
# strip removes the blank that removing "Games" leaves behind
# (a name ending in " Games" would otherwise keep a trailing space).
b_subdomain["cat_name"] = (
    b_subdomain["cat_name"]
    .str.replace("Games", "", regex=False)
    .str.strip()
)

In [None]:
# Cast both key columns to integers in a single astype call.
b_subdomain = b_subdomain.astype({"id": 'int', "cat_id": 'int'})


In [None]:
# Check dtypes and non-null counts before the upload.
b_subdomain.info()

In [None]:
# Hand the subdomain frame to sf.build_table under the table name "subdomain".
sf.build_table(
    dataframe=b_subdomain,
    table_name="subdomain",
    schema=schema,
    engine=engine,
)

# Cleaning Mechanics

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_mech = df_dict['boardgamemechanic'].copy() 

In [None]:
# Cast both key columns to integers in a single astype call.
b_mech = b_mech.astype({"id": 'int', "cat_id": 'int'})


In [None]:
# Check dtypes and non-null counts before the upload.
b_mech.info()

In [None]:
# Hand the mechanics frame to sf.build_table under the table name "mechanics".
sf.build_table(
    dataframe=b_mech,
    table_name="mechanics",
    schema=schema,
    engine=engine,
)

# Cleaning Expansions

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_exp = df_dict['boardgameexpansion'].copy() 

In [None]:
# Cast both key columns to integers in a single astype call.
b_exp = b_exp.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Check dtypes and non-null counts before the upload.
b_exp.info()

In [None]:
# Hand the expansions frame to sf.build_table under the table name "expansions".
sf.build_table(
    dataframe=b_exp,
    table_name="expansions",
    schema=schema,
    engine=engine,
)

# Cleaning Boardgame Designer

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_design = df_dict['boardgamedesigner'].copy()  

# Inspect the raw designer frame before casting.
b_design

In [None]:
# Cast both key columns to integers in a single astype call.
b_design = b_design.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Check dtypes and non-null counts before the upload.
b_design.info()

In [None]:
# Hand the designer frame to sf.build_table under the table name "designer".
sf.build_table(
    dataframe=b_design,
    table_name="designer",
    schema=schema,
    engine=engine,
)

# Cleaning Boardgame Artist

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_artist = df_dict['boardgameartist'].copy()   

In [None]:
# Cast both key columns to integers in a single astype call.
b_artist = b_artist.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Hand the artist frame to sf.build_table under the table name "artist".
sf.build_table(
    dataframe=b_artist,
    table_name="artist",
    schema=schema,
    engine=engine,
)

# Cleaning boardgamepublisher

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_publ = df_dict['boardgamepublisher'].copy()  

In [None]:
# Cast both key columns to integers in a single astype call.
b_publ = b_publ.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Hand the publisher frame to sf.build_table under the table name "publisher".
sf.build_table(
    dataframe=b_publ,
    table_name="publisher",
    schema=schema,
    engine=engine,
)

# Cleaning boardgamepodcastepisode

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_podcast = df_dict['boardgamepodcastepisode'].copy()   

In [None]:
# Cast both key columns to integers in a single astype call.
b_podcast = b_podcast.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Hand the podcast frame to sf.build_table under the table name "podcast".
sf.build_table(
    dataframe=b_podcast,
    table_name="podcast",
    schema=schema,
    engine=engine,
)

# Cleaning boardgameimplementation

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_impl = df_dict['boardgameimplementation'].copy()   

In [None]:
# Split cat_name at the first ':' into two new columns: the left part goes to
# `implementation`, the right part to `implementation_type`.
# NOTE(review): if the left part can itself contain ':' the split would land in the
# wrong place — confirm the cat_name format produced by cf.df_subnodes.
impl_parts = b_impl['cat_name'].str.split(':', n=1, expand=True)
b_impl[['implementation', 'implementation_type']] = impl_parts

In [None]:
# cat_name has been split into implementation / implementation_type, so drop it.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell re-runnable
# once the column is already gone.
b_impl = b_impl.drop(columns=["cat_name"], errors="ignore")

In [None]:
# Inspect the implementation frame after the split.
b_impl

In [None]:
# Cast both key columns to integers in a single astype call.
b_impl = b_impl.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Hand the implementation frame to sf.build_table under the name "implementation".
sf.build_table(
    dataframe=b_impl,
    table_name="implementation",
    schema=schema,
    engine=engine,
)

# Cleaning videogamebg

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_videogame = df_dict['videogamebg'].copy()   

In [None]:
# Inspect the raw videogame frame before casting.
b_videogame

In [None]:
# Cast both key columns to integers in a single astype call.
b_videogame = b_videogame.astype({"id": 'int', "cat_id": 'int'})

In [None]:
# Hand the videogame frame to sf.build_table under the table name "videogames".
sf.build_table(
    dataframe=b_videogame,
    table_name="videogames",
    schema=schema,
    engine=engine,
)

# Cleaning Statistics

In [None]:
# Work on a copy so the frame kept in df_dict stays untouched.
b_stats = df_dict['statistics'].copy() 

In [None]:
# Drop every row that has a missing value in any column (dropna defaults).
# NOTE(review): this discards a game's whole statistics row if a single field is
# missing — confirm that is intended.
b_stats = b_stats.dropna()

In [None]:
# Cast all statistics columns in one astype call instead of ten column assignments.
# astype returns a new frame, so nothing is written column-by-column into the
# dropna-derived frame (which is what triggers pandas' chained-assignment warning).
stats_dtypes = {
    "id": 'int',
    "average": 'float',
    "user_rated": 'int',
    "num_owned": 'int',
    "trading": 'int',
    "wanting": 'int',
    "wishing": 'int',
    "numcomments": 'int',
    "numweights": 'int',
    "averageweight": 'float',
}
b_stats = b_stats.astype(stats_dtypes)

In [None]:
# Check dtypes and non-null counts before the upload.
b_stats.info()

In [None]:
# Hand the statistics frame to sf.build_table under the table name "statistics".
sf.build_table(
    dataframe=b_stats,
    table_name="statistics",
    schema=schema,
    engine=engine,
)

# Cleaning df_poll

In [None]:
# Build df_poll by running cf.df_poll over every XML dump in data/.
# The per-file frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
directory = "data/"
poll_frames = []
# sorted() makes the row order reproducible; os.listdir returns files in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".xml"):
        file_path = os.path.join(directory, filename)
        xtree_temp = etree.parse(file_path, parser=parser).getroot()
        poll_frames.append(cf.df_poll(xtree_temp))

# An empty data/ folder still yields an empty DataFrame, as before.
df_poll = pd.concat(poll_frames) if poll_frames else pd.DataFrame()

In [None]:
# Inspect the raw poll frame before casting.
df_poll

In [None]:
# Cast the game id and the vote count to integers in a single astype call.
df_poll = df_poll.astype({"id": 'int', "num_votes": 'int'})

In [None]:
# Check dtypes and non-null counts before the upload.
df_poll.info()

In [None]:
# Hand the poll frame to sf.build_table under the table name "community_poll".
sf.build_table(
    dataframe=df_poll,
    table_name="community_poll",
    schema=schema,
    engine=engine,
)

# Cleaning Names

In [3]:
# Build df_name by running cf.df_subnodes(..., "name") over every XML dump in data/.
# The per-file frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
directory = "data/"
name_frames = []
# sorted() makes the row order reproducible; os.listdir returns files in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".xml"):
        file_path = os.path.join(directory, filename)
        xtree_temp = etree.parse(file_path, parser=parser).getroot()
        name_frames.append(cf.df_subnodes(xtree_temp, "name"))

# An empty data/ folder still yields an empty DataFrame, as before.
df_name = pd.concat(name_frames) if name_frames else pd.DataFrame()

In [4]:
# Cast the id column to integers through a column -> dtype mapping.
df_name = df_name.astype({"id": 'int'})

In [5]:
# Check dtypes and non-null counts before the upload.
df_name.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 433515 entries, 0 to 238
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       433515 non-null  int64 
 1   name     433514 non-null  object
 2   primary  433515 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 13.2+ MB


In [6]:
# Inspect the names frame after the id cast.
df_name

Unnamed: 0,id,name,primary
0,98400,Piclings,1
1,98401,UNLOCKED,1
2,98402,Le Pâtissier,1
3,98403,Backswords & Bucklers: Book One – Basic Rules,1
4,98404,BattleTechnology (Issue 13 - Year-End 3049),1
...,...,...,...
234,298195,Gloomhaven: Return of the Lost Cabal (Promo Sc...,1
235,298196,The Secret Frequency Files,1
236,298197,Ascension's Landscape: Setting Refinements and...,1
237,298198,Bordeaux: The Court,1


In [7]:
# Hand the names frame to sf.build_table under the table name "names".
sf.build_table(
    dataframe=df_name,
    table_name="names",
    schema=schema,
    engine=engine,
)

The names table was imported successfully.
