# LXML Parser for BGG XML API

In [13]:
import pandas as pd
from lxml import etree
import Capstone_functions as cf
import os

parser = etree.XMLParser(recover=True)

xtree = etree.parse("data/old_API_id_260200.xml", parser= parser).getroot()


subnodes = [
    'boardgamecategory',
    'boardgamesubdomain',
    'boardgamemechanic',
    'boardgamefamily',
    'boardgameexpansion',
    'boardgamehonor',
    'boardgamedesigner',
    'boardgameartist',
    'boardgamepublisher',
    'boardgamepodcastepisode',
    'boardgameimplementation',
    'videogamebg',
    'statistics',
    'marketplacelistings'
]

In [14]:
df_main = pd.DataFrame()

directory = os.fsencode("data/")
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".xml"): 
        file_path = os.path.join(str(directory)[2:].replace("'",""), filename)
        xtree_temp = etree.parse(file_path, parser= parser).getroot()
        df_main = pd.concat([df_main,cf.df_main(xtree_temp)])

## Data Cleaning df_main

### Transform ids in numbers

In [15]:
df_main["id"] = df_main["id"].astype('int')

In [24]:
df_main['description'] = df_main['description'].str.replace("<br/>"," ")

## Make ID 

In [25]:
df_ids = df_main[["id","description"]]

In [26]:
df_ids

Unnamed: 0,id,description
0,98400,Piclings is a platform game in which players a...
1,98401,The game is as simple as the name. The whole o...
2,98402,The winner of the yearly pastry contest is gua...
3,98403,website blurb:Backswords &amp; Bucklers is a n...
4,98404,Cover headline: Mystery Attackers!Cover art: S...
...,...,...
181,298195,"This is a digital, single-scenario promo item ..."
182,298196,From publisher blurb:The Secret Frequecy Files...
183,298197,From publisher blurb:The Secret Frequecy Files...
184,298198,From publisher blurb:A new sleek and modern la...


### Drop null values

In [27]:
df_main_clean = df_main.dropna(thresh=4)

In [28]:
df_main_clean

Unnamed: 0,id,description,yearpublished,min_players,max_players,playtime,min_playtime,max_playtime,min_age
1,98401,The game is as simple as the name. The whole o...,2010,2,4,30,30,30,10
2,98402,The winner of the yearly pastry contest is gua...,2011,2,5,30,30,30,8
6,98406,Players in this game try to build the best chi...,2011,2,4,30,30,30,10
11,98416,Planet Raiders is an abstract strategy game.Th...,2007,2,4,50,50,50,8
12,98417,Publisher Blurb:Appointed by the Queen of Spad...,2009,2,2,60,60,60,12
...,...,...,...,...,...,...,...,...,...
176,298189,Rhetorical Oracle is a storytelling party card...,2020,3,7,120,30,120,7
177,298190,"WWE: Headlock, Paper, Scissors is a game of st...",2020,3,6,30,30,30,12
178,298191,Alleycat is based on the informal sport of the...,2022,2,5,80,30,80,8
180,298193,"Zev, the great grey wolf, caught little Scarle...",2020,2,6,20,10,20,10


In [29]:
for column in df_main_clean.columns[-7:]:
    df_main_clean[column] = df_main_clean[column].astype('int')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main_clean[column] = df_main_clean[column].astype('int')


In [31]:
df_main_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137153 entries, 1 to 181
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             137153 non-null  int64 
 1   description    137153 non-null  object
 2   yearpublished  137153 non-null  int64 
 3   min_players    137153 non-null  int64 
 4   max_players    137153 non-null  int64 
 5   playtime       137153 non-null  int64 
 6   min_playtime   137153 non-null  int64 
 7   max_playtime   137153 non-null  int64 
 8   min_age        137153 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 10.5+ MB


In [33]:
df_main_clean.to_csv('main_clean.csv')

# Dict with other tables

In [50]:
df_dict = dict()
directory = os.fsencode("data/")
for entrypoint in subnodes:
    df_temp = pd.DataFrame()
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".xml"): 
            file_path = os.path.join(str(directory)[2:].replace("'",""), filename)
            xtree_temp = etree.parse(file_path, parser= parser).getroot()
            df_temp = pd.concat([df_temp,cf.df_subnodes(xtree_temp, entrypoint=entrypoint)])
    df_dict[entrypoint] = df_temp

In [38]:
df_dict.keys()

dict_keys(['boardgamecategory', 'boardgamesubdomain', 'boardgamemechanic', 'boardgamefamily', 'boardgameexpansion', 'boardgamehonor', 'boardgamedesigner', 'boardgameartist', 'boardgamepublisher', 'boardgamepodcastepisode', 'boardgameimplementation', 'videogamebg', 'statistics', 'marketplacelistings'])

### Some early logical cleanings

I remove the hour:minute:seconds from listdate in marketplacetable and store the date in format YYYY-MM-DD

In [37]:
df_dict['marketplacelistings']['listdate'] = df_dict['marketplacelistings']['listdate'].astype('datetime64[D]')

df_dict['marketplacelistings']

Unnamed: 0,id,listdate,price,currency,condition
0,98443,2018-08-27,50.00,GBP,verygood
1,98443,2019-03-25,115.00,EUR,likenew
2,98443,2019-08-25,110.00,EUR,likenew
3,98443,2020-03-06,150.00,USD,verygood
4,98443,2020-12-24,60.00,EUR,verygood
...,...,...,...,...,...
171,298166,2021-08-07,10.00,USD,new
172,298173,2022-01-30,5.00,EUR,verygood
173,298175,2020-08-08,34.00,EUR,new
174,298175,2020-11-03,33.99,EUR,new


# Replace error n fonts in description column.

In [48]:
b_family = df_dict['boardgamefamily']
#b_family['cat_name'].str.split(':',expand=True)

In [44]:
b_family.drop(columns=["cat_name"],inplace=True)

In [49]:
b_family

Unnamed: 0,id,cat_id
0,98402,5841
1,98416,5679
2,98417,16800
3,98417,61979
4,98417,1691
...,...,...
131,298195,8374
132,298195,45610
133,298195,24281
134,298195,25404


In [None]:
b_honor = df_dict['boardgamehonor']

In [None]:
b_honor["year"] = b_honor["cat_name"].str[0:4]
b_honor["cat_name"] = b_honor["cat_name"].str[5:]


In [None]:
b_honor["placement"] = b_honor["cat_name"].str.split().str[-1]

In [None]:
b_honor

In [None]:
b_subdomain

In [None]:
b_implementation