# Goal is to create a table that connects unique BGG ids and KS slugs

### I get the tables from the Database

In [1]:
import pandas as pd
import sql_functions as sf
import Capstone_functions as cp
from IPython.display import clear_output

engine = sf.get_engine()
 
schema = "bgg_data"

In [8]:
sql = f'''SELECT * FROM {schema}.kickstarter_bgg;
      '''
df_ks_bgg = sf.get_dataframe(sql)

In [2]:
sql = f'''SELECT * FROM {schema}.publisher;
      '''
df_publ = sf.get_dataframe(sql)

In [30]:
sql = f'''SELECT * FROM {schema}.unique_publisher;
      '''
df_upubl = sf.get_dataframe(sql)

In [3]:
sql = f'''SELECT * FROM {schema}.designer;
      '''
df_designer = sf.get_dataframe(sql)

In [26]:
sql = f'''SELECT * FROM {schema}.unique_designer;
      '''
df_udesigner = sf.get_dataframe(sql)

### I merge again the publisher and designer tables to uniques to get publisher and designer names

In [31]:
df_publisher_complete = pd.merge(df_publ,df_upubl,on='publisher_id')
df_designer_complete = pd.merge(df_designer,df_udesigner,on='designer_id')

In [32]:
df_designer_complete

Unnamed: 0,id,designer_id,designer
0,98401,42353,Sean Howard
1,98377,42353,Sean Howard
2,203170,42353,Sean Howard
3,157435,42353,Sean Howard
4,98868,42353,Sean Howard
...,...,...,...
147944,298137,123647,Elliot Kruszynski
147945,298176,123548,David M Palmquist
147946,298176,123549,Joyce Stallfort Williamson
147947,298191,123709,Carlos Guevara Moscol


In [33]:
df_publisher_designer_complete = pd.merge(df_publisher_complete, df_designer_complete, on='id',how='outer')

### Here I drop some columns, that I don't need for this cleaning task

In [34]:
df_publisher_designer_complete.drop(['publisher_id','designer_id'],inplace=True,axis=1)

In [35]:
df_publisher_designer_complete

Unnamed: 0,id,publisher_name,designer
0,98401,"The Game Crafter, LLC",Sean Howard
1,98401,Knight's Crest Games,Sean Howard
2,173051,"The Game Crafter, LLC",Michael Cofer
3,173120,"The Game Crafter, LLC",Topias Uotila
4,173120,(Web published),Topias Uotila
...,...,...,...
267493,333565,,Ryan Hitt
267494,302660,,Leonice Brown-Young Jr
267495,255206,,Estefania Rodriguez
267496,60517,,Peter Keuls


## BGG-ids and KS-slugs

I first identifiy the unique relationship between BGG ids and KS slugs

In [36]:
df_ks_bgg_unique_slugs = df_ks_bgg[['slug','bgg_id']].drop_duplicates()

In [37]:
df_ks_bgg_unique_slugs # I get 5322 rows

Unnamed: 0,slug,bgg_id
0,relic,128442
1,stormsunder-heirs-of-ruin,302679
2,crazy-taco,289397
3,fray,303430
4,doomsday-bots,253683
...,...,...
8807,roll-camera-the-filmmaking-board-game,298102
8810,dark-venture-battle-of-the-ancients,298163
8811,dilemma,244239
8813,rhetorical-oracle-0,298189


Using the groupby attribute I count how many ids has each unique slug. The goal is to assign to a slug only one BGG id.

In [39]:
df_slug_count = df_ks_bgg_unique_slugs.groupby('slug').count()

In [40]:
df_slug_count.sort_values('bgg_id',ascending=False) # There are some KS slugs with more BGG id

Unnamed: 0_level_0,bgg_id
slug,Unnamed: 1_level_1
robin-hood,14
speculation,10
totem-1,10
jackpot-1,9
treasure-island-1,9
...,...
fossilis,1
foundations,1
foundations-of-rome,1
founders-of-gloomhaven,1


## Filter slugs with more than 1 BGG Id

In [41]:
mask = df_slug_count['bgg_id'] > 1

In [42]:
df_slug_count_filt = df_slug_count.loc[mask,:]

In [43]:
df_slug_count_filt # 325 slugs have more then one BGG id

Unnamed: 0_level_0,bgg_id
slug,Unnamed: 1_level_1
2-kings-relaunch,2
2kings,2
4-letter-words,2
9-lives,2
adorable-monsters,2
...,...
witchs-brew,2
wordos,2
world-cup,4
xerxes,2


In [44]:
df_slug_count_filt = df_slug_count_filt.reset_index()

In [45]:
slugs_to_control = df_slug_count_filt['slug']

I create a Pandas Series with only the slugs name that have more than one BGG Id assigned.

In [46]:
slugs_to_control

0       2-kings-relaunch
1                 2kings
2         4-letter-words
3                9-lives
4      adorable-monsters
             ...        
320          witchs-brew
321               wordos
322            world-cup
323               xerxes
324              yucatan
Name: slug, Length: 325, dtype: object

## Manually check the multiple BGG ids for each KS slug

I have to create first a dataframe that connects the KS values to BGG publishers and designers

In [64]:
df_ks_creator = pd.merge(df_ks_bgg,df_publisher_designer_complete,left_on='bgg_id',right_on='id')

In [65]:
df_ks_creator_filter = df_ks_creator[['bgg_id','game_name','slug','creator_name','publisher_name','designer']] # I need only some columns

I iterate through the list of KS slugs that have more than one BGG id assigned.
For each step I show the dataframe that connect the KS slugs to BGG id, publisher and designers.
You have to enter the row id for the correct BGG id.
There results will be stored in a dictionary with KS slug as key and selected BGG id as value

In [148]:
slug_iter = iter(slugs_to_control)
i = 0
choice = ""
cleaning_dict = dict()

while choice != "exit": 
    mask_check = df_ks_creator_filter["slug"].isin([next(slug_iter)])
    display(df_ks_creator_filter.loc[mask_check, :].reset_index())
    
    choice = input("select the row with the correct BGG ID")
    if choice.isnumeric():
        cleaning_dict[next(slug_iter)] = df_ks_creator_filter.loc[mask_check, :].iat[int(choice),0] 

    clear_output(wait=True)    

StopIteration: 

### Here are the saved dictionary with the saved manual BGG ids/KS Slugs pairs

In [None]:
cleaning_dict_save = {
 '2kings': 319727,
 '9-lives': 267618,
 'adorable-monsters-0': 231388,
 'aftermath-5': 231388,
 'alibi': 209538,
 'aurora-2': 134567,
 'baby-blues': 348554,
 'battle-of-thermopylae': 204003,
 'bingo-dice': 217430,
 'black-swan-0': 301946,
 'cabal': 255928,
 'cancan': 284646,
 'cash': 290236,
 'catacombs-0': 150485,
 'charge': 331747,
 'conan': 264982,
 'conspiracy-theory': 176306,
 'crack-the-code': 197944,
 'crimes-in-history-h-h-holmes-murder-castle': 266937,
 'cube-0': 250442,
 'cursed-0': 329002,
 'dear-leader': 98443,
 'dice-towers': 173059,
 'dilemma': 325491,
 'dive-1': 221194,
 'domination': 158435,
 'dragons-hoard': 157083,
 'dungeon-escape': 310636,
 'dynamite': 193483,
 'elevator-0': 191199,
 'endangered-1': 220133,
 'exploration': 155703,
 'face-to-face': 233673,
 'filibuster-0': 229836,
 'four-2': 184672,
 'freedom-4': 348303,
 'galaxy-command': 197443,
 'giant-0': 282475,
 'give-and-take-0': 254498,
 'glory-0': 229714,
 'glyph-1': 226835,
 'greed': 317118,
 'gulp': 256317,
 'hammers-and-anvils': 290795,
 'high-noon-reloaded': 274442,
 'historia': 254888,
 'hyperspace': 318185,
 'import-export': 348602,
 'inferno-4': 207310,
 'jabberwocky-0': 136385,
 'jungla': 135835,
 'kaos-0': 98350,
 'lancer': 288605,
 'leviathan': 244506,
 'maharaja': 202667,
 'matches': 148074,
 'medieval': 46,
 'megalomania': 322549,
 'monkey-business': 165020,
 'monster-combat': 152868,
 'monster-mania': 236885,
 'monster-trap': 143520,
 'napoleon-0': 260228,
 'on-the-rocks-3': 286294,
 'onslaught-0': 269418,
 'overlord-a-boss-monster-adventure': 245060,
 'papillon': 216013,
 'paris-0': 241066,
 'pirate-attack-2': 198077,
 'planetarium': 309665,
 'pocket-landship': 232201,
 'prism-1': 249500,
 'project-elite': 145496,
 'quarantine-0': 256999,
 'realm-0': 312959,
 'regency': 320294,
 'reload': 307002,
 'ritual': 170604,
 'royalty-0': 258137,
 'rumble-0': 276856,
 'russian-and-roulette-the-edge-of-your-seat-dice-ga': 118315,
 'sakura': 252854,
 'serpents': 295126,
 'shelter-0': 274157,
 'shiver-me-timbers-0': 239175,
 'snapshot-0': 176071,
 'space-time': 175510,
 'spin-2': 1036,
 'starlight-2': 193577,
 'super-chess': 184439,
 'sweet-tooth': 336120,
 'the-battle-of-armageddon': 236250,
 'the-brain-game': 226445,
 'the-great-debate': 159492,
 'the-marble-game': 173105,
 'the-road-a-solo-card-game-of-survival': 229223,
 'the-wonderful-wizard-of-oz': 313016,
 'this-war-without-an-enemy': 30869,
 'titan-0': 250725,
 'tontine-0': 188301,
 'tortuga-2199': 153939,
 'treasure-hunter': 316335,
 'vabanque': 179172,
 'victory-in-europe': 145872,
 'virus-2': 265369,
 'werewolf': 262543,
 'wild-cards': 230769,
 'yucatan': 259501
}

In [47]:
bggidks_slugs_manual = cleaning_dict.copy()

NameError: name 'cleaning_dict' is not defined

In [48]:
bggidks_slugs_manual

NameError: name 'bggidks_slugs_manual' is not defined

## Concat the dictionary values with the KS-slugs and BGG-ids table

In [49]:
mask = ~df_ks_bgg_unique_slugs['slug'].isin(slugs_to_control) # ~ invert the bool values

df_ks_bgg_unique_slugs_only = df_ks_bgg_unique_slugs.loc[mask,:]

In [50]:
df_ks_bgg_unique_slugs_only


Unnamed: 0,slug,bgg_id
0,relic,128442
1,stormsunder-heirs-of-ruin,302679
2,crazy-taco,289397
3,fray,303430
4,doomsday-bots,253683
...,...,...
8802,coalitions,331317
8807,roll-camera-the-filmmaking-board-game,298102
8810,dark-venture-battle-of-the-ancients,298163
8813,rhetorical-oracle-0,298189


In [51]:
df_slugs_bgg_manual = pd.DataFrame([bggidks_slugs_manual]).T



df_slugs_bgg_manual = df_slugs_bgg_manual.reset_index()


df_slugs_bgg_manual.columns = ['slug','bgg_id']



df_slug_bgg_last = pd.concat([df_slugs_bgg_manual,df_ks_bgg_unique_slugs_only],axis=0)

NameError: name 'bggidks_slugs_manual' is not defined

I have successfully recovered more then 100 KS slugs

### Put the table on SQL server

In [204]:
sf.build_table(engine=engine,table_name="unique-slug-bgg-id",dataframe=df_slug_bgg_last,schema=schema)

The unique-slug-bgg-id table was imported successfully.


In [52]:
sql = f'''SELECT * FROM {schema}.unique_slug_bgg_id;
      '''
      

In [53]:
df = sf.get_dataframe(sql)

## There are some BGG ids that are assigned to more Slugs

I want to count how much BGG ids have more than 1 KS slug assigned.

In [88]:
df_too_much_slugs = df.groupby('bgg_id').count().sort_values('slug',ascending=False)

In [91]:
df_too_much_slugs = df_too_much_slugs.reset_index()

In [100]:
df_too_much_slugs

Unnamed: 0,bgg_id,slug
0,331754,3
1,241829,2
2,263665,2
3,319271,2
4,224594,2
...,...,...
4400,208543,1
4401,208568,1
4402,208569,1
4403,208700,1


In [106]:
mask = df_too_much_slugs['slug'] > 1
too_much_slugs = df_too_much_slugs.loc[mask,'bgg_id']

too_much_slugs.info()


<class 'pandas.core.series.Series'>
Int64Index: 120 entries, 0 to 119
Series name: bgg_id
Non-Null Count  Dtype
--------------  -----
120 non-null    int64
dtypes: int64(1)
memory usage: 1.9 KB


I iterate through the list with BGG ids with more then I slug to see in detail why it is happening.

In [112]:
slug_iter = iter(too_much_slugs)
choice = ""

while choice != "exit": 
    choice = input("select the row with the correct BGG ID")
    if choice == '':
        mask_check = df_ks_bgg["bgg_id"].isin([next(slug_iter)])
        display(df_ks_bgg.loc[mask_check, :].reset_index())
    
    # bellum-0 = 217430 -- added manually in SQL table

    # adorable-monsters = 241279 added manually in SQL table


    clear_output(wait=True)    

StopIteration: 

There are some creators that start more Kickstarter campaigns with the same game and they usually do it because the first one fails, so they start again with a lower goal.

## Remove manual check double BGG IDs

Some games were automatically assigned to th fake BGG id. I need to delete the connection. I have done this directly on DBeaver.

In [113]:
    # captains-log-0 __ ID has to be cancelled
    # alternate-souls-arena-card-game-0 __ ID has to be cancelled
    # maquis-0 __ ID has to be cancelled
    # king-of-indecision __ ID has to be cancelled
    # the_outlaws-0 __ ID has to be cancelled
    # monkeys-on-your-back-0 __ ID has to be cancelled
    # champions-of-hara - ID has to be cancelled
