In [1]:
import pandas as pd

In [2]:
# Read every raw table from disk: six v1 CSV exports plus a separate platforms
# dataset. Files are read in the order listed and bound, position by position,
# to the names on the left.
(
    data_runs,
    data_games,
    data_categories,
    data_leaderboards,
    data_levels,
    data_users,
    data_platforms,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/v1/runs-data.csv',
        'data/v1/games-data.csv',
        'data/v1/categories-data.csv',
        'data/v1/leaderboards-data.csv',
        'data/v1/levels-data.csv',
        'data/v1/users-data.csv',
        'data/other_dataset_with_platforms.csv',
    )
)



In [3]:
# Restrict the categories table to rows whose 'type' is 'per-game'
is_per_game = data_categories['type'].eq('per-game')
data_categories = data_categories.loc[is_per_game]

In [4]:
# Show the filtered categories frame as this cell's output
data_categories

Unnamed: 0,parentGameID,ID,name,rules,type,numPlayers
0,j1n8nj91,8249gled,Moose's Simulator,"# Version\r\nMichael \""Moose\"" O'Malley's simu...",per-game,1
1,j1n8nj91,9d8n4owd,Kevin's Simulator,# Version\r\nKevin Karstens's simulator releas...,per-game,1
2,j1n8nj91,vdon14vk,Original hardware,,per-game,1
3,ldejl7j1,w20m70zk,Any%,The run starts upon entering the dungeon for t...,per-game,1
4,9d389y91,vdopjgy2,Beat The Game,## Main Goal\r\n* The player must make it to t...,per-game,1
...,...,...,...,...,...,...
147953,j1n48ll6,zdn3gpx2,Hard,,per-game,1
147954,j1n48ll6,w204lxvk,Hints,,per-game,1
147955,j1n48ll6,jdr0y6ld,Dies%,,per-game,1
147956,j1n48ll6,wk6gleqd,Enemy Death%,,per-game,1


In [5]:
# Reduce data_platforms to its id and name columns, then collapse repeated
# (platform_id, platform_name) rows into a single row each
data_platforms = data_platforms.loc[:, ["platform_id", "platform_name"]].drop_duplicates()

In [6]:
# Build one wide frame by left-joining each lookup table onto the runs, in the
# order: games, categories, levels, users, leaderboards, platforms.
# Every step uses how='left', so rows of the frame built so far are kept even
# when the right-hand table has no match (its columns are then NaN). A
# right-hand column whose name collides with an existing one receives the
# suffix given for that table; existing columns keep their names.
# NOTE(review): a left join repeats a left row once per matching right row, so
# the row count can grow if a join key is not unique on the right - TODO confirm.
data = (
    data_runs
    .merge(data_games, how='left', left_on='gameID', right_on='ID', suffixes=('', '_game'))
    .merge(data_categories, how='left', left_on='categoryID', right_on='ID', suffixes=('', '_category'))
    .merge(data_levels, how='left', left_on='levelID', right_on='ID', suffixes=('', '_level'))
    .merge(data_users, how='left', left_on='players', right_on='ID', suffixes=('', '_user'))
    .merge(data_leaderboards, how='left', left_on='ID', right_on='runID', suffixes=('', '_leaderboard'))
    .merge(data_platforms, how='left', left_on='platform', right_on='platform_id', suffixes=('', '_plateform'))
)



In [7]:
# Unbind the individual source tables now that they are merged into `data`,
# so the memory they hold can be reclaimed
del (
    data_runs,
    data_games,
    data_categories,
    data_leaderboards,
    data_levels,
    data_users,
    data_platforms,
)


In [8]:
# Discard every run whose status is anything other than 'verified'
verified_mask = data['status'].eq('verified')
data = data.loc[verified_mask]


In [9]:
# Drop every column whose name contains '_id' or 'ID' (case-sensitive substring match)
id_markers = ('_id', 'ID')
kept_columns = [
    column
    for column in data.columns
    if not any(marker in column for marker in id_markers)
]
data = data[kept_columns]

# Remove rows where 'type' or 'location' is NaN.
# A missing 'type' is how the original author excludes per-level data
# (presumably runs whose category did not survive the per-game filter - TODO confirm).
data = data.dropna(subset=['type', 'location'])

In [10]:
# Remaining columns that are not wanted in the final dataset
unused_columns = [
    "platform",
    "players",
    "examiner",
    "values",
    "status",
    "statusReason",
    "verifiedDate",
    "URL",
    "createdDate",
    "numCategories",
    "numLevels",
    "rules",
    "type",
    "numPlayers",
    "name_level",
    "rules_level",
    "signupDate",
    "platform_leaderboard",
    "emulated_leaderboard",
    "players_leaderboard",
    "examiner_leaderboard",
    "verifiedDate_leaderboard",
    "variablesAndValues",
    "date_leaderboard",
    "primaryTime_leaderboard",
    "place",
    "numRuns",
]
# Raises KeyError if any listed column is absent, exactly like drop(columns=...)
data = data.drop(labels=unused_columns, axis="columns")

In [11]:
# Keep only fully populated rows: a row with a missing value in any
# remaining column is removed
data = data.dropna(axis="index", how="any")

In [12]:
# Print how many rows remain, then show the first five as the cell output
row_count = data.shape[0]
print(row_count)
data.head(5)


1821316


Unnamed: 0,date,primaryTime,emulated,name,releaseDate,name_category,name_user,location,platform_name
8,2021-12-22,449.533,False,RE:RUN,2020-08-08,Any%,quebecpower,ca/qc,PC
9,2019-02-09,194.0,False,Time Warpers,2018-11-02,Zones 1-100,_zuR,br,PC
11,2014-11-17,857.0,True,Kirby's Dream Land,1992-04-27,Normal Mode,SapphireYoshi,us,Game Boy
12,2014-11-17,3062.0,False,Kirby: Nightmare in Dream Land,2002-10-25,Any%,SapphireYoshi,us,Game Boy Advance
13,2014-11-17,2454.0,False,Kirby & The Amazing Mirror,2004-04-15,Any%,SapphireYoshi,us,Game Boy Advance


In [14]:
# Write the data to feather.
# pandas documents DataFrame.to_feather as requiring a default index, and the
# row filtering earlier in this notebook leaves gaps in the row labels
# (e.g. 8, 9, 11, ...), which makes older pandas versions raise ValueError.
# Write a copy with a fresh 0..n-1 index instead; the old labels carry no
# information and `data` itself is left unchanged.
data.reset_index(drop=True).to_feather('data/temp/data.feather')