## Dependencias

In [45]:
import numpy as np
import pandas as pd
import json #JSON = Java Script Object Notation
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,FLOAT,INTEGER,DATE,CHAR,DATETIME
import os

pd.set_option('display.max_columns',None)

## Credenciales

In [46]:
creds = json.load(open('credenciales_local.json','rb'))

## Crear conexión a base de datos

In [47]:
url = f"mysql+pymysql://{creds['user']}:{creds['password']}@{creds['servidor']}/vg"
cnx = create_engine(url,encoding='utf8')
cnx = cnx.connect()

In [48]:
cnx.closed

False

## Lectura y limpieza de datos

In [49]:
ruta = '/home/jose/Documentos/bd/video juegos/vgsales.csv'

In [50]:
datos = pd.read_csv(ruta)

In [51]:
datos.shape

(16598, 11)

In [52]:
datos.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [53]:
datos = datos.groupby(['Name','Platform','Genre','Publisher']).agg({'Year':'max',
                                        'NA_Sales':'sum',
                                        'EU_Sales':'sum',
                                        'JP_Sales':'sum',
                                        'Other_Sales':'sum'
                                       }).reset_index()


## Separación de Entidades

### Entidad Plataforma

In [54]:
entPlataforma = datos[['Platform']].drop_duplicates().reset_index(drop=True)
entPlataforma.columns = ['platform']
entPlataforma.insert(0,'platformId',entPlataforma.index+1)
entPlataforma

Unnamed: 0,platformId,platform
0,1,PS
1,2,PS2
2,3,PSP
3,4,PS3
4,5,DS
5,6,PC
6,7,Wii
7,8,X360
8,9,N64
9,10,3DS


In [55]:
entPlataforma['platform'].map(len).max()

4

In [56]:
entPlataforma.to_sql(con=cnx,
                   name='platform',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'platformId':INTEGER,
                          'platform':VARCHAR(4)})

31

### Entidad Género

In [57]:
entGenero = datos[['Genre']].drop_duplicates().reset_index(drop=True)
entGenero.columns = ['genre']
entGenero.insert(0,'genreId',entGenero.index+1)
entGenero

Unnamed: 0,genreId,genre
0,1,Sports
1,2,Role-Playing
2,3,Action
3,4,Racing
4,5,Shooter
5,6,Misc
6,7,Adventure
7,8,Puzzle
8,9,Simulation
9,10,Platform


In [58]:
entGenero['genre'].map(len).max()

12

In [59]:
entGenero.to_sql(con=cnx,
                   name='genre',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'genreId':INTEGER,
                          'genre':VARCHAR(12)})

12

### Entidad Editor

In [60]:
entEditor = datos[['Publisher']].drop_duplicates().reset_index(drop=True)
entEditor.columns = ['publisher']
entEditor.insert(0,'publisherId',entEditor.index+1)
entEditor

Unnamed: 0,publisherId,publisher
0,1,Magical Company
1,2,Namco Bandai Games
2,3,Atari
3,4,Electronic Arts
4,5,Activision
...,...,...
573,574,Aria
574,575,responDESIGN
575,576,Karin Entertainment
576,577,Rebellion Developments


In [61]:
entEditor['publisher'].map(len).max()

38

In [62]:
entEditor.to_sql(con=cnx,
                   name='publisher',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'publisherId':INTEGER,
                          'publisher':VARCHAR(38)})

578

### Entidad Región

In [63]:
entRegion = pd.Series(dict(zip(range(1,5),['NA','EU','JP','OT']))).to_frame()
entRegion.columns = ['region']
entRegion.insert(0,'regionId',entRegion.index)
entRegion

Unnamed: 0,regionId,region
1,1,
2,2,EU
3,3,JP
4,4,OT


In [64]:
entRegion.to_sql(con=cnx,
                   name='region',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'regionId':INTEGER,
                          'region':CHAR(2)})

4

In [65]:
datos.head()

Unnamed: 0,Name,Platform,Genre,Publisher,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,'98 Koshien,PS,Sports,Magical Company,1998.0,0.15,0.1,0.12,0.03
1,.hack//G.U. Vol.1//Rebirth,PS2,Role-Playing,Namco Bandai Games,2006.0,0.0,0.0,0.17,0.0
2,.hack//G.U. Vol.2//Reminisce,PS2,Role-Playing,Namco Bandai Games,2006.0,0.11,0.09,0.0,0.03
3,.hack//G.U. Vol.2//Reminisce (jp sales),PS2,Role-Playing,Namco Bandai Games,2006.0,0.0,0.0,0.16,0.0
4,.hack//G.U. Vol.3//Redemption,PS2,Role-Playing,Namco Bandai Games,2007.0,0.0,0.0,0.17,0.0


### Entidad Juego

In [66]:
entJuego = datos[['Name','Platform','Genre','Publisher']].drop_duplicates().reset_index(drop=True)
entJuego = entJuego.merge(entPlataforma,left_on='Platform',right_on='platform',how='inner')
entJuego = entJuego.merge(entGenero,left_on='Genre',right_on='genre',how='inner')
entJuego = entJuego.merge(entEditor,left_on='Publisher',right_on='publisher',how='inner')
entJuego.drop(['Platform','Genre','Publisher','genre','platform','publisher'],axis=1,inplace=True)
entJuego.insert(0,'gameId',entJuego.index+1)
entJuego.rename(columns={'Name':'name'},inplace=True)
entJuego

Unnamed: 0,gameId,name,platformId,genreId,publisherId
0,1,'98 Koshien,1,1,1
1,2,2Xtreme,1,1,22
2,3,Cool Boarders,1,1,22
3,4,Cool Boarders 2001,1,1,22
4,5,ESPN Extreme Games,1,1,22
...,...,...,...,...,...
16531,16532,Wrestle Angels: Survivor 2,2,12,571
16532,16533,Hajime no Ippo Portable: Victorious Spirits,3,12,364
16533,16534,Deadliest Warrior: Ancient Combat,4,12,281
16534,16535,Deadliest Warrior: Ancient Combat,8,12,281


In [67]:
entJuego['name'].map(len).max()

132

In [68]:
entJuego.to_sql(con=cnx,
                   name='game',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'gameId':INTEGER,
                          'name':VARCHAR(132),
                          'platformId':INTEGER,
                          'genreId':INTEGER,
                          'publisherId':INTEGER
                         })

16536

### Entidad Ventas

In [84]:
entVentas = datos.merge(entJuego[['name','gameId']],left_on='Name',right_on='name')
entVentas = entVentas[['gameId','NA_Sales','EU_Sales','JP_Sales','Other_Sales']]
entVentas = entVentas.melt(value_vars=['NA_Sales','EU_Sales','JP_Sales','Other_Sales'],id_vars='gameId')
entVentas['region'] = entVentas['variable'].map(lambda x:x[:2].upper())
entVentas = entVentas.merge(entRegion,on='region',how='inner')
entVentas = entVentas[['gameId','regionId','value']].rename(columns={'value':'sales'})
entVentas.insert(0,'salesId',entVentas.index+1)
entVentas

Unnamed: 0,salesId,gameId,regionId,sales
0,1,1,1,0.15
1,2,1921,1,0.00
2,3,1922,1,0.11
3,4,1923,1,0.00
4,5,1924,1,0.00
...,...,...,...,...
140363,140364,3922,4,0.02
140364,140365,3928,4,0.02
140365,140366,3922,4,0.00
140366,140367,3928,4,0.00


In [85]:
entVentas.to_sql(con=cnx,
                   name='sales',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'salesId':INTEGER,
                           'gameId':INTEGER,
                          'regionId':INTEGER,
                          'sales':FLOAT
                         })

140368