## Dependencias

In [1]:
import pandas as pd
import json #JSON = Java Script Object Notation
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,FLOAT,INTEGER,DATE,CHAR,DATETIME
import os

pd.set_option('display.max_columns',None)

## Credenciales

In [2]:
creds = json.load(open('credenciales_local.json','rb'))

## Crear conexión a base de datos

In [4]:
url = f"mysql+pymysql://{creds['user']}:{creds['password']}@{creds['servidor']}/retail"
cnx = create_engine(url,encoding='utf8')
cnx = cnx.connect()

In [5]:
cnx.closed

False

## Lectura y limpieza de datos

In [9]:
datos = pd.read_excel('/home/jose/Documentos/bd/retail/Online Retail.xlsx')

In [11]:
datos.shape

(541909, 8)

In [12]:
datos.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Separación de Entidades

### Entidad Customer

In [98]:
entCustomer = datos[['CustomerID']].drop_duplicates().dropna().reset_index(drop=True)
entCustomer['CustomerID'] = entCustomer['CustomerID'].astype(int)

In [99]:
entCustomer.to_sql(con=cnx,
                   name='Customer',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'CustomerID':INTEGER})

4372

### Entidad Product

In [94]:
entProducto = datos[['StockCode','Description']].drop_duplicates().dropna().reset_index(drop=True).astype(str)
entProducto['StockCode'] = entProducto['StockCode'].map(str.upper)
entProducto = entProducto.groupby('StockCode').first().reset_index()
entProducto.to_sql(con=cnx,
                   name='Product',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'StockCode':VARCHAR(12),'Description':VARCHAR(35)})

3848

In [95]:
entProducto.loc[entProducto['StockCode']=='84509A']

Unnamed: 0,StockCode,Description
2957,84509A,SET OF 4 ENGLISH ROSE PLACEMATS


### Entidad Invoice

In [96]:
entInvoice = datos[['InvoiceNo','InvoiceDate','CustomerID']].drop_duplicates().dropna().copy().reset_index(drop=True)
entInvoice = entInvoice.sort_values(by=['InvoiceNo','InvoiceDate']).groupby('InvoiceNo').first().reset_index()
entInvoice.to_sql(con=cnx,
                   name='Invoice',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'InvoiceNo':CHAR(7),
                           'InvoiceDate':DATETIME,
                           'CustomerID':INTEGER
                         }
             )

22190

### Entidad Transaction

In [149]:
entTxn = datos[['InvoiceNo','StockCode','Quantity','UnitPrice','Country']].copy().dropna().reset_index(drop=True)
entTxn.insert(0,'TxnID', entTxn.index+1) 
entTxn['StockCode'] = entTxn['StockCode'].map(str).map(str.upper)
entTxn = entTxn.merge(entInvoice,on='InvoiceNo',how='inner')
print(entTxn.shape)
entTxn = entTxn.merge(entProducto,on='StockCode',how='inner')
print(entTxn.shape)
entTxn.dropna().to_sql(con=cnx,
                   name='Transaction',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'TxnID':INTEGER,
                           'InvoiceNo':CHAR(7),
                           'StockCode':VARCHAR(12),
                           'Quantity':INTEGER,
                          'UnitPrice':FLOAT,
                          'Country':VARCHAR(20)
                         }
             )

(406829, 8)
(406829, 9)


406829

In [147]:
entInvoice.loc[entInvoice['InvoiceNo']=='536414']

Unnamed: 0,InvoiceNo,InvoiceDate,CustomerID
