In [1]:
import pandas as pd
import glob
import hashlib
import sqlite3
import os
from faker import Faker
import random

fake = Faker()

Unire tutti i dati nei CSV files dentro `data` in un singolo Pandas dataframe in memoria.

In [2]:
db_filename = '3nf_schema_example.db'
if os.path.exists(db_filename):
    os.remove(db_filename)
cnx = sqlite3.connect(db_filename)
dfs = []
for csv_f in glob.glob("data/*.csv"):
    dfs.append(pd.read_csv(csv_f, index_col=None, header=0))

df = pd.concat(dfs, axis=0, ignore_index=True)
df.dropna(how="any", inplace=True)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,Sehr kosmisch,200,1542240000000.0,26.0
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,The Big Gundown,200,1542240000000.0,26.0
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,Marry Me,200,1542240000000.0,26.0
5,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540490000000.0,597,Blackbird,200,1542250000000.0,61.0
9,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1540790000000.0,602,Best Of Both Worlds (Remastered Album Version),200,1542260000000.0,80.0


Arricchico il dataframe con dati sintetici per avere una demo piu' corposa.

In [3]:
df["song_artist"]=df["song"]+" - "+df["artist"]
df["song_id"] = df["song_artist"].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())
df.drop(["song_artist"],axis=1,inplace=True)
df["company_name"] = [fake.company() for _ in range(df.shape[0])]
df["company_address"] = [fake.street_address() for _ in range(df.shape[0])]
df["company_suffix"] = [fake.company_suffix() for _ in range(df.shape[0])]
df["companyName_Suffix"] = df["company_name"]+" - "+df["company_suffix"]
df["company_id"] = df["companyName_Suffix"].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())
df.drop(["companyName_Suffix"],axis=1,inplace=True)
df["length_class"] = ['long' if x >= 232.972605 else 'short' for x in df['length']]
df["file_format"] = [random.choice(['mp3','flac','wav']) for _ in range(df.shape[0])]
df["song_artist_format"]=df["song"]+" - "+df["artist"]+" - "+df["file_format"]
df["meta_id"] = df["song_artist_format"].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())
df.drop(["song_artist_format"],axis=1,inplace=True)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,...,ts,userId,song_id,company_name,company_address,company_suffix,company_id,length_class,file_format,meta_id
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,...,1542240000000.0,26.0,eeac062e56a7e543d7d6e517eab89560,Cooper-Sanchez,617 Sarah Mountain Apt. 375,Ltd,d68972f967fe77d09c853cde410d0e7c,long,wav,696558899c038df4b1bc8c1ac7a445e7
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,...,1542240000000.0,26.0,525671f443d2425a0583bb03afaa1c28,Cuevas-Lee,5584 Kim Common Apt. 742,and Sons,3a8bfff2717e601cf7cbc6e075b2efac,long,wav,c49212847115420b3a280f4f86878386
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,...,1542240000000.0,26.0,8807df8a9a4b422534027b675d40d78a,Dalton Ltd,80813 Amy Forge,Group,c9163e6084cf39889f1937cc15716792,short,wav,cb600ec94abf04b3d23ae2f46480b528
5,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,...,1542250000000.0,61.0,2a4d00ff68a501f27d381152ad471b94,Diaz Group,47940 King Neck,LLC,55d6a83e69b10b598cdc8b77c69aa9ac,short,wav,fa0942aea355634f759517d628e101b7
9,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,...,1542260000000.0,80.0,3036905f8ce3396815bf12699d8904a0,Stephens-Cobb,25053 Melissa Parks Suite 921,Inc,7e98ccd4eb19ccc68d824cbddec3e11e,long,wav,f518a23de6b25079cc6ea8f60cf59ead


Crea tabelle base:


In [4]:
songs = df[["song_id","artist","song"]].drop_duplicates()
metadatas = df[["meta_id","length_class","file_format"]].drop_duplicates()
users = df[["userId","firstName","lastName","gender","location",]].drop_duplicates()
companies = df[["company_id","company_name","company_address","company_suffix"]].drop_duplicates()
sessions = df[["sessionId","page","itemInSession","method"]].drop_duplicates()
times = df[["ts"]].drop_duplicates()

Crea tabelle di lookup:

In [5]:
session_time = df[["sessionId","ts"]].drop_duplicates()
session_user = df[["sessionId","userId"]].drop_duplicates()
user_company = df[["userId","company_id"]].drop_duplicates()
song_metadata = df[["meta_id","song_id"]].drop_duplicates()

Arricchisci le *time_ds* con piu' informazioni:

In [6]:
times["timestamp"] = pd.to_datetime(times["ts"],unit="ms")
times["day"]= times["timestamp"].dt.day
times["month"]= times["timestamp"].dt.month
times["year"]= times["timestamp"].dt.year
times["quarter"]= times["timestamp"].dt.quarter
times.head()

Unnamed: 0,ts,timestamp,day,month,year,quarter
0,1542240000000.0,2018-11-15 00:00:00,15,11,2018,4
5,1542250000000.0,2018-11-15 02:46:40,15,11,2018,4
9,1542260000000.0,2018-11-15 05:33:20,15,11,2018,4
24,1542270000000.0,2018-11-15 08:20:00,15,11,2018,4
39,1542280000000.0,2018-11-15 11:06:40,15,11,2018,4


Salva in un SQLite database.

In [7]:
songs.to_sql(name='songs', con=cnx)
metadatas.to_sql(name='metadatas', con=cnx)
users.to_sql(name='users', con=cnx)
companies.to_sql(name='companies', con=cnx)
sessions.to_sql(name='sessions', con=cnx)
times.to_sql(name='times', con=cnx)
session_time.to_sql(name='session_time', con=cnx)
session_user.to_sql(name='session_user', con=cnx)
user_company.to_sql(name='user_company', con=cnx)
song_metadata.to_sql(name='song_metadata', con=cnx)


5986

Test di lettura:

In [8]:
results = pd.read_sql("""SELECT u.userId,
                                c.company_name
                        FROM users u INNER JOIN user_company uc ON u.userId = uc.userId
                                     INNER JOIN companies c ON uc.company_id = c.company_id""", cnx)
results.head()

Unnamed: 0,userId,company_name
0,26.0,Jones and Sons
1,26.0,Johnson and Sons
2,26.0,Johnson and Sons
3,26.0,Powell-Collins
4,26.0,Miller-Richardson
