In [1]:
import pandas as pd
import glob
import hashlib
import sqlite3
import os

Unisci tutti i dati dei file CSV contenuti nella cartella `data` in un unico DataFrame Pandas in memoria.

In [2]:
# Discover the input files first. glob.glob() returns paths in a
# file-system-dependent order, so sort them to make the concatenated row
# order (and therefore the index labels) reproducible across machines.
csv_paths = sorted(glob.glob("data/*.csv"))
if not csv_paths:
    # Fail before touching the database file, with a message that names the
    # real problem instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

# Start from a fresh SQLite file so tables from a previous run do not linger.
db_filename = 'star_schema_example.db'
if os.path.exists(db_filename):
    os.remove(db_filename)
cnx = sqlite3.connect(db_filename)

# Read every CSV (first row is the header) and stack them into one frame.
dfs = [pd.read_csv(csv_f, index_col=None, header=0) for csv_f in csv_paths]
events = pd.concat(dfs, axis=0, ignore_index=True).dropna(how="any")

# song_id is the MD5 hex digest of "<song> - <artist>"; it is used as the key
# linking the fact table to the song dimension.
song_keys = events["song"] + " - " + events["artist"]
df = events.assign(
    song_id=song_keys.apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())
)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId,song_id
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,Sehr kosmisch,200,1542240000000.0,26.0,eeac062e56a7e543d7d6e517eab89560
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,The Big Gundown,200,1542240000000.0,26.0,525671f443d2425a0583bb03afaa1c28
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541020000000.0,583,Marry Me,200,1542240000000.0,26.0,8807df8a9a4b422534027b675d40d78a
5,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540490000000.0,597,Blackbird,200,1542250000000.0,61.0,2a4d00ff68a501f27d381152ad471b94
9,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1540790000000.0,602,Best Of Both Worlds (Remastered Album Version),200,1542260000000.0,80.0,3036905f8ce3396815bf12699d8904a0


Crea un nuovo dataframe per la *fact table*:


In [3]:
# Fact table: one row per distinct combination of the columns below.
# `ts` and `song_id` are the values later matched against the time and song
# dimension frames.
fact_columns = [
    "auth",
    "userId",
    "sessionId",
    "itemInSession",
    "ts",
    "level",
    "song_id",
]
fact_df = df.loc[:, fact_columns].drop_duplicates()
fact_df.head()

Unnamed: 0,auth,userId,sessionId,itemInSession,ts,level,song_id
0,Logged In,26.0,583,0,1542240000000.0,free,eeac062e56a7e543d7d6e517eab89560
1,Logged In,26.0,583,1,1542240000000.0,free,525671f443d2425a0583bb03afaa1c28
2,Logged In,26.0,583,2,1542240000000.0,free,8807df8a9a4b422534027b675d40d78a
5,Logged In,61.0,597,0,1542250000000.0,free,2a4d00ff68a501f27d381152ad471b94
9,Logged In,80.0,602,2,1542260000000.0,paid,3036905f8ce3396815bf12699d8904a0


Crea le *dimension tables* di base per le rispettive dimensioni:

- song
- user
- session
- time

In [4]:
# Each dimension frame keeps the distinct combinations of its own columns.
song_dim = df.loc[:, ["song_id", "artist", "song", "length"]].drop_duplicates()
user_dim = df.loc[
    :, ["userId", "firstName", "lastName", "gender", "location"]
].drop_duplicates()
session_dim = df.loc[
    :, ["sessionId", "page", "itemInSession", "method"]
].drop_duplicates()
# The time dimension starts with the raw `ts` values only.
time_dim = df.loc[:, ["ts"]].drop_duplicates()

Arricchisci la *time_dim* con più informazioni:

In [5]:
# Interpret `ts` as milliseconds since the Unix epoch, then expose the
# calendar parts as separate columns of the time dimension.
event_time = pd.to_datetime(time_dim["ts"], unit="ms")
time_dim["timestamp"] = event_time
for part in ("day", "month", "year", "quarter"):
    # Same values as time_dim["timestamp"].dt.<part>
    time_dim[part] = getattr(event_time.dt, part)
time_dim.head()

Unnamed: 0,ts,timestamp,day,month,year,quarter
0,1542240000000.0,2018-11-15 00:00:00,15,11,2018,4
5,1542250000000.0,2018-11-15 02:46:40,15,11,2018,4
9,1542260000000.0,2018-11-15 05:33:20,15,11,2018,4
24,1542270000000.0,2018-11-15 08:20:00,15,11,2018,4
39,1542280000000.0,2018-11-15 11:06:40,15,11,2018,4


Salva in un SQLite database.

In [6]:
# Write the fact table and the four dimension tables to the SQLite database.
# if_exists="replace" makes this cell safe to re-run: with the default
# ("fail") a second execution raises ValueError because the tables already
# exist. The DataFrame index is still written (pandas default), so the table
# layout is the same as on a first run.
fact_df.to_sql(name='songplay_fact', con=cnx, if_exists='replace')
song_dim.to_sql(name='song_dim', con=cnx, if_exists='replace')
user_dim.to_sql(name='user_dim', con=cnx, if_exists='replace')
session_dim.to_sql(name='session_dim', con=cnx, if_exists='replace')
# Last expression on purpose: the cell output is this call's return value.
time_dim.to_sql(name='time_dim', con=cnx, if_exists='replace')

242

Test di lettura:

In [7]:
# Read-back check: join each songplay row to its time_dim row on `ts` and
# pull the user, subscription level and calendar columns.
# The adjacent literals concatenate to exactly the original one-line query.
songplay_time_query = (
    'select sp.userId,sp.level,t.timestamp,t.day,t.month,t.year,t.quarter '
    'from songplay_fact sp '
    'inner join time_dim t on sp.ts = t.ts'
)
results = pd.read_sql(songplay_time_query, cnx)
results.head()

Unnamed: 0,userId,level,timestamp,day,month,year,quarter
0,26.0,free,1542236400,15,11,2018,4
1,26.0,free,1542236400,15,11,2018,4
2,26.0,free,1542236400,15,11,2018,4
3,61.0,free,1542246400,15,11,2018,4
4,80.0,paid,1542256400,15,11,2018,4
