### Test notebook for connecting to AWS buckets and designing tables/ETL

In [1]:
import configparser
import psycopg2
import pandas as pd
import boto3
import json
import time

### Check datawarehouse config file

In [2]:
#Get credentials
config = configparser.ConfigParser()
config.read('song_dwh.cfg')

KEY = config.get('AWS', 'key')
SECRET = config.get('AWS', 'secret')
ARN = config.get("ARN", "arn")

DWH_REGION = config.get("DWH", "dwh_region")
DWH_CLUSTER_TYPE = config.get("DWH", "dwh_cluster_type")
DWH_NUM_NODES = config.get("DWH","dwh_num_nodes")
DWH_NODE_TYPE = config.get("DWH","dwh_node_type")
DWH_IAM_ROLE_NAME = config.get("DWH", "dwh_iam_role_name")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","dwh_cluster_identifier")
DWH_DB = config.get("DWH","dwh_db")
DWH_DB_USER = config.get("DWH","dwh_db_user")
DWH_DB_PASSWORD = config.get("DWH","dwh_db_password")
DWH_PORT = config.get("DWH","dwh_port")
DWH_ENDPOINT = config.get("DWH","dwh_endpoint")

LOG_DATA = config.get('S3','log_data')
SONG_DATA = config.get('S3', 'song_data')

In [4]:
s3 = boto3.resource('s3', aws_access_key_id=KEY,
                          aws_secret_access_key=SECRET,
                          region_name="us-west-2")

In [27]:
#Download sample files
sampleDbBucket =  s3.Bucket("udacity-dend")
for obj in sampleDbBucket.objects.filter(Prefix="log-data/2018/11/2018-11-02-events.json"):
    print(obj)
sampleDbBucket.download_file("log-data/2018/11/2018-11-02-events.json", "2018-11-02-events.json")
#sampleDbBucket.download_file("song-data/A/A/A/TRAAAAK128F9318786.json", "TRAAAAK128F9318786.json")
#sampleDbBucket.download_file("log_json_path.json", "log_json_path.json")

s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-02-events.json')


In [44]:
# Check sample log file
with open("samples/2018-11-02-events.json", "r") as f:    
    data = f.readlines()


d = json.loads(data[0])
print(list(d.keys()))
#print(d)
#df = pd.DataFrame(data[0], cols=list(data[0].keys()))
df = pd.DataFrame(d, columns=list(d.keys()), index=[0])
df.head()
#df.head()

['artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName', 'length', 'level', 'location', 'method', 'page', 'registration', 'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId']


Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,N.E.R.D. FEATURING MALICE,Logged In,Jayden,M,0,Fox,288.9922,free,"New Orleans-Metairie, LA",PUT,NextSong,1541034000000.0,184,Am I High (Feat. Malice),200,1541121934796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",101


In [45]:
# Check sample song file
with open("samples/TRAAAAK128F9318786.json", "r") as f:    
    data = json.load(f)

cols = list(data.keys())
print(cols)
df = pd.DataFrame(data, columns=cols, index=[0])

df.head()



['song_id', 'num_songs', 'title', 'artist_name', 'artist_latitude', 'year', 'duration', 'artist_id', 'artist_longitude', 'artist_location']


Unnamed: 0,song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
0,SOBLFFE12AF72AA5BA,1,Scream,Adelitas Way,,2009,213.9424,ARJNIUY12298900C91,,


In [3]:
%load_ext sql

In [4]:

#conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB)
conn_string="postgresql://{}:{}@{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh


'Connected: dwhuser@songdwh'

In [9]:
%%sql
CREATE DATABASE IF NOT EXISTS songdwh;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
(psycopg2.ProgrammingError) syntax error at or near "NOT"
LINE 1: CREATE DATABASE IF NOT EXISTS songdwh;
                           ^
 [SQL: 'CREATE DATABASE IF NOT EXISTS songdwh;']


In [26]:
%%sql
DROP TABLE IF EXISTS staging_songs;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [80]:
%%sql
DROP TABLE IF EXISTS staging_events;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [6]:
%%sql

CREATE TABLE IF NOT EXISTS staging_events (
        artist varchar(200),
        auth varchar(50),
        firstName varchar(100),
        gender varchar(1),
        itemInSession int,
        lastName varchar(100), 
        length decimal(10,5),
        level varchar(5), 
        location varchar(255),
        method varchar(5),
        page varchar(25), 
        registration varchar(100),
        sessionId int,
        song varchar(200),
        status varchar(5),
        ts bigint,
        userAgent varchar(255),
        userId varchar(255)
    );

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [7]:
%%sql
CREATE TABLE IF NOT EXISTS staging_songs (
        song_id varchar(100),
        num_songs int,
        title varchar(200),
        artist_name varchar(200),
        artist_latitude decimal(8,6),
        year int,
        duration decimal(9,4),
        artist_id varchar(200),
        artist_longitude decimal(9,6),
        artist_location varchar(255)
    );

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [20]:
staging_events_copy = (""" 
    COPY staging_events 
    FROM {}
    IAM_ROLE '{}'
    REGION '{}'
""").format(LOG_DATA, ARN, DWH_REGION)
print(staging_events_copy)

staging_songs_copy = ("""
    COPY staging_songs
    FROM {}
    IAM_ROLE '{}'
    REGION '{}'
""").format(SONG_DATA, ARN, DWH_REGION)
print(staging_songs_copy)

 
    COPY staging_events 
    FROM ''s3://udacity-dend/log_data''
    IAM_ROLE 'arn:aws:iam::095184657221:role/dwhuser'
    REGION 'us-west-2'


    COPY staging_songs
    FROM ''s3://udacity-dend/song_data''
    IAM_ROLE 'arn:aws:iam::095184657221:role/dwhuser'
    REGION 'us-west-2'



In [8]:
%%sql
COPY staging_events 
    FROM 's3://udacity-dend/log_data'
    IAM_ROLE 'arn:aws:iam::095184657221:role/dwhuser'
    FORMAT AS JSON 's3://udacity-dend/log_json_path.json'
    REGION 'us-west-2'
    ;
    
#SELECT COUNT(song), song from staging_events group by song having count(song) > 1 limit 10;

#SELECT COUNT(artist), artist from staging_events group by artist having count(artist) > 1 limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [8]:
%%sql
select count(*) from staging_songs;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count
14896


In [9]:
%%sql
select * from staging_events
where artist = 'Cat Stevens'
and song = 'Wild World';

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
4 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
Cat Stevens,Logged In,Sara,F,43,Johnson,200.202,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,152,Wild World,200,1541269636796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53""",95
Cat Stevens,Logged In,Kynnedi,F,1,Sanchez,200.202,free,"Cedar Rapids, IA",PUT,NextSong,1541079034796,88,Wild World,200,1541150017796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",89
Cat Stevens,Logged In,Tegan,F,25,Levine,200.202,paid,"Portland-South Portland, ME",PUT,NextSong,1540794356796,910,Wild World,200,1543083025796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",80
Cat Stevens,Logged In,Mohammad,M,3,Rodriguez,200.202,paid,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1540511766796,999,Wild World,200,1543430350796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",88


In [23]:
%%sql

SELECT COUNT(CONCAT(song,artist)), song, artist from staging_events group by song,artist having COUNT(CONCAT(song,artist)) > 1 limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
10 rows affected.


count,song,artist
2,A Lack Of Color (Album Version),Death Cab for Cutie
6,Lucky (Album Version),Jason Mraz & Colbie Caillat
2,Journey To The Past (LP Version),Liz Callaway
2,Everything I Try to Do_ Nothing Seems to Turn Out Right,The Decemberists
17,Secrets,OneRepublic
4,Wild World,Cat Stevens
2,Swing Javanaise,Anis
2,Clouds (Of Color Bright Album Version),Velour 100
3,Bottom of a Bottle (Explicit Album Version),Smile Empty Soul
3,Eye Of The Tiger,Survivor


In [16]:
%%sql
SELECT 
*
FROM staging_events se
LEFT JOIN (
    SELECT
    title,
    song_id
    FROM staging_songs
) ss1
ON ss1.title = song


 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2
8056,3148,5189


In [10]:
%%sql
COPY staging_songs
    FROM 's3://udacity-dend/song_data'
    IAM_ROLE 'arn:aws:iam::095184657221:role/dwhuser'
    FORMAT AS JSON 'auto'
    REGION 'us-west-2'
    ;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.


[]

In [37]:
%%sql

SELECT * FROM staging_songs where artist_name = 'Cat Stevens'

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
6 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOQRRKS12AB0187374,1,Daytime,Cat Stevens,,1978,239.0199,ARVHQNN1187B9B9FA3,,"London, England"
SOLAYSZ12A6701F5BE,1,Sun / C79,Cat Stevens,,1974,275.8787,ARVHQNN1187B9B9FA3,,"London, England"
SONJNQI12A6310EDEE,1,Morning Has Broken,Cat Stevens,,1971,199.7579,ARVHQNN1187B9B9FA3,,"London, England"
SONCLNU12A6D4F86FB,1,I See A Road,Cat Stevens,,1966,128.6003,ARVHQNN1187B9B9FA3,,"London, England"
SOGWYVC12A6701F5DC,1,Last Love Song,Cat Stevens,,1978,211.7742,ARVHQNN1187B9B9FA3,,"London, England"
SOCOBMY12A58A7A161,1,Mona Bone Jakon,Cat Stevens,,1970,104.2542,ARVHQNN1187B9B9FA3,,"London, England"


In [67]:
%%sql

SELECT COUNT(*) FROM staging_events limit 5;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count
8056


In [65]:
%%sql

SELECT COUNT(CONCAT(title, artist_name)), count(song_id) FROM staging_songs
LIMIT 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1
14896,14896


In [54]:
%%sql
SELECT artist_name, title, COUNT(title) FROM staging_songs group by artist_name, title having count(title) > 1;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
5 rows affected.


artist_name,title,count
Sonic Division,Day And Night,2
Inspiral Carpets,Commercial Reign,2
Béla Fleck,Moto Perpetuo_ Op. 11_ No. 2,2
Thrice,The Earth Will Shake,2
Fever Ray,When I Grow Up,2


In [25]:
%%sql
select * from stl_load_errors
WHERE starttime = (select max(starttime) from stl_load_errors);

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
2 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
100,1,101606,2021-08-22 18:14:06.053405,20795,318,s3://udacity-dend/song_data/A/Z/C/TRAZCCG128E0798789.json,1,artist_name,varchar,100,0,"{""artist_id"":""ARQ846I1187B9A7083"",""artist_latitude"":null,""artist_location"":"""",""artist_longitude"":null,""artist_name"":""Yvonne S. Moriarty \\/ Walt Fowler \\/ Ladd McIntosh \\/ Elizabeth Finch \\/ Jack Smalley \\/ Bruce Fowler \\/ Gavin Greenaway \\/ The Lyndhurst Orchestra \\/ Lisa Gerrard \\/ Hans Zimmer"",""duration"":196.04853,""num_songs"":1,""song_id"":""SOEPTVC12A67ADD0DA"",""title"":""To Zucchabar [\\""Gladiator\\"" - Music from the Motion Picture]"",""year"":0}",,1204,String length exceeds DDL length,0,0
100,3,101606,2021-08-22 18:14:06.053405,20795,318,s3://udacity-dend/song_data/A/L/T/TRALTXO128F930843C.json,1,artist_name,varchar,100,0,"{""artist_id"":""ARV481W1187FB38CD9"",""artist_latitude"":null,""artist_location"":""Lajatico, Italy"",""artist_longitude"":null,""artist_name"":""Andrea Bocelli \\/ Vladimir Fedoseyev \\/ Moscow Radio Symphony Orchestra \\/ Victor Popov \\/ Academy Of Choir Art Of Russia"",""duration"":191.13751,""num_songs"":1,""song_id"":""SOKUATC12AB01853F3"",""title"":""Turandot: Nessun Dorma (Act 3)"",""year"":1997}",,1204,String length exceeds DDL length,0,0


In [20]:
%%sql
DROP TABLE IF EXISTS dim_user CASCADE;
DROP TABLE IF EXISTS dim_time CASCADE;
DROP TABLE IF EXISTS dim_artist CASCADE;
DROP TABLE IF EXISTS dim_song CASCADE;
DROP TABLE IF EXISTS fact_songPlay CASCADE;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.
Done.
Done.
Done.
Done.


[]

In [21]:
%%sql
CREATE TABLE IF NOT EXISTS dim_user (
        user_id int NOT NULL PRIMARY KEY, 
        first_name varchar(100), 
        last_name varchar(100), 
        gender varchar(1), 
        level varchar(5)
    );

CREATE TABLE IF NOT EXISTS dim_song (
        song_id varchar(100) NOT NULL PRIMARY KEY, 
        title varchar(200), 
        artist_id varchar(200), 
        year int, 
        duration decimal(9,4)
    );

CREATE TABLE IF NOT EXISTS dim_artist (
        artist_id varchar(200) NOT NULL PRIMARY KEY, 
        name varchar(200), 
        location varchar(255), 
        latitude decimal(8,6),
        longitude decimal(9,6)
    );

CREATE TABLE IF NOT EXISTS dim_time (
        start_time timestamp PRIMARY KEY, 
        hour int NOT NULL, 
        day int NOT NULL, 
        week int NOT NULL, 
        month int NOT NULL, 
        year int NOT NULL, 
        weekday int NOT NULL
    );

CREATE TABLE IF NOT EXISTS fact_songPlay (
        songplay_id int IDENTITY(0,1) PRIMARY KEY,
        start_time timestamp REFERENCES dim_time (start_time),
        user_id int REFERENCES dim_user (user_id),
        level varchar(5),
        song_id varchar(100) REFERENCES dim_song (song_id),
        artist_id varchar(200) REFERENCES dim_artist (artist_id),
        session_id int,
        location varchar(255),
        user_agent varchar(255)
    );

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
Done.
Done.
Done.
Done.
Done.


[]

In [22]:
%%sql
    INSERT INTO fact_songPlay (
    start_time, 
    user_id, 
    level, 
    song_id, 
    artist_id, 
    session_id, 
    location, 
    user_agent
    )
    SELECT
        date_add('ms',se.ts,'1970-01-01') as start_time,
        CAST(se.userId as int) as user_id,
        se.level,
        ss.song_id as song_id,
        ss.artist_id as artist_id,
        se.sessionId as session_id,
        se.location,
        se.useragent as user_agent
    FROM staging_events se
    LEFT JOIN (
        SELECT DISTINCT
            song_id,
            title,
            artist_id,
            artist_name,
            duration
        FROM staging_songs
    ) ss
    ON se.song = ss.title
    AND se.artist = ss.artist_name
    AND se.length = ss.duration
    WHERE userId <> ' '
    ;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
7770 rows affected.


[]

In [24]:
%%sql
SELECT count(songplay_id), count(distinct songplay_id), count(song_id), count(distinct song_id) from fact_songPlay;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2,count_3
7770,7770,60,22


In [28]:
%%sql
select
*
from staging_songs
where song_id = 'SOSNMJN12A58A7A649'

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOSNMJN12A58A7A649,1,Walking On Sunshine,Katrina & The Waves,,1985,220.9171,AR3TRNJ1187B9AE8AB,,


In [30]:
%%sql
select
*
from staging_events
where userId = '67'
and song = 'Walking On Sunshine'

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
Katrina & The Waves,Logged In,Colm,M,4,Santana,220.9171,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1540856629796,807,Walking On Sunshine,200,1543062639796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",67


In [25]:
%%sql
SELECT
*
from fact_songPlay
WHERE song_id is not null
limit 10;

# user_id 67
# song_id SOSNMJN12A58A7A649
# artist_id AR3TRNJ1187B9AE8AB
# songplay 14

# 73
# SOHDWWH12A6D4F7F6A
# ARC0IOF1187FB3F6E6
# 1

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
10 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
14,2018-11-24 12:30:39.796000,67,free,SOSNMJN12A58A7A649,AR3TRNJ1187B9AE8AB,807,"Nashville-Davidson--Murfreesboro--Franklin, TN","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
1,2018-11-05 17:49:42.796000,73,paid,SOHDWWH12A6D4F7F6A,ARC0IOF1187FB3F6E6,255,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
17,2018-11-28 18:48:55.796000,24,paid,SOZQSGL12AF72A9145,AR050VJ1187B9B13A7,984,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
11,2018-11-06 23:32:39.796000,97,paid,SOBRFPG12A8AE4837D,AR1N3PB1187B9B25C1,293,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
27,2018-11-20 20:17:20.796000,85,paid,SOTVSNZ12A8C13DA01,ARF8JDZ1187FB37A42,776,"Red Bluff, CA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
4,2018-11-28 21:05:13.796000,73,paid,SOAHVKA12A8C146C5F,ARPBMSQ1187B98AE69,954,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
20,2018-11-29 17:55:54.796000,82,paid,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,1017,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
36,2018-11-28 22:56:08.796000,73,paid,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,954,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
52,2018-11-22 01:59:04.796000,15,paid,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,818,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
68,2018-11-23 06:15:22.796000,6,free,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,847,"Atlanta-Sandy Springs-Roswell, GA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0


In [14]:
%%sql
select * from fact_songPlay LIMIT 5;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
14,2018-11-07 01:42:43.796000,8,free,SOWTZNU12AB017EADB,AR6NYHH1187B9BA128,181,"Phoenix-Mesa-Scottsdale, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"""
30,2018-11-13 17:28:33.796000,97,paid,SOIBHYW12AB0188F49,ARWNARC122BCFCAFEB,537,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
46,2018-11-30 07:47:05.796000,49,paid,SOKMDJJ12AB0181C64,AR6M65W1187FB3611E,1079,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
62,2018-11-05 02:30:17.796000,44,paid,SONTFNG12A8C13FF69,AR52EZT1187B9900BF,237,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
78,2018-11-14 15:24:12.796000,80,paid,SOARUPP12AB01842E0,ARD46C811C8A414F3F,574,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""


In [7]:
%%sql
SELECT count(*), 
count(songplay_id), 
count(distinct songplay_id), 
count(distinct song_id), 
count(distinct artist_id)
from fact_songPlay limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2,count_3,count_4
319,319,319,209,194


In [9]:
%%sql
SELECT 
count(*),
count(start_time),
count(distinct start_time)
FROM dim_time limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2
8023,8023,8023


In [10]:
%%sql
SELECT 
count(*),
count(user_id),
count(distinct user_id)
FROM dim_user limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2
97,97,97


In [11]:
%%sql
SELECT 
count(*),
count(artist_id),
count(distinct artist_id)
FROM dim_artist limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2
9553,9553,9553


In [12]:
%%sql
SELECT 
count(*),
count(song_id),
count(distinct song_id)
FROM dim_song limit 10;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


count,count_1,count_2
14896,14896,14896


In [10]:
%%sql
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
7 rows affected.


schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers
public,dim_time,dwhuser,,True,False,True
public,dim_user,dwhuser,,True,False,True
public,dim_song,dwhuser,,True,False,True
public,fact_songplay,dwhuser,,True,False,True
public,dim_artist,dwhuser,,True,False,True
public,staging_events,dwhuser,,False,False,False
public,staging_songs,dwhuser,,False,False,False


In [12]:
%%sql
select current_database()

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
1 rows affected.


current_database
songdwh


In [8]:
%%sql
select "schema", "table", diststyle from SVV_TABLE_INFO
where "table" like 'dim%';

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
4 rows affected.


schema,table,diststyle
public,dim_artist,EVEN
public,dim_time,EVEN
public,dim_song,EVEN
public,dim_user,EVEN


In [6]:
%%sql
SELECT
    COUNT(DISTINCT fsp.session_id) as session_count,
    du.first_name,
    du.last_name,
    du.user_id
FROM fact_songPlay fsp
JOIN dim_user du
ON du.user_id = fsp.user_id
GROUP BY 
	du.first_name,
	du.last_name,
	du.user_id
ORDER BY session_count DESC
LIMIT 10
;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
10 rows affected.


session_count,first_name,last_name,user_id
21,Chloe,Cuevas,49
17,Tegan,Levine,80
11,Kate,Harrell,97
8,Lily,Koch,15
7,Aleena,Kirby,44
7,Mohammad,Rodriguez,88
6,Jacob,Klein,73
6,Jacqueline,Lynch,29
6,Matthew,Jones,36
6,Jayden,Graves,25


In [8]:
%%sql
SELECT
    COUNT(fsp.song_id) as song_count,
	ds.title,
	da.name
FROM fact_songPlay fsp
JOIN dim_song ds
ON ds.song_id = fsp.song_id
JOIN dim_artist da
ON da.artist_id = fsp.artist_id
GROUP BY 
	ds.title,
	da.name
ORDER BY song_count DESC
LIMIT 10
;

 * postgresql://dwhuser:***@songcluster.cy513anz522l.us-west-2.redshift.amazonaws.com:5439/songdwh
10 rows affected.


song_count,title,name
37,You're The One,Dwight Yoakam
9,I CAN'T GET STARTED,Ron Carter
9,Catch You Baby (Steve Pitron & Max Sanna Radio Edit),Lonnie Gordon
8,Nothin' On You [feat. Bruno Mars] (Album Version),B.o.B
6,Hey Daddy (Daddy's Home),Usher
5,Make Her Say,Kid Cudi / Kanye West / Common
5,Up Up & Away,Kid Cudi / Kanye West / Common
4,Mr. Jones,Counting Crows
4,Supermassive Black Hole (Album Version),Muse
4,Unwell (Album Version),matchbox twenty
