# Part I. ETL Pipeline for Pre-Processing the Files

#### Import Python packages 

In [3]:
# Import Python packages 
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv
from zipfile import ZipFile

#### Creating list of filepaths to process original event csv data files

In [4]:
# checking current working directory
print(os.getcwd())

# Getting current folder and subfolder event data
filepath = os.getcwd() + '/event_data'

# Creating a for loop to create a list of files and collect each filepath
for root, dirs, files in os.walk(filepath):
    
# Joining the file path and roots with the subdirectories using glob
    file_path_list = glob.glob(os.path.join(root,'*'))
    print(file_path_list)

/home/workspace
['/home/workspace/event_data/2018-11-27-events.csv', '/home/workspace/event_data/2018-11-04-events.csv', '/home/workspace/event_data/2018-11-07-events.csv', '/home/workspace/event_data/2018-11-09-events.csv', '/home/workspace/event_data/2018-11-19-events.csv', '/home/workspace/event_data/2018-11-05-events.csv', '/home/workspace/event_data/2018-11-22-events.csv', '/home/workspace/event_data/2018-11-16-events.csv', '/home/workspace/event_data/2018-11-26-events.csv', '/home/workspace/event_data/2018-11-24-events.csv', '/home/workspace/event_data/2018-11-29-events.csv', '/home/workspace/event_data/2018-11-15-events.csv', '/home/workspace/event_data/2018-11-20-events.csv', '/home/workspace/event_data/2018-11-06-events.csv', '/home/workspace/event_data/2018-11-18-events.csv', '/home/workspace/event_data/2018-11-21-events.csv', '/home/workspace/event_data/2018-11-10-events.csv', '/home/workspace/event_data/2018-11-23-events.csv', '/home/workspace/event_data/2018-11-02-events.c

#### Processing the files to create the data file csv that will be used for Apache Casssandra tables
#### Method 1: Using event_data folder

In [5]:
# initiating an empty list of rows that will be generated from each file
full_data_rows_list = [] 
    
# for every filepath in the file path list 
for f in file_path_list:

# reading csv file 
    with open(f, 'r', encoding = 'utf8', newline='') as csvfile:
        
        # creating a csv reader object 
        csvreader = csv.reader(csvfile) 
        next(csvreader)
       
 # extracting each data row one by one and append it        
        for line in csvreader:
            #print(line)
            full_data_rows_list.append(line) 
            
# Printing the total number of rows 
print(len(full_data_rows_list))

# creating a smaller event data csv file called event_datafile_full csv that will be used to insert data into the \
# Apache Cassandra tables
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\
                'level','location','sessionId','song','userId'])
    for row in full_data_rows_list:
        if (row[0] == ''):
            continue
        writer.writerow((row[0], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[12], row[13], row[16]))   

186


#### Method 2: Using event_data.zip folder

In [6]:
#Accessing the .zip file from directory
file_name = 'event_data.zip'

# initiating an empty list of rows that will be generated from each file
full_data_rows_list = [] 

#Extracting the files from the .zip folder

with ZipFile(file_name, 'r') as zip:
    zip.extractall()

#Iterating over each file in the .zip folder

    for f in zip.namelist():
        
        #ignoring the empty directory 'event_data' and .ipynb checkpoint files
        
        if str(f) == 'event_data/':
            continue
        elif str(f) == 'event_data/.ipynb_checkpoints/':
            continue
        elif str(f) == 'event_data/.ipynb_checkpoints/2018-11-02-events-checkpoint.csv':
            continue
        elif str(f) == 'event_data/.ipynb_checkpoints/2018-11-01-events-checkpoint.csv':
            continue
        else:
            #print(f)
          
        # creating a csv reader object
            with open(f, 'r', encoding = 'utf8', newline='') as csvfile:
                csvreader = csv.reader(csvfile) 
                next(csvreader)
       
 # extracting each data row one by one and append it        
                for line in csvreader:
                    #print(line)
                    full_data_rows_list.append(line)        
# reading csv file   
# uncomment the code below if you would like to get total number of rows 
print(len(full_data_rows_list))

# creating a smaller event data csv file called event_datafile_full csv that will be used to insert data into the \
# Apache Cassandra tables
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\
                'level','location','sessionId','song','userId'])
    for row in full_data_rows_list:
        if (row[0] == ''):
            continue
        writer.writerow((row[0], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[12], row[13], row[16]))


8056


In [7]:
#creating a dataframe from the new csv file
df1=pd.DataFrame(full_data_rows_list)
df1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Logged In,Mohammad,M,0,Rodriguez,277.15873,paid,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1540510000000.0,961,Horn Concerto No. 4 in E flat K495: II. Romanc...,200,1543280000000.0,88
1,Jimi Hendrix,Logged In,Mohammad,M,1,Rodriguez,239.82975,paid,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1540510000000.0,961,Woodstock Inprovisation,200,1543280000000.0,88
2,Building 429,Logged In,Mohammad,M,2,Rodriguez,300.61669,paid,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1540510000000.0,961,Majesty (LP Version),200,1543280000000.0,88
3,The B-52's,Logged In,Gianna,F,0,Jones,321.54077,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540870000000.0,107,Love Shack,200,1543280000000.0,38
4,Die Mooskirchner,Logged In,Gianna,F,1,Jones,169.29914,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540870000000.0,107,Frisch und g'sund,200,1543280000000.0,38


In [8]:
# checking the number of rows in the csv file
with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f:
    print(sum(1 for line in f))

6821


# Part II. Creating Apache Cassandra Tables based on Queries


#### Creating a Cluster

In [9]:
# This makes a connection to a Cassandra instance the local machine 
# (127.0.0.1)

from cassandra.cluster import Cluster
cluster = Cluster()

# This establishes connection in order to begin executing queries by creating a session
session = cluster.connect()

#### Creating Keyspace

In [10]:
try:
    session.execute("""
    CREATE KEYSPACE IF NOT EXISTS udacity 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
)
    
except Exception as e:
    print(e)

#### Setting Keyspace as specified above

In [11]:
try:
    session.set_keyspace('udacity')
except Exception as e:
    print(e)

#### Viewing csv using pandas datatrame

In [12]:
import pandas as pd

df = pd.read_csv('event_datafile_new.csv')

df.head()

Unnamed: 0,artist,firstName,gender,itemInSession,lastName,length,level,location,sessionId,song,userId
0,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Mohammad,M,0,Rodriguez,277.15873,paid,"Sacramento--Roseville--Arden-Arcade, CA",961,Horn Concerto No. 4 in E flat K495: II. Romanc...,88
1,Jimi Hendrix,Mohammad,M,1,Rodriguez,239.82975,paid,"Sacramento--Roseville--Arden-Arcade, CA",961,Woodstock Inprovisation,88
2,Building 429,Mohammad,M,2,Rodriguez,300.61669,paid,"Sacramento--Roseville--Arden-Arcade, CA",961,Majesty (LP Version),88
3,The B-52's,Gianna,F,0,Jones,321.54077,free,"New York-Newark-Jersey City, NY-NJ-PA",107,Love Shack,38
4,Die Mooskirchner,Gianna,F,1,Jones,169.29914,free,"New York-Newark-Jersey City, NY-NJ-PA",107,Frisch und g'sund,38


## Query 1
### Objective: Return artist, song title and song's lenght given a particular sessionId and itemInSesion
### For this query, sessionId was chosen as the partition key while itemInSession was selected as clustering column.
### Each partition is uniquely identified by sessionId while itemInSession was used to uniquely identify the rows within that to sort the data

#### Creating table and setting primary key

In [13]:
#Return artist, song title and song's length during sessionId = 338, and itemInSession = 4

query = 'DROP table IF EXISTS session_info'

session.execute(query)

query = 'CREATE table IF NOT EXISTS session_info'

query = query + '(sessionID text, itemInSession text, artist text, song text, length text, PRIMARY KEY(sessionID, itemInSession))'
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f0328232080>

#### Inserting data into the table

In [14]:
file = 'event_datafile_new.csv'

with open(file, encoding = 'utf8') as f:
    csvreader = csv.reader(f)
    next(csvreader) # skip header
    for line in csvreader:
        query = 'INSERT INTO session_info (sessionId, itemInSession, artist, song, length)'
        query = query + 'VALUES(%s,%s,%s,%s,%s)'
        session.execute(query, (line[8], line[3], line[0], line[9], line[5]))

#### Running the required query

In [15]:
query = 'SELECT * from session_info WHERE sessionID = \'338\' AND itemInSession = \'4\''
rows = session.execute(query)
for row in rows:
    print(row.sessionid, row.iteminsession, row.artist, row.song, row.length)

338 4 Faithless Music Matters (Mark Knight Dub) 495.3073


## Query 2
### Objective: Return name of artist, song (sorted by itemInSession) and user (first and last name)
### For this query, userId and sessionId was chosen as the partition key while itemInSession was selected as the clustering column.
### Each partition is uniquely identified by the combination userId and sessionId while itemInSession was used to uniquely identify the rows within that to sort the data

#### Creating the table and setting primary key

In [16]:
## Return the name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182
query = 'DROP table IF EXISTS user_info'
session.execute(query)
query = 'CREATE table IF NOT EXISTS user_info'
query = query + '(userId text, firstName text, lastName text, sessionId text, artist text, song text, length text, itemInSession text, PRIMARY KEY((userId,sessionId),itemInSession))'
session.execute(query)


<cassandra.cluster.ResultSet at 0x7f02fcfa8128>

#### Inserting data into the table

In [17]:
file = 'event_datafile_new.csv'

with open(file, encoding = 'utf8') as f:
    csvreader = csv.reader(f)
    next(csvreader) # skip header
    for line in csvreader:
        query = 'INSERT INTO user_info (userId, firstName, lastName, sessionId, artist, song, length, itemInSession)'
        query = query + 'VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'
        ## TO-DO: Assign which column element should be assigned for each column in the INSERT statement.
        ## For e.g., to INSERT artist_name and user first_name, you would change the code below to `line[0], line[1]`
        session.execute(query, (line[10], line[1], line[4], line[8], line[0], line[9],line[5], line[3]))


#### Running the required query

In [18]:
query = 'SELECT artist, song, firstName, lastName, itemInSession from user_info WHERE userId = \'10\' AND sessionId = \'182\''
rows = session.execute(query)
for row in rows:
    print(row.artist,'  ', row.song, '  ', row.firstname, row.lastname)

Down To The Bone    Keep On Keepin' On    Sylvie Cruz
Three Drives    Greece 2000    Sylvie Cruz
Sebastien Tellier    Kilometer    Sylvie Cruz
Lonnie Gordon    Catch You Baby (Steve Pitron & Max Sanna Radio Edit)    Sylvie Cruz


## Query 3
### Objective: Return every user name (first and last) who listened to a particular song
### For this query, song was chosen as the partition key while userId was selected as the clustering column.
### Each partition is uniquely identified by the song while userId was used to uniquely identify the rows within that to sort the data


#### Creating the table and setting primary key

In [19]:
## Query 3: Return every user name (first and last) who listened to the song 'All Hands Against His Own'

query = 'DROP table IF EXISTS listen_info'
session.execute(query)
query = 'CREATE table IF NOT EXISTS listen_info'
query = query + '(userId text, firstName text, lastName text, artist text, song text, itemInSession text, sessionId text, PRIMARY KEY(song,userId))'
session.execute(query)


<cassandra.cluster.ResultSet at 0x7f02fcf66748>

#### Inserting data into the table

In [20]:
file = 'event_datafile_new.csv'

with open(file, encoding = 'utf8') as f:
    csvreader = csv.reader(f)
    next(csvreader) # skip header
    for line in csvreader:
## TO-DO: Assign the INSERT statements into the `query` variable
        query = 'INSERT INTO listen_info (userId, firstName, lastName, artist, song, itemInSession, sessionId)'
        query = query + 'VALUES(%s,%s,%s,%s,%s,%s,%s)'
        ## TO-DO: Assign which column element should be assigned for each column in the INSERT statement.
        ## For e.g., to INSERT artist_name and user first_name, you would change the code below to `line[0], line[1]`
        session.execute(query, (line[10], line[1], line[4], line[0], line[9], line[3], line[8]))


#### Running the required query

In [21]:
query = 'SELECT userId, firstName, lastName from listen_info WHERE song = \'All Hands Against His Own\''
rows = session.execute(query)
for row in rows:
    print(row.userid, row.firstname, row.lastname)

29 Jacqueline Lynch
80 Tegan Levine
95 Sara Johnson


### Dropping the table before closing out the sessions

In [22]:
query = 'DROP table IF EXISTS session_info'
session.execute(query)
query = 'DROP table IF EXISTS user_info'
session.execute(query)
query = 'DROP table IF EXISTS listen_info'
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f02fcf3dc50>

### Closing the session and cluster connection¶

In [23]:
session.shutdown()
cluster.shutdown()