# Introduction

In this Jupyter notebook we load the dataset that we built from a Kaggle dataset and web scraping into the MongoDB and Neo4j databases.

In [5]:
import datetime
import math
import random
import secrets
import string

import numpy as np
import pandas as pd
import pymongo

In [2]:
df = pd.read_csv('playlist_info.csv') # Load dataset that contains songs and playlists info 

In [3]:
df.head()

Unnamed: 0,user_id,artist,track,playlist,artist_track,album,date,lyrics
0,9cc0cfd4d7d7885102480dd99e7a90d6,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,"March 4, 2011",This song is an instrumental
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,


In [4]:
df_comments = pd.read_csv('comments_info.csv') # Load dataset that contains users and comments info 

In [5]:
df_comments.head()

Unnamed: 0,author,date,comment,artist_track
0,DeeCee,1403638235,We should embrace the contractions here! The “...,Papa-Roach-To-Be-Loved-lyrics
1,CySorcerorD,1494570401,replace the A with an I to correct the “A love...,Papa-Roach-To-Be-Loved-lyrics
2,guesg,1620154841,I love that this song was the theme song for W...,Papa-Roach-To-Be-Loved-lyrics
3,SpaceGhostBenoit,1626561019,stayed up every school night as a kid just to ...,Papa-Roach-To-Be-Loved-lyrics
4,Meandmylife,1408078858,you should add ‘hold your drinks’ before the c...,The-Wanted-We-Own-The-Night-lyrics


***

# Username mapping 

Map each `user_id` to a comment author's name, so that every user has a valid username in the databases.

In [6]:
df_unique_users = df['user_id'].unique() # Get unique user_id values
# Get unique, non-null author values (one per user); a null author must never become a username
df_unique_authors = df_comments['author'].dropna().unique()[:df_unique_users.shape[0]]
# zip() stops at the shorter input: with too few authors the remaining users would
# silently be mapped to NaN below, so fail loudly instead.
assert len(df_unique_authors) == len(df_unique_users), "Not enough unique comment authors to name every user"
zip_iterator = zip(df_unique_users, df_unique_authors)
key_value_dict = dict(zip_iterator) # Create a dict {user_id: author}
df["user_id"] = df["user_id"].map(key_value_dict) # Replace user_id with author value

In [7]:
%%capture
df = df.rename(columns={"user_id": "username"}) # Rename the "user_id" column in "username"

# Fix dates and album data 

Reduce each song's date to its release year, and fix album names by removing the stray characters left over from web scraping.

In [8]:
# Transform songs dates into songs release
def get_year(x):
    """Return the release year taken from a scraped date string.

    The year is taken to be the last whitespace-separated token, which covers
    "2011", "March 2011" and "March 4, 2011" as well as longer strings.

    Args:
        x: the raw date value; a string, or NaN when the date is missing.

    Returns:
        The last token of the string. Non-string values (e.g. NaN) and
        blank strings are returned unchanged.
    """
    # Missing dates arrive as float NaN; pass anything that is not text straight through
    if not isinstance(x, str):
        return x

    # split() without an argument ignores repeated, leading and trailing whitespace
    date_string = x.split()
    if not date_string:
        return x
    return date_string[-1]
    

In [9]:
df['date'] = df['date'].apply(lambda x: get_year(x))

In [10]:
# Fix album names by removing stray characters left over from web scraping.
def fix_album_names(x):
    """Remove non-breaking spaces (``\\xa0``) from an album name.

    Args:
        x: the album name, or a non-string value (e.g. NaN) when it is missing.

    Returns:
        The cleaned name for strings; any other value unchanged.
    """
    if isinstance(x, str):
        # str.replace returns a new string (strings are immutable), so the result must be returned
        return x.replace('\xa0', '')
    return x

In [11]:
df['album'] = df['album'].apply(lambda x: fix_album_names(x))

In [12]:
df.head()

Unnamed: 0,username,artist,track,playlist,artist_track,album,date,lyrics
0,DeeCee,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,
1,CySorcerorD,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,2011.0,This song is an instrumental
2,CySorcerorD,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,
3,CySorcerorD,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...
4,CySorcerorD,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,


***

<h1> Setup MongoDB connection and insert data </h1>

In [13]:
# Connection to mongoDB, SYP database 

myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient['SYP']

<h3> Prepare playlist's data and add them to MongoDB </h3>

In [14]:
df_playlist = pd.DataFrame(df)

In [15]:
df_playlist = df_playlist.drop_duplicates(subset=['playlist']) # Take unique playlist name 

In [16]:
# Give every playlist a random creation date between 2010-01-01 and 2020-02-01 (end excluded)

start_date = datetime.datetime(2010, 1, 1)
end_date = datetime.datetime(2020, 2, 1)
time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days

# One random day offset per playlist row, added to the start date
df_playlist['date_of_creation'] = [
    start_date + datetime.timedelta(days=random.randrange(days_between_dates))
    for _ in range(len(df_playlist))
]

In [17]:
# Drop useless columns for playlists 
df_playlist_inserted = df_playlist.drop(['artist','track','artist_track','album','date','lyrics','username'],axis=1)

In [18]:
# Change columns order. Reindex the frame trimmed in the previous step (not df_playlist again),
# so that step is not dead code and this cell matches the songs section.
df_playlist_inserted = df_playlist_inserted.reindex(columns=['playlist','date_of_creation'])

In [19]:
df_playlist_inserted.head()

Unnamed: 0,playlist,date_of_creation
0,HARD ROCK 2010,2014-05-03
1,C418,2012-02-19
3,Chill out,2012-10-09
4,Classique,2018-12-28
5,Electro,2011-10-19


<h5> INSERT INTO PLAYLIST COLLECTION </h5>

In [20]:
playlist_coll = mydb['playlists'] # Select the playlists collection

In [21]:
playlist_dict = df_playlist_inserted.to_dict("records") # Transform the dataframe in a dict 

In [22]:
# INSERT INTO PLAYLIST COLLECTION
playlists_id = playlist_coll.insert_many(playlist_dict) # returns the MongoDB ids for playlist entities

<h3> Prepare song's data and add them to MongoDB </h3>

In [24]:
df_songs = pd.DataFrame(df)

In [25]:
df_songs = df_songs.drop_duplicates(subset=['artist_track']) # Take unique songs combination artist_trackname 

In [26]:
df_songs_inserted = df_songs.drop(['username','artist_track','playlist'],axis=1) # Drop useless columns

In [27]:
# Change columns order
df_songs_inserted = df_songs_inserted.reindex(columns=['track','artist','album','date','lyrics']) 

In [28]:
df_songs_inserted.head()

Unnamed: 0,track,artist,album,date,lyrics
0,Helplessly Hoping,Crosby Stills Nash,,,
1,Chris,C418,Minecraft - Volume Alpha,2011.0,This song is an instrumental
2,Équinoxe,C418,,,
3,I Know the Truth,Pretty Lights,,,I'm too blind\nI'm too blind.....to see the li...
4,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Ludwig van Beethoven,,,


<h5> INSERT INTO SONGS COLLECTION </h5>

In [29]:
songs_coll = mydb['songs'] # Select the songs collection

In [30]:
# Transform the dataframe in a dict removing the null values 
song_dict = [{k:v for k,v in m.items() if pd.notnull(v)} for m in df_songs_inserted.to_dict(orient='records')]

In [31]:
# INSERT INTO SONGS COLLECTION
songs_id = songs_coll.insert_many(song_dict) # returns the MongoDB ids for songs entities

<h3> Prepare user's data and add them to MongoDB </h3>

In [32]:
df_users = pd.DataFrame(df)

In [33]:
df_users = df_users.drop_duplicates(subset=['username']) # Take unique usernames 

In [34]:
# Give every user a random account creation date between 2010-01-01 and 2020-02-01 (end excluded)
start_date = datetime.datetime(2010, 1, 1)
end_date = datetime.datetime(2020, 2, 1)
time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days

# One random day offset per user row, added to the start date
df_users['date_of_creation'] = [
    start_date + datetime.timedelta(days=random.randrange(days_between_dates))
    for _ in range(len(df_users))
]

In [35]:
# Give every user a random date of birth between 1950-01-01 and 2010-02-01 (end excluded)
start_date = datetime.datetime(1950, 1, 1)
end_date = datetime.datetime(2010, 2, 1)
time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days

# One random day offset per user row, added to the start date
df_users['date_of_birth'] = [
    start_date + datetime.timedelta(days=random.randrange(days_between_dates))
    for _ in range(len(df_users))
]

In [36]:
# Drop useless columns
df_users_inserted = df_users.drop(['artist','track','playlist','artist_track','album','date','lyrics'],axis=1)

In [37]:
# Change columns order. Reindex the frame trimmed in the previous step (not df_users again),
# so that step is not dead code and this cell matches the songs section.
df_users_inserted = df_users_inserted.reindex(columns=['username','date_of_creation','date_of_birth'])

In [38]:
df_users_inserted.head()

Unnamed: 0,username,date_of_creation,date_of_birth
0,DeeCee,2013-01-28,1952-08-29
1,CySorcerorD,2011-01-13,1977-05-16
6,guesg,2013-10-05,1958-01-01
8,SpaceGhostBenoit,2016-09-06,1951-05-05
9,Meandmylife,2013-05-11,1962-06-24


### INSERT INTO USERS COLLECTION

In [39]:
users_coll = mydb['users'] # Select the users collection

In [40]:
users_dict = df_users_inserted.to_dict("records")  # Transform the dataframe in a dict

In [41]:
# INSERT INTO USERS COLLECTION
users_id = users_coll.insert_many(users_dict) # returns the MongoDB ids for users entities

***

# Associate MongoDB ids with instances

<h3> song_id - artist_track </h3>

In [43]:
# Pair each unique artist_track with the id MongoDB returned for it, relying on both
# sequences being in the same order as the inserted song documents.
df_unique_songs = df_songs['artist_track'].tolist() # Unique artist_track values
df_unique_songs_id = songs_id.inserted_ids # Ids returned by insert_many
zip_iterator = zip(df_unique_songs, df_unique_songs_id)
key_value_dict = {artist_track: song_id for artist_track, song_id in zip_iterator} # {artist_track: song_id}
df["song_id"] = df["artist_track"].map(key_value_dict) # Attach the song_id to every row

In [44]:
df.head()

Unnamed: 0,username,artist,track,playlist,artist_track,album,date,lyrics,song_id
0,DeeCee,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,,61cc6cb83bd8fb88c65fc8aa
1,CySorcerorD,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,2011.0,This song is an instrumental,61cc6cb83bd8fb88c65fc8ab
2,CySorcerorD,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,,61cc6cb83bd8fb88c65fc8ac
3,CySorcerorD,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...,61cc6cb83bd8fb88c65fc8ad
4,CySorcerorD,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,,61cc6cb83bd8fb88c65fc8ae


In [45]:
check_for_nan_songs = df['song_id'].isnull().values.any()
print (check_for_nan_songs)

False


<h3> playlist_id - playlist_name </h3>

In [46]:
# Pair each unique playlist name with the id MongoDB returned for it, relying on both
# sequences being in the same order as the inserted playlist documents.
df_unique_playlist = df_playlist['playlist'].tolist() # Unique playlist names
df_unique_playlist_id = playlists_id.inserted_ids # Ids returned by insert_many
zip_iterator = zip(df_unique_playlist, df_unique_playlist_id)
key_value_dict = {playlist: playlist_id for playlist, playlist_id in zip_iterator} # {playlist: playlist_id}
df["playlist_id"] = df["playlist"].map(key_value_dict) # Attach the playlist_id to every row

In [47]:
df.head()

Unnamed: 0,username,artist,track,playlist,artist_track,album,date,lyrics,song_id,playlist_id
0,DeeCee,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,,61cc6cb83bd8fb88c65fc8aa,61cc6cb53bd8fb88c65ec87c
1,CySorcerorD,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,2011.0,This song is an instrumental,61cc6cb83bd8fb88c65fc8ab,61cc6cb53bd8fb88c65ec87d
2,CySorcerorD,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,,61cc6cb83bd8fb88c65fc8ac,61cc6cb53bd8fb88c65ec87d
3,CySorcerorD,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...,61cc6cb83bd8fb88c65fc8ad,61cc6cb53bd8fb88c65ec87e
4,CySorcerorD,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,,61cc6cb83bd8fb88c65fc8ae,61cc6cb53bd8fb88c65ec87f


In [48]:
check_for_nan_playlist = df['playlist_id'].isnull().values.any()
print (check_for_nan_playlist)

False


<h3> user_id - username </h3>

In [49]:
# Pair each unique username with the id MongoDB returned for it, relying on both
# sequences being in the same order as the inserted user documents.
df_unique_users = df_users['username'].tolist() # Unique usernames
df_unique_users_id = users_id.inserted_ids # Ids returned by insert_many
zip_iterator = zip(df_unique_users, df_unique_users_id)
key_value_dict = {username: user_id for username, user_id in zip_iterator} # {username: users_id}
df["users_id"] = df["username"].map(key_value_dict) # Attach the users_id to every row

In [50]:
df.head()

Unnamed: 0,username,artist,track,playlist,artist_track,album,date,lyrics,song_id,playlist_id,users_id
0,DeeCee,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,,61cc6cb83bd8fb88c65fc8aa,61cc6cb53bd8fb88c65ec87c,61cc6cbb3bd8fb88c6628b8b
1,CySorcerorD,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,2011.0,This song is an instrumental,61cc6cb83bd8fb88c65fc8ab,61cc6cb53bd8fb88c65ec87d,61cc6cbb3bd8fb88c6628b8c
2,CySorcerorD,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,,61cc6cb83bd8fb88c65fc8ac,61cc6cb53bd8fb88c65ec87d,61cc6cbb3bd8fb88c6628b8c
3,CySorcerorD,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...,61cc6cb83bd8fb88c65fc8ad,61cc6cb53bd8fb88c65ec87e,61cc6cbb3bd8fb88c6628b8c
4,CySorcerorD,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,,61cc6cb83bd8fb88c65fc8ae,61cc6cb53bd8fb88c65ec87f,61cc6cbb3bd8fb88c6628b8c


In [51]:
check_for_nan_playlist = df['users_id'].isnull().values.any()
print (check_for_nan_playlist)

False


In [101]:
# df.to_csv('FinalDatasetWithMongoIDS.csv',index = False) # Save all changes in a dataframe

***

In [7]:
# Reload the dataset that carries the MongoDB ids from disk.
# NOTE(review): ids read back from a CSV are presumably plain strings rather than ObjectId,
# in which case the update_one filters on "_id" further down would match nothing — confirm,
# and convert song_id / playlist_id / users_id back to ObjectId if this reload path is used.
# NOTE(review): the commented-out to_csv call writes 'FinalDatasetWithMongoIDS.csv' into the
# working directory while this reads from './DatasetsUsed/' — verify the file is placed there.
df = pd.read_csv('./DatasetsUsed/FinalDatasetWithMongoIDS.csv')

# Handle embedded documents 

<h3> Add embedded document songs to playlist </h3>

In [52]:
# For every playlist, collect the (de-duplicated) songs it contains
group = (
    df.drop_duplicates(['playlist_id', 'song_id'])
      .reindex(columns=['song_id', 'artist', 'track', 'album', 'date', 'playlist_id'])
      .rename(columns={"song_id": "_id"})
      .groupby('playlist_id')
)
# Each group becomes a list of song dicts; fields whose value is null are left out
series = group.apply(
    lambda rows: [
        {field: value for field, value in record.items() if pd.notnull(value)}
        for record in rows.to_dict(orient='records')
    ]
)
df_playlist_song = pd.DataFrame({'playlist_id': series.index, 'songs_in_playlist': series.values})
df_playlist_song.head()

Unnamed: 0,playlist_id,songs_in_playlist
0,61cc6cb53bd8fb88c65ec87c,"[{'_id': 61cc6cb83bd8fb88c65fc8aa, 'artist': '..."
1,61cc6cb53bd8fb88c65ec87d,"[{'_id': 61cc6cb83bd8fb88c65fc8ab, 'artist': '..."
2,61cc6cb53bd8fb88c65ec87e,"[{'_id': 61cc6cb83bd8fb88c65fc8ad, 'artist': '..."
3,61cc6cb53bd8fb88c65ec87f,"[{'_id': 61cc6cb83bd8fb88c65fc8ae, 'artist': '..."
4,61cc6cb53bd8fb88c65ec880,"[{'_id': 61cc6cb83bd8fb88c65fc8af, 'artist': '..."


In [53]:
# Remove values not needed
def remove_song_values(x):
    """Strip the grouping key ``playlist_id`` from every embedded song document.

    Args:
        x: list of song dicts; modified in place.

    Returns:
        The same list object, with ``playlist_id`` removed from each dict.
    """
    for song in x:
        # pop() with a default instead of del: the dicts are mutated in place, so
        # re-running the cell must not raise KeyError on already-cleaned entries
        song.pop('playlist_id', None)
    return x

In [54]:
df_playlist_song['songs_in_playlist'] = df_playlist_song['songs_in_playlist'].apply(remove_song_values) #Remove values not needed 

In [55]:
df_playlist_song['songs_in_playlist'][0]

[{'_id': ObjectId('61cc6cb83bd8fb88c65fc8aa'),
  'artist': 'Crosby Stills Nash',
  'track': 'Helplessly Hoping'}]

In [56]:
# Add the column of embedded documents to the principal df and save it as "df_playlist_embedded"
df_playlist_embedded = (pd.merge(df.drop_duplicates(subset=['playlist_id']), df_playlist_song, on='playlist_id')) 

<h6> INSERT EMBEDDED DOCUMENT IN PLAYLIST </h6>

In [57]:
def insert_playlist_embedded(playlist_id,embedded_doc, username, users_id):
    """Store the embedded song list and the creator reference on one playlist document.

    Args:
        playlist_id: ``_id`` of the playlist document to update.
        embedded_doc: list of song dicts written to ``songs_in_playlist``.
        username: creator's username, written to ``created_by.username``.
        users_id: creator's id, written to ``created_by._id``.
    """
    selector = {"_id" : playlist_id}
    creator = {'_id': users_id, 'username': username}
    changes = { "$set": { 'songs_in_playlist': embedded_doc, 'created_by': creator } }
    playlist_coll.update_one(selector, changes)

In [58]:
%%capture
df_playlist_embedded.apply(lambda x: insert_playlist_embedded(x.playlist_id, x.songs_in_playlist, x.username, x.users_id), axis=1)

<h3> Add embedded document  playlists to songs </h3>

In [142]:
# For every song, collect the playlists that contain it (with each playlist's owner)
group = (
    df.drop_duplicates(['playlist_id', 'song_id'])
      .reindex(columns=['playlist_id', 'playlist', 'username', 'users_id', 'song_id'])
      .rename(columns={"playlist_id": "_id", "playlist": "name"})
      .groupby('song_id')
)
# Each group becomes a list of playlist dicts
series = group.apply(lambda rows: rows.to_dict('records'))
df_song_playlist = pd.DataFrame({'song_id': series.index, 'playlists_that_contains_it': series.values})
df_song_playlist.head()

Unnamed: 0,song_id,playlists_that_contains_it
0,61cc6cb83bd8fb88c65fc8aa,"[{'_id': 61cc6cb53bd8fb88c65ec87c, 'name': 'HA..."
1,61cc6cb83bd8fb88c65fc8ab,"[{'_id': 61cc6cb53bd8fb88c65ec87d, 'name': 'C4..."
2,61cc6cb83bd8fb88c65fc8ac,"[{'_id': 61cc6cb53bd8fb88c65ec87d, 'name': 'C4..."
3,61cc6cb83bd8fb88c65fc8ad,"[{'_id': 61cc6cb53bd8fb88c65ec87e, 'name': 'Ch..."
4,61cc6cb83bd8fb88c65fc8ae,"[{'_id': 61cc6cb53bd8fb88c65ec87f, 'name': 'Cl..."


In [143]:
# Remove values not needed
def remove_playlist_values(x):
    """Fold the owner fields into a ``creator`` sub-document and drop ``song_id``.

    Args:
        x: list of playlist dicts; modified in place.

    Returns:
        The same list object; each dict has ``users_id``/``username`` replaced by
        ``creator = {"_id": ..., "username": ...}`` and ``song_id`` removed.
    """
    for playlist in x:
        # The dicts are mutated in place, so a second run sees entries that were already
        # converted; skip those instead of raising KeyError. An entry holding only one of
        # the two owner keys still raises, before anything is modified.
        if 'users_id' in playlist or 'username' in playlist:
            playlist['creator'] = {"_id": playlist['users_id'], "username": playlist['username']}
            del playlist['users_id']
            del playlist['username']
        playlist.pop('song_id', None)
    return x

In [144]:
#Remove values not needed
df_song_playlist['playlists_that_contains_it'] = df_song_playlist['playlists_that_contains_it'].apply(remove_playlist_values)  

In [145]:
df_song_playlist['playlists_that_contains_it'][0]

[{'_id': ObjectId('61cc6cb53bd8fb88c65ec87c'),
  'name': 'HARD ROCK 2010',
  'creator': {'_id': ObjectId('61cc6cbb3bd8fb88c6628b8b'),
   'username': 'DeeCee'}}]

In [146]:
# Add the column of embedded documents to the principal df and save it as "df_songs_embedded"
df_songs_embedded = (pd.merge(df.drop_duplicates(subset=['song_id']), df_song_playlist, on='song_id'))

<h5> INSERT EMBEDDED DOCUMENT IN SONGS </h5>

In [147]:
def insert_songs_embedded(song_id,embedded_doc):
    """Store the list of containing playlists on one song document.

    Args:
        song_id: ``_id`` of the song document to update.
        embedded_doc: list of playlist dicts written to the ``playlists`` field.
    """
    selector = {"_id" : song_id}
    changes = { "$set": { 'playlists': embedded_doc } }
    songs_coll.update_one(selector, changes)

In [148]:
%%capture
df_songs_embedded.apply(lambda x: insert_songs_embedded(x.song_id, x.playlists_that_contains_it), axis=1)

<h3> Add embedded document  playlists to users </h3>

In [135]:
# For every user, collect the playlists that user created
group = (
    df.drop_duplicates(['users_id', 'playlist_id'])
      .reindex(columns=['playlist_id', 'playlist', 'users_id'])
      .rename(columns={"playlist_id": "_id", "playlist": "name"})
      .groupby('users_id')
)
# Each group becomes a list of playlist dicts
series = group.apply(lambda rows: rows.to_dict('records'))
df_user_playlist = pd.DataFrame({'users_id': series.index, 'playlist_created': series.values})
df_user_playlist.head()

Unnamed: 0,users_id,playlist_created
0,61cc6cbb3bd8fb88c6628b8b,"[{'_id': 61cc6cb53bd8fb88c65ec87c, 'name': 'HA..."
1,61cc6cbb3bd8fb88c6628b8c,"[{'_id': 61cc6cb53bd8fb88c65ec87d, 'name': 'C4..."
2,61cc6cbb3bd8fb88c6628b8d,"[{'_id': 61cc6cb53bd8fb88c65ec881, 'name': 'So..."
3,61cc6cbb3bd8fb88c6628b8e,"[{'_id': 61cc6cb53bd8fb88c65ec882, 'name': '20..."
4,61cc6cbb3bd8fb88c6628b8f,"[{'_id': 61cc6cb53bd8fb88c65ec883, 'name': 'La..."


In [136]:
# Remove values not needed
def remove_playlist_user_values(x):
    """Strip the grouping key ``users_id`` from every embedded playlist document.

    Args:
        x: list of playlist dicts; modified in place.

    Returns:
        The same list object, with ``users_id`` removed from each dict.
    """
    for playlist in x:
        # pop() with a default instead of del: the dicts are mutated in place, so
        # re-running the cell must not raise KeyError on already-cleaned entries
        playlist.pop('users_id', None)
    return x

In [137]:
#Remove values not needed 
df_user_playlist['playlist_created'] = df_user_playlist['playlist_created'].apply(remove_playlist_user_values) 

In [138]:
df_user_playlist['playlist_created'][0]

[{'_id': ObjectId('61cc6cb53bd8fb88c65ec87c'), 'name': 'HARD ROCK 2010'}]

In [139]:
# Add the column of embedded documents to the principal df and save it as "df_users_embedded"
df_users_embedded = (pd.merge(df.drop_duplicates(subset=['users_id']), df_user_playlist, on='users_id')) # Merge user_id column with dataset

<h6> GENERATE PASSWORD AND ADD ATTRIBUTE "isAdmin" AS FALSE FOR EACH USER </h6>

In [157]:
def get_random_string(length):
    """Return a random string of ASCII letters (lower and upper case).

    The result is used as a user password, so characters are drawn with the
    ``secrets`` module rather than the non-cryptographic ``random`` generator.

    Args:
        length: number of characters to generate.

    Returns:
        A string of ``length`` characters taken from ``string.ascii_letters``.
    """
    # NOTE(review): the notebook stores this value as-is in the users collection;
    # consider hashing it before it is written.
    return ''.join(secrets.choice(string.ascii_letters) for _ in range(length))

In [158]:
df_users_embedded['password'] = [ get_random_string(16)  for k in df_users_embedded.users_id]

In [159]:
df_users_embedded['isAdmin'] = [ False for k in df_users_embedded.users_id]

<h5> INSERT EMBEDDED DOCUMENT, PASSWORD AND isAdmin IN USERS </h5>

In [162]:
def insert_users_embedded(user_id,embedded_doc, password, isAdmin):
    """Store the created playlists, the password and the admin flag on one user document.

    Args:
        user_id: ``_id`` of the user document to update.
        embedded_doc: list of playlist dicts written to ``createdPlaylists``.
        password: value written to the ``password`` field.
        isAdmin: value written to the ``isAdmin`` field.
    """
    selector = {"_id" : user_id}
    new_fields = { 'createdPlaylists': embedded_doc, 'password': password, 'isAdmin': isAdmin }
    users_coll.update_one(selector, { "$set": new_fields })

In [163]:
%%capture
df_users_embedded.apply(lambda x: insert_users_embedded(x.users_id, x.playlist_created, x.password, x.isAdmin), axis=1)

***

# Change attribute names of documents in MongoDB

In [104]:
playlist_coll.update_many( {}, { "$rename": { 
    "playlist": "name",
    "date_of_creation": "creationDate",
    "created_by": "creator",
    "songs_in_playlist": "songs"
} } )

<pymongo.results.UpdateResult at 0x7fe7623cbcc0>

In [134]:
# Rename song fields to their final names on every document in the collection.
# NOTE(review): insert_songs_embedded writes the embedded list under 'playlists', not
# 'playlists_that_contains_it', so the third rename presumably has nothing to rename — confirm.
songs_coll.update_many( {}, { "$rename": { 
    "date": "year",
    "lyrics": "lyric",
    "playlists_that_contains_it": "playlists",
} } )

# Drop the 'creator' sub-document from every element of each song's 'playlists' array.
songs_coll.update_many( {}, { "$unset": {
    "playlists.$[].creator":""
} } )

<pymongo.results.UpdateResult at 0x7fe787e783c0>

In [151]:
users_coll.update_many( {}, { "$rename": { 
    "date_of_creation":"dateOfCreation",
    "date_of_birth": "dateOfBirth"
} } )

<pymongo.results.UpdateResult at 0x7fe701f0d140>

***

<h1> Setup Neo4j connection and insert data </h1>

<h3> Setup data for Neo4j nodes </h3>

In [73]:
# PLAYLIST CSV: one row per playlist with its id, name and owner's username
playlist_csv = (
    df.drop_duplicates(subset=['playlist_id'])
      .drop(['artist','track','artist_track','album','date','lyrics','song_id','users_id'], axis=1)
      .reindex(columns=['playlist_id','playlist','username'])
)
# The name is later embedded in a double-quoted Cypher string, so swap " for '
playlist_csv = playlist_csv.assign(
    playlist=playlist_csv['playlist'].apply(lambda name: str(name).replace('"',"'"))
)
playlist_csv.head()

Unnamed: 0,playlist_id,playlist,username
0,61cc6cb53bd8fb88c65ec87c,HARD ROCK 2010,DeeCee
1,61cc6cb53bd8fb88c65ec87d,C418,CySorcerorD
3,61cc6cb53bd8fb88c65ec87e,Chill out,CySorcerorD
4,61cc6cb53bd8fb88c65ec87f,Classique,CySorcerorD
5,61cc6cb53bd8fb88c65ec880,Electro,CySorcerorD


In [74]:
# SONG CSV: one row per song with the properties stored on the Song node
songs_csv = (
    df.drop_duplicates(subset=['song_id'])
      .drop(['username','playlist','artist_track','playlist_id','users_id','date','lyrics'], axis=1)
      .reindex(columns=['track','song_id','artist','album'])
)
# Stringify the text columns and swap " for ', since they are later embedded in
# double-quoted Cypher strings. str() is applied to every value, missing ones included.
songs_csv = songs_csv.assign(
    track=songs_csv['track'].apply(lambda value: str(value).replace('"',"'")),
    artist=songs_csv['artist'].apply(lambda value: str(value).replace('"',"'")),
    album=songs_csv['album'].apply(lambda value: str(value).replace('"',"'")),
)
songs_csv.head()

Unnamed: 0,track,song_id,artist,album
0,Helplessly Hoping,61cc6cb83bd8fb88c65fc8aa,Crosby Stills Nash,
1,Chris,61cc6cb83bd8fb88c65fc8ab,C418,Minecraft - Volume Alpha
2,Équinoxe,61cc6cb83bd8fb88c65fc8ac,C418,
3,I Know the Truth,61cc6cb83bd8fb88c65fc8ad,Pretty Lights,
4,Sonata No 23 In F Minor Op 57 Appassionata Ass...,61cc6cb83bd8fb88c65fc8ae,Ludwig van Beethoven,


In [75]:
# USERS CSV: one row per user with its id and username
users_csv = (
    df.drop_duplicates(subset=['users_id'])
      .drop(['artist','track','artist_track','playlist','album','date','lyrics','song_id','playlist_id'], axis=1)
      .reindex(columns=['users_id', 'username'])
)
# The username is later embedded in a double-quoted Cypher string, so swap " for '
users_csv = users_csv.assign(
    username=users_csv['username'].apply(lambda name: str(name).replace('"',"'"))
)
users_csv.head()

Unnamed: 0,users_id,username
0,61cc6cbb3bd8fb88c6628b8b,DeeCee
1,61cc6cbb3bd8fb88c6628b8c,CySorcerorD
6,61cc6cbb3bd8fb88c6628b8d,guesg
8,61cc6cbb3bd8fb88c6628b8e,SpaceGhostBenoit
9,61cc6cbb3bd8fb88c6628b8f,Meandmylife


In [96]:
# Random number of playlist followed by each user
users_csv['n_playlist_followed'] = [ random.randint(0,50)  for k in users_csv.users_id] 

In [98]:
# Random number of users followed by each user
users_csv['n_user_followed'] = [ random.randint(0,100)  for k in users_csv.users_id]

<h3> Associate comment_author and artist_track to the comments </h3>

In [76]:
# Give every comment the id of the song it refers to, matched through artist_track
df_unique_songs = df_songs['artist_track'].tolist() # Unique artist_track values
df_unique_songs_id = songs_id.inserted_ids # Ids returned by insert_many
zip_iterator = zip(df_unique_songs, df_unique_songs_id)
key_value_dict = {artist_track: song_id for artist_track, song_id in zip_iterator} # {artist_track: song_id}
df_comments["song_id"] = df_comments["artist_track"].map(key_value_dict) # Add the song_id column to the comments

In [77]:
# Give every comment the id of its author, matched through the username
df_unique_users = df['username'].tolist() # Username of every row (repeats included)
df_unique_users_id = df['users_id'].tolist() # users_id of every row, aligned with the usernames
zip_iterator = zip(df_unique_users, df_unique_users_id)
key_value_dict = {username: user_id for username, user_id in zip_iterator} # {username: users_id}, repeats collapse
df_comments["users_id"] = df_comments["author"].map(key_value_dict) # Add the users_id column to the comments

In [82]:
# COMMENTS CSV: the comments frame without artist_track, cleaned for insertion into Neo4j
comments_csv = df_comments.drop(['artist_track'],axis=1)
# Author and comment text are later embedded in double-quoted Cypher strings, so swap " for '
comments_csv['author'] = comments_csv['author'].apply(lambda x: str(x).replace('"',"'"))
comments_csv['comment'] = comments_csv['comment'].apply(lambda x: str(x).replace('"',"'"))
# Also replace the left curly double quote (the right one is left untouched)
comments_csv['comment'] = comments_csv['comment'].apply(lambda x: str(x).replace('“',"'"))
# Convert the Unix timestamp to a datetime.
# NOTE(review): fromtimestamp() uses the machine's local timezone, so the stored
# values depend on where the notebook runs — confirm this is intended.
comments_csv['date'] = comments_csv['date'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [91]:
comments_csv['vote'] = [ random.randint(1,5)  for k in comments_csv.comment] # Create a random vote foreach comment

In [83]:
comments_csv.head()

Unnamed: 0,author,date,comment,song_id,users_id
0,DeeCee,2014-06-24 21:30:35,We should embrace the contractions here! The '...,61cc6cb83bd8fb88c65fe4f2,61cc6cbb3bd8fb88c6628b8b
1,CySorcerorD,2017-05-12 08:26:41,replace the A with an I to correct the 'A love...,61cc6cb83bd8fb88c65fe4f2,61cc6cbb3bd8fb88c6628b8c
2,guesg,2021-05-04 21:00:41,I love that this song was the theme song for W...,61cc6cb83bd8fb88c65fe4f2,61cc6cbb3bd8fb88c6628b8d
3,SpaceGhostBenoit,2021-07-18 00:30:19,stayed up every school night as a kid just to ...,61cc6cb83bd8fb88c65fe4f2,61cc6cbb3bd8fb88c6628b8e
4,Meandmylife,2014-08-15 07:00:58,you should add ‘hold your drinks’ before the c...,61cc6cb83bd8fb88c66067bf,61cc6cbb3bd8fb88c6628b8f


***

<h3> Setup neo4j connection and insert nodes / create relationships </h3>

In [84]:
from neo4j import GraphDatabase

In [85]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs one query per session.

    Errors are reported with ``print`` rather than raised: a failed driver
    creation leaves the connection unusable, and a failed query returns ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Create the driver for ``uri`` using basic authentication.

        Args:
            uri: connection URI passed to ``GraphDatabase.driver``.
            user: user name used for authentication.
            pwd: password used for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver, if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a Cypher query in a fresh session and return its records.

        Args:
            query: the Cypher text to run.
            db: optional database name; the driver's default is used when ``None``.
            parameters: optional dict of query parameters, referenced as ``$name``
                in the Cypher text. Passing values this way avoids building the
                query by string interpolation, which breaks on quotes and
                backslashes inside the values.

        Returns:
            A list with the records produced by the query, or ``None`` when the
            query failed (the error is printed).

        Raises:
            AssertionError: if the driver could not be created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Consume the result while the session is still open
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [94]:
conn = Neo4jConnection(uri="neo4j://127.0.0.1:7687/browser/",user = "", pwd = "") # Neo4j connection

In [None]:
# INSERT PLAYLIST NODES
# One CREATE query per playlist row, run against the 'SYP' database.
# NOTE(review): the name is interpolated straight into the Cypher text; double quotes were
# replaced earlier, but other special characters (e.g. backslashes) may still break the query.

for index, row in playlist_csv.iterrows():
    query_string = f'''
        CREATE (:Playlist {{name: "{row.playlist}", id: "{row.playlist_id}" }})
    '''
    conn.query(query_string,db='SYP')

In [88]:
# INSERT SONG NODES
# One CREATE query per song row, run against the 'SYP' database.
# NOTE(review): values are interpolated straight into the Cypher text; double quotes were
# replaced earlier, but other special characters (e.g. backslashes) may still break the query.

for index, row in songs_csv.iterrows():
    query_string = ''
    # The album column was stringified when songs_csv was built, so a missing album
    # shows up as the text 'nan'; such songs are created without the album property
    if(str(row.album) == 'nan'):
        query_string = f'''
            CREATE (:Song {{
            id: "{row.song_id}",
            track: "{row.track}",
            artist: "{row.artist}"
            }})
        '''  
    else:
        query_string = f'''
            CREATE (:Song {{
            id: "{row.song_id}",
            track: "{row.track}",
            artist: "{row.artist}",
            album: "{row.album}"
            }})
        '''
    conn.query(query_string,db='SYP')

In [89]:
# INSERT USER NODES
# One CREATE query per user row, run against the 'SYP' database.
# NOTE(review): the username is interpolated straight into the Cypher text; double quotes were
# replaced earlier, but other special characters (e.g. backslashes) may still break the query.

for index, row in users_csv.iterrows():
    query_string = f'''
        CREATE (:User {{name: "{row.username}", id: "{row.users_id}" }})
    '''
    conn.query(query_string,db='SYP')

<h6> CREATE RELATIONSHIPS </h6>

In [95]:
# CREATE COMMENT NODES AND RELATIONSHIPS
# User ->: WRITE -> Comment
# Comment ->: RELATED -> Song
# The Comment node is only created when both MATCH clauses find a node.
# NOTE(review): date and vote are written as quoted strings, and the comment body is
# interpolated straight into the Cypher text — text containing backslashes may make the
# query fail, which conn.query only reports with a print.

for index, row in comments_csv.iterrows():
    query_string = f'''
        MATCH (u:User) WHERE u.id = "{row.users_id}"
        MATCH (s:Song) WHERE s.id = "{row.song_id}"
        CREATE (c:Comment {{date: "{row.date}", body: "{row.comment}", vote: "{row.vote}"}})
        CREATE (u)-[:WRITE]->(c)
        CREATE (c)-[:RELATED]->(s)
    '''
    conn.query(query_string,db='SYP')

In [97]:
# CREATE RELATIONSHIP
# User ->: LIKES -> Playlist
# Each user likes a random sample of n_playlist_followed playlists (one query per pair).

for index, row in users_csv.iterrows():
    # sample() draws without replacement, so a user never likes the same playlist twice
    for index2, row2 in playlist_csv.sample(row.n_playlist_followed).iterrows():
        query_string = f'''
        MATCH (u:User) WHERE u.id = "{row.users_id}"
        MATCH (p:Playlist) WHERE p.id = "{row2.playlist_id}"
        CREATE (u)-[:LIKES]->(p)
        '''

        conn.query(query_string,db='SYP')

In [99]:
# CREATE RELATIONSHIP
# User ->: FOLLOWS -> User
# Each user follows a random sample of n_user_followed users (one query per pair).
# The sample is drawn from all users, so it can contain the user itself.

for index, row in users_csv.iterrows():
    for index2, row2 in users_csv.sample(row.n_user_followed).iterrows():
        query_string = f'''
        MATCH (u:User) WHERE u.id = "{row.users_id}"
        MATCH (u_2:User) WHERE u_2.id = "{row2.users_id}"
        CREATE (u)-[:FOLLOWS]->(u_2)
        '''
        #print(query_string)
        conn.query(query_string,db='SYP')

In [100]:
# Delete any self-relationships (a user following itself) from the Neo4j graph
query_string = f'''
        MATCH (a:User)-[rel:FOLLOWS]->(a) 
        DELETE rel;
        '''
conn.query(query_string,db='SYP')

[]