In [85]:
from py2neo import Graph
import pandas as pd
from collections import Counter

In [6]:
graph = Graph(bolt = True, host = "localhost", name = "Spotify", user = "neo4j", password = "qrks")

## 1. Selection of ALL Songs for ALL Artists in Neo4j

In [89]:
query_all = """
     MATCH (a:Artist)-[:ART_TR]->(t:Track)
     RETURN a.artist_id, a.artist_name, a.popularity, a.is_main, t.track_id, t.track_name, t.streams
     ORDER BY a.popularity DESC, a.artist_id, t.streams DESC
"""

In [90]:
cursor = graph.run(query_all)
df_all = pd.DataFrame.from_records(cursor, columns=cursor.keys())

In [99]:
df_all.columns = ["artist_id", "artist_name", "popularity", "is_main", "track_id", "track_name", "streams"]
df_all.reset_index(inplace=True)

### Export to csv (/Spotify/data/01_queries_yt)

In [101]:
out = "../data/01_queries_yt/"

In [102]:
df_all.to_csv(out + "queries.csv", header=True, sep=";", index=False)

## 2. Selection of top Songs for top Artists in Neo4j

In [79]:
query_most_pop_artist = """

    // Get the 300 top artists sorted by popularity in a list to slice it to only 300
    MATCH (a:Artist)
    WITH a
    ORDER BY a.popularity DESC
    
    WITH COLLECT(a)[1..300] as TopArtists
    
    // After slicing, UNWIND the collection and match the tracks for each artist
    UNWIND TopArtists as aa
    MATCH (aa:Artist)-[r:ART_TR]->(t:Track)
    
    // For each artist unwinded, order its tracks by streams descendingly
    WITH aa, t
    ORDER BY aa.artist_id, t.streams DESC
    
    // Convert this table into a row per artist (aa) where all the tracks are summarixed ina trackList
    WITH aa, COLLECT(t)[1..20] AS trackList
    
    // After slicing that list into 20 elements, get the artist_name and the track_name after UNWINDIND that list
    UNWIND trackList as tracks
    RETURN aa.artist_name, tracks.track_name
"""

In [80]:
cursor = graph.run(query_most_pop_artist)
df = pd.DataFrame.from_records(cursor, columns=cursor.keys())

In [81]:
df.columns = ["artist", "track"]

In [82]:
df_grouped = pd.DataFrame(df.groupby("artist")["track"].count())
df_grouped.sort_values("track", ascending = False, inplace = True)

In [87]:
# Make sure all artists have only 20 songs associated as maximum
df_grouped

Unnamed: 0_level_0,track
artist,Unnamed: 1_level_1
2 Chainz,19
Ne-Yo,19
Marília Mendonça,19
Matheus & Kauan,19
Meek Mill,19
...,...
Who Is Fancy,1
MC Zuka,1
Luis Figueroa,1
Tay Keith,1
