# Top Spotify Hits
# Dataset: https://www.kaggle.com/datasets/paradisejoy/top-hits-spotify-from-20002019?resource=download



In [99]:
# Initial setup: load the Spotify top-hits CSV into a DataFrame.

import pandas as pd

# Path of the CSV export, relative to the notebook's working directory.
file_path = "./Data/songs_normalize.csv"

df = pd.read_csv(filepath_or_buffer=file_path)

# Cell output: the dtype pandas inferred for every column.
df.dtypes

artist               object
song                 object
duration_ms           int64
explicit               bool
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object

In [100]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [101]:
# Question 1: How many records are in the dataset?

# The length of the row index is the number of records.
num_rows = len(df.index)
print(f"There are {num_rows} records.")

# Cell output: the last five rows of the frame.
df.tail()


There are 2000 records.


Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
1995,Jonas Brothers,Sucker,181026,False,2019,79,0.842,0.734,1,-5.065,0,0.0588,0.0427,0.0,0.106,0.952,137.958,pop
1996,Taylor Swift,Cruel Summer,178426,False,2019,78,0.552,0.702,9,-5.707,1,0.157,0.117,2.1e-05,0.105,0.564,169.994,pop
1997,Blanco Brown,The Git Up,200593,False,2019,69,0.847,0.678,9,-8.635,1,0.109,0.0669,0.0,0.274,0.811,97.984,"hip hop, country"
1998,Sam Smith,Dancing With A Stranger (with Normani),171029,False,2019,75,0.741,0.52,8,-7.513,1,0.0656,0.45,2e-06,0.222,0.347,102.998,pop
1999,Post Malone,Circles,215280,False,2019,85,0.695,0.762,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.553,120.042,hip hop


In [102]:
# Question 2: What year is the oldest song in the dataset from?

# idxmin() gives the index label of the first row holding the smallest year.
oldest_index = df['year'].idxmin()
min_value_row = df.loc[oldest_index]
print(f"The oldest song is {min_value_row['song']} ({int(min_value_row['year'])})")


The oldest song is Hot Boyz (1998)


In [103]:
# Question 3: What is the most popular song in the dataset?

# idxmax() gives the index label of the first row holding the highest popularity.
most_popular_index = df['popularity'].idxmax()
max_value_row = df.loc[most_popular_index]
print(f"The most popular song is {max_value_row['song']} ({int(max_value_row['popularity'])})")

The most popular song is Sweater Weather (89)


In [104]:
# Question 4: How many unique genres are there?

# A genre cell can list several comma-separated genres, so split each cell
# into a list and explode it into one row per (song, genre) pair.
genre_lists = df['genre'].astype(str).str.split(',')
df_copy = df.assign(genre=genre_lists).explode('genre')

# The pieces are inconsistently padded with whitespace, so trim each one.
df_copy['genre'] = df_copy['genre'].str.strip()

unique_count_column1 = df_copy['genre'].nunique()
print(f"Number of unique genres: {unique_count_column1}")


Number of unique genres: 15


In [105]:
# Question 5 What is the most popular unique individual genre by number of songs on the list?

def get_count_genre(group):
    """Return the number of rows in ``group`` (the first entry of ``group.shape``).

    Written as a callback for ``groupby(...).apply``, where ``group`` is the
    sub-frame belonging to a single key.
    """
    row_count, *_ = group.shape
    return row_count

# Count rows per genre with the built-in groupby size() instead of apply()
# with a row-counting helper. The result is the same Series (genre -> count,
# index sorted by genre), so idxmax() breaks ties the same way.
top_genres_by_count = df_copy.groupby('genre').size()
print(top_genres_by_count.idxmax())


pop


In [106]:
# Question 6 What is the most popular artist by total number of songs on the list

df_copy_2 = df.copy()
df_copy_2['artist'] = df_copy_2['artist'].astype(str)
# Some songs might have more than one artist. It appears to be not so in this list, but just in case.
# NOTE(review): an artist name that itself contains a comma would be split too - confirm none exist.
df_copy_2['artist'] = df_copy_2['artist'].str.split(',')
df_copy_2 = df_copy_2.explode('artist')
df_copy_2['artist'] = df_copy_2['artist'].str.strip()

# Count songs per artist on df_copy_2. The previous version grouped df_copy,
# the genre-exploded frame from Question 4, which counts a song once per
# genre and so inflates the totals of artists with multi-genre songs.
# NOTE: the variable keeps its original name so any later cell that reads it
# still works, even though it holds per-artist counts here.
top_genres_by_count = df_copy_2.groupby('artist').size()
print(top_genres_by_count.idxmax())


Rihanna


In [107]:
# Question 7 Who is the artist of the longest song name?

# Measure title lengths in a standalone Series rather than adding (and later
# dropping) a temporary column on the shared frame, so df is never mutated and
# re-running or interrupting this cell cannot leave a stray column behind.
# .str.len() yields NaN for a missing title (idxmax skips it) instead of raising.
song_lengths = df['song'].str.len()
index_of_longest = song_lengths.idxmax()

# Get the longest string
longest_string = df.at[index_of_longest, 'song']
print(f"The longest string in song is: {longest_string}")
print("The artist is: " + df.at[index_of_longest, 'artist'] )


The longest string in song is: I Don’t Wanna Live Forever (Fifty Shades Darker) - From "Fifty Shades Darker (Original Motion Picture Soundtrack)"
The artist is: ZAYN


In [108]:
# Question 8: What is the total duration of all songs, in milliseconds?

# Add up the duration_ms column.
duration_column = df['duration_ms']
sum_ms = duration_column.sum()
print("Duration of songs in millseconds = {}".format(sum_ms))

Duration of songs in millseconds = 457496249


In [109]:
# Question 9: What is the second most popular explicit song from 2010?

# Keep only rows that are flagged explicit and released in 2010.
is_explicit_2010 = df['explicit'] & df['year'].eq(2010)
filtered_df = df.loc[is_explicit_2010]

# Highest popularity first, so position 1 is the runner-up.
sorted_df = filtered_df.sort_values('popularity', ascending=False)
second_highest_row = sorted_df.iloc[1]

runner_up_title = second_highest_row['song']
runner_up_score = int(second_highest_row['popularity'])
print(f"The second most popular explicit song from 2010 is {runner_up_title} ({runner_up_score})")


The second most popular explicit song from 2010 is Not Afraid (79)


In [111]:
#Question 10 Convert to parquet and diff

import os

# Write the frame out as parquet next to the source CSV.
file_path_parquet ='./Data/spotify_top.parquet'
df.to_parquet(file_path_parquet)

# Get the sizes of the files
file1_size = os.path.getsize(file_path)
file2_size = os.path.getsize(file_path_parquet)

print(f"Size of {file_path}: {file1_size} bytes")
print(f"Size of {file_path_parquet}: {file2_size} bytes")

# Positive when the parquet file is smaller than the CSV, negative when larger.
difference = file1_size - file2_size

# Report the direction explicitly; the message used to say "smaller" even when
# the parquet file came out larger than, or the same size as, the CSV.
if difference > 0:
    print("The new file is " + str(difference) + " bytes smaller than the original file.")
elif difference < 0:
    print("The new file is " + str(-difference) + " bytes larger than the original file.")
else:
    print("The new file is the same size as the original file.")


Size of ./Data/songs_normalize.csv: 254987 bytes
Size of ./Data/spotify_top.parquet: 147259 bytes
The new file is 107728 bytes smaller than the original file.
