In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

How does the presence of one or more highly popular songs in an album affect the
popularity and listener engagement of other tracks within the same album?

Lets see the popularity values

In [4]:
# df.sort_values(by='popularity', ascending=False)
# df[df['popularity'] >= 90]
#df

In [5]:
# Filter the df
grouped_df = df[['album_name', 'track_name', 'popularity']]

In [6]:
# Check for null values
grouped_df['popularity'].apply(type).unique()
# This tells us that all the values are integers, so we do not have to deal with NaN or Null values.

array([<class 'int'>], dtype=object)

In [7]:
# Classify each song as highly popular or not 
popular_song_value = grouped_df['popularity'].quantile(0.9)
grouped_df['highly_popular'] = np.where(grouped_df['popularity'] > popular_song_value, 'True', 'False')
grouped_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped_df['highly_popular'] = np.where(grouped_df['popularity'] > popular_song_value, 'True', 'False')


Unnamed: 0,album_name,track_name,popularity,highly_popular
0,Comedy,Comedy,73,True
1,Ghost (Acoustic),Ghost - Acoustic,55,False
2,To Begin Again,To Begin Again,57,False
3,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,True
4,Hold On,Hold On,82,True
...,...,...,...,...
113995,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,False
113996,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,False
113997,Best Of,Miss Perfumado,22,False
113998,Change Your World,Friends,41,False


In [8]:
# Group albums
grouped_df = grouped_df.sort_values(by='album_name')

# Remove the rows where the value is NaN for album_name and track_name
clean_df = grouped_df.dropna(subset=['album_name'])
clean_df = grouped_df.dropna(subset=['track_name'])
clean_df

Unnamed: 0,album_name,track_name,popularity,highly_popular
101954,! ! ! ! ! Whispers ! ! ! ! !,Static Rainstorm,32,False
101543,! ! ! ! 300 Sounds of the Ocean ! ! ! !,Fast Water Background,36,False
101961,! ! % > (( Shelter )) < % ! !,Pink Noise Waves,32,False
101586,"! !""#Reboot#""! !",Transport,35,False
101582,"!!!"" Baby Sleep Aid Rain Sounds ""!!!",Fall Asleep Rain,23,False
...,...,...,...,...
65855,공감 (Empathy) - The 1st Mini Album,I'm Gonna Love You,66,True
99504,당신이 잠든 사이에 Pt. 1 Original Television Soundtrack,긴 밤이 오면,56,False
102506,당신이 잠든 사이에 Pt. 1 Original Television Soundtrack,긴 밤이 오면,56,False
65317,당신이 잠든 사이에 Pt. 2 Original Television Soundtrack,It's You,68,True


In [9]:
# Get stats for each album

album_stats_df = clean_df.groupby('album_name').agg(
    total_popularity=('popularity', 'sum'),
    total_tracks=('track_name', 'count'),
    mean_popularity=('popularity', 'mean'),
    highly_popular=('highly_popular', lambda x: (x == 'True').sum())
).reset_index()

album_stats_df.sort_values(by='highly_popular', ascending=False)

Unnamed: 0,album_name,total_popularity,total_tracks,mean_popularity,highly_popular
40276,Un Verano Sin Ti,5048,56,90.142857,56
1501,AM,1941,24,80.875000,24
31232,Return of the Dream Canteen,2874,45,63.866667,23
17395,Hybrid Theory (Bonus Edition),2152,31,69.419355,22
37077,The Black Parade,1480,21,70.476190,20
...,...,...,...,...,...
16354,Heaven Knows,26,1,26.000000,0
16355,Heaven On Earth,83,2,41.500000,0
16356,Heaven Or Hell,18,1,18.000000,0
16357,Heaven Takes You Home (Alternative Mix),59,1,59.000000,0


In [10]:
df.isna().sum()

Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [11]:
df[df['album_name'].isna()]

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
65900,65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,False,0.501,0.583,...,-9.46,0,0.0605,0.69,0.00396,0.0747,0.734,138.391,4,k-pop
