In [16]:
import pandas as pd

In [17]:
# Load the track CSV into a DataFrame; the trailing expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer='../data/artist_top10_track_clean.csv')
df.shape

(80, 9)

In [18]:
# Inspect the frame's structure: index range, column names, non-null counts,
# dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   track_rank          80 non-null     int64 
 1   album               80 non-null     object
 2   artist              80 non-null     object
 3   track_name          80 non-null     object
 4   popularity          80 non-null     int64 
 5   track_in_album      80 non-null     int64 
 6   duration            80 non-null     int64 
 7   follower            80 non-null     int64 
 8   duration_formatted  80 non-null     object
dtypes: int64(5), object(4)
memory usage: 5.8+ KB


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max).
# Only the numeric columns appear in the result; the text columns are left out.
df.describe()

Unnamed: 0,track_rank,popularity,track_in_album,duration,follower
count,80.0,80.0,80.0,80.0,80.0
mean,5.5,58.2,3.525,228675.025,2428132.0
std,2.890403,12.719366,3.638664,42046.522745,3283280.0
min,1.0,35.0,1.0,126312.0,20000.0
25%,3.0,46.0,1.0,196899.25,183377.0
50%,5.5,59.5,1.5,226500.0,1096752.0
75%,8.0,67.0,6.0,259668.0,2832424.0
max,10.0,85.0,17.0,337240.0,10278330.0


In [20]:
# Select rows by index label. A .loc slice includes BOTH endpoints,
# so 0:2 returns the rows labelled 0, 1 and 2 (three rows).
df.loc[0:2]

Unnamed: 0,track_rank,album,artist,track_name,popularity,track_in_album,duration,follower,duration_formatted
0,1,PUN,PUN,ที่เดิม,71,9,236062,610084,3:56
1,2,PUN,PUN,I Just Wanna Know,69,1,223500,610084,3:43
2,3,Living Death,PUN,Living Death,68,1,258823,610084,4:18


In [21]:
# For every column, report the row label at which that column's largest value first occurs.
# NOTE(review): for the object (text) columns the "largest" value is presumably decided by
# string ordering, which is rarely meaningful - confirm whether those rows are wanted.
df.idxmax(axis='index')

track_rank             9
album                 16
artist                20
track_name            72
popularity            30
track_in_album        36
duration              43
follower              30
duration_formatted    43
dtype: int64

In [22]:
# Find the most popular track.
# Step 1: row label of the highest popularity score (the first such row if several tie).
index_of_most_popular_song = df['popularity'].idxmax()
# Step 2: pull that complete row (all columns) as a Series.
most_popular_song = df.loc[index_of_most_popular_song, :]
most_popular_song

track_rank                        1
album                   SMITHEREENS
artist                         Joji
track_name            Glimpse of Us
popularity                       85
track_in_album                    1
duration                     233453
follower                   10278328
duration_formatted             3:53
Name: 30, dtype: object

In [23]:
# Rank artists by the mean popularity of their tracks, highest first.
artist_popularity = (
    df.groupby('artist')['popularity']
    .mean()
    .sort_values(ascending=False)
)
artist_popularity

artist
Joji             76.3
Fujii Kaze       67.6
PUN              64.5
Cocktail         63.1
THE TOYS         55.5
Paul Partohap    54.9
oftn             42.6
J_ust            41.1
Name: popularity, dtype: float64

In [24]:
# Rank artists by follower count, highest first.
# A list of column names inside the brackets (df[[...]]) yields a DataFrame;
# a single name (df[...]) would yield a Series instead.
artist_follower = (
    df[['artist', 'follower']]
    .drop_duplicates()  # collapse identical (artist, follower) rows
    .sort_values(by='follower', ascending=False, ignore_index=True)  # renumber rows 0..n-1
)
artist_follower

Unnamed: 0,artist,follower
0,Joji,10278328
1,Fujii Kaze,4266974
2,Cocktail,2354241
3,THE TOYS,1583421
4,PUN,610084
5,Paul Partohap,210749
6,J_ust,101261
7,oftn,20000


In [25]:
# Find the artist with the highest number of followers.
# idxmax gives the label of the first row holding the maximum follower value,
# so the result below is that whole track row, not just the artist name.
most_follower_idx = df['follower'].idxmax()
most_follower = df.loc[most_follower_idx, :]
most_follower

track_rank                        1
album                   SMITHEREENS
artist                         Joji
track_name            Glimpse of Us
popularity                       85
track_in_album                    1
duration                     233453
follower                   10278328
duration_formatted             3:53
Name: 30, dtype: object

In [26]:
# Average track duration per artist, longest first, then rendered as "m:ss".
def _as_min_sec(ms):
    """Format a duration given in milliseconds as 'm:ss' (seconds truncated, zero-padded)."""
    whole_minutes = int(ms // 60000)
    leftover_seconds = int((ms % 60000) / 1000)
    return f'{whole_minutes}:{leftover_seconds:02}'


song_mean = (
    df.groupby('artist')['duration']
    .mean()
    .sort_values(ascending=False)
)
song_mean_format = song_mean.apply(_as_min_sec)
song_mean_format


artist
Cocktail         4:34
Fujii Kaze       4:19
PUN              4:00
J_ust            3:54
Paul Partohap    3:46
THE TOYS         3:20
Joji             3:18
oftn             3:15
Name: duration, dtype: object