In [1]:
import pandas as pd

In [2]:
# Load the track dataset from CSV; the bare last expression shows (rows, columns).
DATA_PATH = '../data/artist_top10_track_clean.csv'
df = pd.read_csv(DATA_PATH)
df.shape

(80, 9)

In [None]:
# Summarize the frame's structure: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Rank                   80 non-null     int64 
 1   Albums                 80 non-null     object
 2   Artist                 80 non-null     object
 3   Name                   80 non-null     object
 4   Popularity             80 non-null     int64 
 5   Track Number In Album  80 non-null     int64 
 6   Duration               80 non-null     int64 
 7   Followers              80 non-null     int64 
 8   Duration(min:sec)      80 non-null     object
dtypes: int64(5), object(4)
memory usage: 5.8+ KB


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Rank,Popularity,Track Number In Album,Duration,Followers
count,80.0,80.0,80.0,80.0,80.0
mean,5.5,58.35,3.525,228675.025,2425557.0
std,2.890403,12.713095,3.638664,42046.522745,3282709.0
min,1.0,36.0,1.0,126312.0,19932.0
25%,3.0,46.0,1.0,196899.25,182917.0
50%,5.5,60.5,1.5,226500.0,1094398.0
75%,8.0,67.25,6.0,259668.0,2825187.0
max,10.0,85.0,17.0,337240.0,10276470.0


In [None]:
# Select rows by index label. Unlike Python slices, a .loc label slice includes
# both endpoints, so 0:2 returns the rows labelled 0, 1 and 2.
df.loc[0:2]

Unnamed: 0,Rank,Albums,Artist,Name,Popularity,Track Number In Album,Duration,Followers,Duration(min:sec)
0,1,PUN,PUN,ที่เดิม,71,9,236062,606188,3:56
1,2,PUN,PUN,I Just Wanna Know,69,1,223500,606188,3:43
2,3,Living Death,PUN,Living Death,69,1,258823,606188,4:18


In [None]:
# Get the index label of the row holding the maximum value in each column.
# NOTE(review): the object (string) columns are presumably compared as text, so
# e.g. 'Duration(min:sec)' is not ranked as a time value -- confirm those rows
# are meaningful before relying on them.
df.idxmax()

Rank                      9
Albums                   16
Artist                   20
Name                     72
Popularity               30
Track Number In Album    36
Duration                 43
Followers                30
Duration(min:sec)        43
dtype: int64

In [None]:
# Find the most popular track.
# idxmax() yields the index label of the first row with the highest Popularity;
# .loc[label, :] then returns that entire row as a Series.
index_of_most_popular_song = df['Popularity'].idxmax()
most_popular_song = df.loc[index_of_most_popular_song, :]
most_popular_song

Rank                                 1
Albums                     SMITHEREENS
Artist                            Joji
Name                     Glimpse of Us
Popularity                          85
Track Number In Album                1
Duration                        233453
Followers                     10276473
Duration(min:sec)                 3:53
Name: 30, dtype: object

In [None]:
# Rank artists by the mean Popularity of their tracks, highest first.
artist_popularity = (
    df.groupby('Artist')['Popularity']
    .mean()
    .sort_values(ascending=False)
)
artist_popularity

Artist
Joji             76.4
Fujii Kaze       68.1
PUN              64.5
Cocktail         63.2
THE TOYS         55.6
Paul Partohap    55.0
oftn             43.0
J_ust            41.0
Name: Popularity, dtype: float64

In [None]:
# Rank artists by follower count, highest first.
# Double brackets [[...]] select several columns and keep a DataFrame;
# a single pair [...] would return a Series (one column only).
# drop_duplicates() collapses repeated (Artist, Followers) pairs to one row each,
# and ignore_index=True renumbers the sorted result from 0.
artist_follower = (
    df[['Artist', 'Followers']]
    .drop_duplicates()
    .sort_values(by='Followers', ascending=False, ignore_index=True)
)
artist_follower

Unnamed: 0,Artist,Followers
0,Joji,10276473
1,Fujii Kaze,4261433
2,Cocktail,2346438
3,THE TOYS,1582608
4,PUN,606188
5,Paul Partohap,210142
6,J_ust,101242
7,oftn,19932


In [None]:
# Find the artist with the highest number of followers.
# idxmax() returns the label of the first row carrying the maximum Followers
# value, so the result is that full track row; the artist is its 'Artist' field.
most_follower_idx = df['Followers'].idxmax()
most_follower = df.loc[most_follower_idx, :]
most_follower

Rank                                 1
Albums                     SMITHEREENS
Artist                            Joji
Name                     Glimpse of Us
Popularity                          85
Track Number In Album                1
Duration                        233453
Followers                     10276473
Duration(min:sec)                 3:53
Name: 30, dtype: object

In [None]:
# Calculate the average song duration for each artist, longest first.
song_mean = (
    df.groupby('Artist')['Duration']
    .mean()
    .sort_values(ascending=False)
)


def _format_duration_ms(ms):
    """Render a duration given in milliseconds as 'M:SS', truncating partial seconds."""
    minutes, remainder_ms = divmod(ms, 60000)
    return f'{int(minutes)}:{int(remainder_ms / 1000):02}'


song_mean_format = song_mean.apply(_format_duration_ms)
song_mean_format


Artist
Cocktail         4:34
Fujii Kaze       4:19
PUN              4:00
J_ust            3:54
Paul Partohap    3:46
THE TOYS         3:20
Joji             3:18
oftn             3:15
Name: Duration, dtype: object