In [2]:
import pandas as pd
import statistics
import numpy as np

# Descriptive statistics (max, min, range, mean, mode, sample variance,
# standard deviation) for each numeric column listed in `target_col`.
data = pd.read_csv("spotify_songs_dataset.csv")

target_col = ['duration','stream']

for dt in target_col:
    # Coerce the column to a numeric dtype; entries that cannot be parsed
    # become NaN instead of raising.
    data[dt] = pd.to_numeric(data[dt], errors='coerce')

    # Drop NaN for THIS column only. `data` itself is deliberately not
    # filtered: re-assigning it here would also discard those rows from the
    # columns processed later, making the results depend on loop order.
    values = data[dt].dropna()

    # statistics.mode raises on empty input, so skip a column that has no
    # usable numeric values.
    if values.empty:
        print(f"No numeric values in {dt.title()}\n")
        continue

    max_val = values.max()
    min_val = values.min()
    range_val = max_val - min_val
    mean = round(values.mean(),2)
    mode = round(statistics.mode(values),2)

    # Sample variance (ddof=1). The standard deviation is taken from the
    # unrounded variance so rounding error is not compounded.
    variance = np.nanvar(values,ddof=1)
    var = round(variance,2)
    std = round(variance ** 0.5,2)

    print(f"Max of {dt.title()} : {max_val}")
    print(f"Min of {dt.title()} : {min_val}")
    print(f"Range of {dt.title()} : {range_val}")
    print(f"Mean of {dt.title()} : {mean}")
    print(f"Mode of {dt.title()} : {mode}")
    print(f"Variance of {dt.title()} : {var}")
    print(f"Standard Deviance of {dt.title()} : {std}\n")

Max of Duration : 433.0
Min of Duration : 33.0
Range of Duration : 400.0
Mean of Duration : 239.66
Mode of Duration : 237.0
Variance of Duration : 2513.69
Standard Deviance of Duration : 50.14

Max of Stream : 99999128
Min of Stream : 1899
Range of Stream : 99997229
Mean of Stream : 50244793.8
Mode of Stream : 7987973
Variance of Stream : 836701920912618.1
Standard Deviance of Stream : 28925800.26



In [34]:
import pandas as pd

# Per-genre summary: mean song duration and number of songs, ordered from
# the longest to the shortest average duration.
df = pd.read_csv("spotify_songs_dataset.csv")

top_context = (
    df.groupby('genre')
    .agg(
        Average_Duration=('duration', 'mean'),
        # 'size' counts every row in the group. 'count' would skip rows
        # whose duration is missing and under-report the number of songs.
        Total_Songs=('duration', 'size'),
    )
    # Sort on the unrounded mean so near-ties are ordered by the true value.
    .sort_values(by='Average_Duration', ascending=False)
    .round({'Average_Duration': 2})
    .reset_index()
    .rename(columns={
        'genre': 'Genre',
        'Average_Duration': 'Avg Duration',
        'Total_Songs': 'Number of Songs',
    })
)

print(top_context)

        Genre  Avg Duration  Number of Songs
0      Reggae        240.84              866
1        Jazz        240.40             2245
2  Electronic        240.36            11301
3         R&B        240.33             1358
4        Folk        240.31             2286
5     Hip-Hop        239.52            11087
6   Classical        239.25             2270
7         Pop        238.90            11308
8     Country        238.79             2279


In [36]:
import pandas as pd

# Per-language summary: mean song duration and number of songs, ordered
# from the longest to the shortest average duration.
# Rows with a missing language are left out by groupby's default dropna.
df = pd.read_csv("spotify_songs_dataset.csv")

top_context = (
    df.groupby('language')
    .agg(
        Average_Duration=('duration', 'mean'),
        # 'size' counts every row in the group. 'count' would skip rows
        # whose duration is missing and under-report the number of songs.
        Total_Songs=('duration', 'size'),
    )
    # Sort on the unrounded mean so near-ties are ordered by the true value.
    .sort_values(by='Average_Duration', ascending=False)
    .round({'Average_Duration': 2})
    .reset_index()
    .rename(columns={
        'language': 'Language',
        'Average_Duration': 'Avg Duration',
        'Total_Songs': 'Number of Songs',
    })
)

print(top_context)

   Language  Avg Duration  Number of Songs
0    Korean        241.15             2073
1    French        239.94             2136
2   English        239.75            30028
3   Spanish        239.60             4257
4   Italian        239.46             1699
5  Japanese        238.72             1274
6    German        238.52             1277


In [10]:
import pandas as pd

# Per-label summary: mean song duration and number of songs, ordered from
# the longest to the shortest average duration.
df = pd.read_csv("spotify_songs_dataset.csv")

top_context = (
    df.groupby('label')
    .agg(
        Average_Duration=('duration', 'mean'),
        # 'size' counts every row in the group. 'count' would skip rows
        # whose duration is missing and under-report the number of songs.
        Total_Songs=('duration', 'size'),
    )
    # Sort on the unrounded mean so near-ties are ordered by the true value.
    .sort_values(by='Average_Duration', ascending=False)
    .round({'Average_Duration': 2})
    .reset_index()
    .rename(columns={
        'label': 'Label',
        'Average_Duration': 'Avg Duration',
        'Total_Songs': 'Number of Songs',
    })
)

print(top_context)

              Label  Avg Duration  Number of Songs
0             Indie        240.26             7603
1      Warner Music        239.97             7426
2        Sony Music        239.69             7555
3  Atlantic Records        239.57             7455
4           Def Jam        239.23             7641
5   Universal Music        239.22             7320


In [24]:
import pandas as pd

# Per-year summary: mean song duration, number of songs and total streams,
# listed from the most recent year backwards.
df = pd.read_csv("spotify_songs_dataset.csv")

# The year is the text before the first '-' in release_date; non-string
# values (e.g. NaN) are passed through unchanged.
# NOTE(review): presumably release_date is formatted YYYY-MM-DD -- confirm.
df['Year'] = df['release_date'].apply(
    lambda x: x.split('-')[0].strip() if isinstance(x, str) else x
)

top_context = (
    df.groupby('Year')
    .agg(
        Average_Duration=('duration', 'mean'),
        # 'size' counts every row of the year, the same rows that the
        # stream total below is summed over. 'count' would skip rows whose
        # duration is missing, making the two columns disagree.
        Total_Songs=('duration', 'size'),
        Stream=('stream', 'sum'),
    )
    # 'Year' is the group index at this point; newest year first.
    .sort_index(ascending=False)
    .round({'Average_Duration': 2})
    .reset_index()
    .rename(columns={
        'Average_Duration': 'Avg Duration',
        'Total_Songs': 'Number of Songs',
        'Stream': 'Total Stream',
    })
)

print(top_context)

    Year  Avg Duration  Number of Songs  Total Stream
0   2024        240.51             1186   66645540633
1   2023        240.96             1438   81772935033
2   2022        239.49             1525   85616448659
3   2021        238.68             1511   84284257744
4   2020        239.92             1454   81050635363
5   2019        241.27             1554   86413774919
6   2018        240.25             1418   78049187914
7   2017        236.75             1592   88422945900
8   2016        239.00             1476   80957839242
9   2015        238.97             1464   78837127668
10  2014        239.25             1511   83922349379
11  2013        237.89             1485   83325137440
12  2012        241.13             1480   82297575190
13  2011        240.71             1496   82420913309
14  2010        239.40             1473   82612810732
15  2009        238.50             1463   83100239814
16  2008        238.56             1503   84416076308
17  2007        241.00      