In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from mizani.formatters import percent_format
from plotnine import *
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm
from IPython.core.display import HTML
from stargazer.stargazer import Stargazer
import statsmodels.nonparametric.kernel_regression as loess

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides pandas warnings (e.g. DtypeWarning
# from read_csv, SettingWithCopyWarning) — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Loading and cleaning data

In [145]:
# Load the raw Spotify tracks table; the path is resolved relative to the
# kernel's working directory.
csv_path = "spotify_tracks.csv"
data = pd.read_csv(csv_path)

In [151]:
# Display the frame via its rich repr (long frames are truncated to the first
# and last rows).
# NOTE(review): this cell's execution count (151) is later than the cleaning
# cells below (147-150), so the saved output appears to show the already
# cleaned frame rather than the raw CSV.
data

1,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,language
32862,0,0.0317,0.56,115200,0.55,0.000002,6,0.101,-10.682,0,0.224,150.352,4,0.347,Unknown
33123,9,0.185,0.65,197483,0.895,0.000006,11,0.358,-7.679,0,0.123,120.042,4,0.556,Korean
33124,1,0.0639,0.9,230092,0.503,0.000825,2,0.119,-8.017,1,0.126,109.026,4,0.351,Korean
33125,2,0.114,0.667,173471,0.845,0.0,8,0.753,-4.142,0,0.296,85.091,4,0.731,Korean
33126,1,0.242,0.75,208000,0.699,0.000891,4,0.12,-5.086,0,0.0967,150.053,4,0.552,Korean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62281,27,0.383,0.541,212143,0.822,0.0,2,0.383,-7.07,0,0.295,139.985,4,0.94,Tamil
62282,25,0.524,0.604,193912,0.62,0.000124,2,0.16,-7.254,1,0.0695,126.433,4,0.291,Tamil
62283,2,0.471,0.859,78857,0.756,0.000005,2,0.0657,-7.004,1,0.264,140.181,4,0.272,Tamil
62284,2,0.078,0.877,135286,0.974,0.000003,1,0.147,-1.71,1,0.124,139.969,4,0.458,Tamil


In [147]:
# The row at position 1 holds the real column names: use it as the header.
# Assigning the Series keeps its label (1) as the name of the columns Index.
header_row = data.iloc[1]
data.columns = header_row
# Discard the two leading rows (positions 0 and 1) and renumber the index from 0.
data = data.iloc[2:].reset_index(drop=True)

In [148]:
# Keep only the tracks released in 2024.
# The header was promoted from a data row, so 'year' may be stored as object
# dtype holding strings and/or numbers. Coerce it to numeric before comparing,
# otherwise a string "2024" never equals the integer 2024 and those rows are
# silently dropped. Values that cannot be parsed become NaN and are excluded.
year_numeric = pd.to_numeric(data["year"], errors="coerce")
data = data[year_numeric == 2024]

In [149]:
# Keep only the popularity score, the audio-feature columns and the language
# label; every other column (including 'year') is dropped here.
selected_columns = [
    "popularity",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "valence",
    "language",
]
data = data[selected_columns]

In [150]:
# Remove every row that has a missing value in at least one column.
data = data.dropna(axis=0, how="any")

In [152]:
# Cast the first 14 columns to numeric dtypes. With the column selection made
# earlier in this notebook these are all columns except the trailing 'language'.
columns = data.columns[0:14]
data[columns] = data[columns].apply(pd.to_numeric, errors="coerce")
# errors="coerce" replaces unparseable entries with NaN. The notebook's only
# dropna() runs before this cell, so drop the rows that just became incomplete;
# otherwise NaNs leak into the summaries below.
data = data.dropna(subset=list(columns))

In [154]:
# Build the column list in this cell: it was previously defined only in a later
# cell, so a fresh "Restart & Run All" failed here with a NameError.
list_var = data.columns.tolist()
# Drop every row in which any column equals -1.
# NOTE(review): -1 presumably encodes a missing/unknown value — confirm against
# the dataset documentation.
contains_minus_one = (data[list_var] == -1).any(axis=1)
data = data[~contains_minus_one]

In [155]:
from skimpy import skim

# Snapshot the current column names, then print skimpy's summary for exactly
# those columns.
list_var = list(data.columns)
skim(data.filter(items=list_var))

In [142]:
#acousticness: A confidence measure from 0.0 to 1.0 of whether the track is acoustic.
#danceability: A measure of how suitable a track is for dancing (0.0 = least danceable, 1.0 = most danceable).
#duration_ms: Track length in milliseconds.
#energy: A perceptual measure from 0.0 to 1.0 of intensity and activity.
#key: The musical key of the track, using standard pitch class notation (e.g., 0 = C, 1 = C♯/D♭).
#liveness: A measure of the likelihood that the track was recorded live (higher values indicate live performances).
#loudness: The overall loudness of the track in decibels (dB).
#mode: Indicates the modality of the track (1 = major, 0 = minor).
#speechiness: Measures the presence of spoken words in a track (closer to 1.0 indicates more speech-like content).
#tempo: The estimated tempo of the track in beats per minute (BPM).
#time_signature: The number of beats per measure, ranging from 3 to 7.
#valence: A measure from 0.0 to 1.0 indicating the track's musical positiveness (higher values are happier).

In [144]:
# NOTE(review): `a` is not used in this cell; kept in case later cells rely on it.
a = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
     'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence', 'language']
# Long format: one row per track with its language, the feature name ('values')
# and the feature value ('value').
# The variable column is named explicitly. The default label depends on
# data.columns.name (1 after the header row was promoted, 'variable' otherwise),
# so renaming the key 1 afterwards silently did nothing whenever that name
# differed, and the groupby below then failed with a KeyError.
melted = data.melt(id_vars="language", value_vars="speechiness", var_name="values")
# Descriptive statistics of speechiness per language, rounded to two decimals.
melted.groupby(["language", "values"]).agg(["mean", "std", "min", "max", "count"]).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,count
language,values,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
English,speechiness,0.17,0.28,0.02,0.96,2829
Hindi,speechiness,0.08,0.07,0.02,0.41,257
Korean,speechiness,0.08,0.08,0.03,0.54,121
Malayalam,speechiness,0.13,0.04,0.1,0.16,2
Tamil,speechiness,0.1,0.09,0.03,0.9,857
Telugu,speechiness,0.09,0.07,0.03,0.18,4
Unknown,speechiness,0.06,0.06,0.03,0.4,40
