In [11]:
import pandas as pd

# Load the raw song table from the CSV in the working directory.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(
    "1950big_data.csv",
    low_memory=False,
)

# Structural overview: (rows, columns), then per-column non-null counts and dtypes.
print(df.shape)
df.info()


(455908, 36)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455908 entries, 0 to 455907
Data columns (total 36 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   year                455908 non-null  int64  
 1   position            4028 non-null    float64
 2   title               455901 non-null  object 
 3   artist              4028 non-null    object 
 4   pos_sentiment       4028 non-null    float64
 5   neg_sentiment       4028 non-null    float64
 6   neut_sentiment      4028 non-null    float64
 7   compound_sentiment  4028 non-null    float64
 8   f_k_grade           4028 non-null    float64
 9   flesch_index        4028 non-null    float64
 10  fog_index           4028 non-null    float64
 11  num_syllables       4028 non-null    float64
 12  difficult_words     4028 non-null    float64
 13  num_dupes           4028 non-null    float64
 14  num_words           4028 non-null    float64
 15  num_lines           4

In [12]:
# Per-column missingness: the raw NaN count and the matching share of rows
# (a 0-1 fraction, despite the "percent" name), both ordered from most to
# least missing.
na_count = df.isna().sum().sort_values(ascending=False)
na_percent = na_count.div(len(df)).sort_values(ascending=False)

# Put both measures side by side; `keys` supplies the two column names.
missing = pd.concat(
    [na_count, na_percent],
    axis=1,
    keys=["na_count", "na_percent"],
)
missing.head(20)

Unnamed: 0,na_count,na_percent
genre_tags,453236,0.994139
difficult_words,451880,0.991165
f_k_grade,451880,0.991165
num_lines,451880,0.991165
num_words,451880,0.991165
num_dupes,451880,0.991165
num_syllables,451880,0.991165
fog_index,451880,0.991165
flesch_index,451880,0.991165
compound_sentiment,451880,0.991165


A subset of variables exhibits extremely high levels of missingness, with more than 99% of observations missing. These variables share a common characteristic: they rely on external information sources beyond the core Spotify audio metadata. Specifically, this group includes genre-related information (genre_tags), lyric-based textual features (such as num_words, num_lines, num_syllables, f_k_grade, fog_index, flesch_index, and difficult_words), sentiment analysis scores (pos_sentiment, neg_sentiment, neut_sentiment, and compound_sentiment), as well as Billboard-specific metadata (artist and position).

The high missingness in these variables does not indicate data quality errors but rather reflects limited coverage from external data sources, including lyric databases, genre classification pipelines, and historical Billboard ranking records. Only a small subset of songs could be successfully matched to these auxiliary sources. Consequently, in a large-scale dataset comprising over 450,000 tracks, missingness exceeding 99% for these variables is an expected outcome of incomplete data integration rather than a result of incorrect data processing.

To mitigate potential bias and ensure robustness in exploratory analysis, these variables were retained in the raw dataset for documentation purposes but excluded from the primary analytical dataset used for exploratory data analysis and feature engineering.

In [13]:
# Core analysis variables: audio descriptors plus duration and popularity.
core = [
    "danceability", "energy", "valence", "tempo", "loudness", "acousticness",
    "speechiness", "instrumentalness", "liveness", "duration_ms", "popularity",
]

# Variables that are not necessarily provided by Spotify itself and are not
# directly derived from the audio signal, so they have to be supplemented
# from external data sources.
# NOTE(review): `artist` and `num_dupes` show the same 4028 non-null count in
# df.info() but are not listed here - confirm whether that is intentional.
extra = [
    "genre_tags",
    "num_words", "num_lines", "num_syllables",
    "flesch_index", "fog_index", "f_k_grade", "difficult_words",
    "pos_sentiment", "neg_sentiment", "neut_sentiment", "compound_sentiment",
    "position",
]

# Share of missing rows per column (highest first): core group, then extra group.
print(df.loc[:, core].isna().mean().sort_values(ascending=False))
print(df.loc[:, extra].isna().mean().sort_values(ascending=False))


danceability        0.005554
energy              0.005554
valence             0.005554
tempo               0.005554
loudness            0.005554
acousticness        0.005554
speechiness         0.005554
instrumentalness    0.005554
liveness            0.005554
duration_ms         0.005554
popularity          0.005554
dtype: float64
genre_tags            0.994139
num_words             0.991165
num_lines             0.991165
num_syllables         0.991165
flesch_index          0.991165
fog_index             0.991165
f_k_grade             0.991165
difficult_words       0.991165
pos_sentiment         0.991165
neg_sentiment         0.991165
neut_sentiment        0.991165
compound_sentiment    0.991165
position              0.991165
dtype: float64


In [14]:
# Columns whose missing share is strictly above this cutoff are left out of
# the analysis frame.
thr = 0.95
drop_cols = missing.loc[missing["na_percent"].gt(thr)].index.tolist()

# `drop` returns a new frame, so `df` itself keeps every original column.
analysis_df = df.drop(drop_cols, axis="columns")
print("analysis_df shape:", analysis_df.shape)
analysis_df.info()

analysis_df shape: (455908, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455908 entries, 0 to 455907
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   year              455908 non-null  int64  
 1   title             455901 non-null  object 
 2   id                453376 non-null  object 
 3   popularity        453376 non-null  float64
 4   duration_ms       453376 non-null  float64
 5   explicit          453376 non-null  float64
 6   artists           453376 non-null  object 
 7   id_artists        453376 non-null  object 
 8   danceability      453376 non-null  float64
 9   energy            453376 non-null  float64
 10  key               453376 non-null  float64
 11  loudness          453376 non-null  float64
 12  mode              453376 non-null  float64
 13  speechiness       453376 non-null  float64
 14  acousticness      453376 non-null  float64
 15  instrumentalness  453376 non-null  f

In [15]:
# Ten largest remaining missing-value shares; none of them is severe.
analysis_df.isna().mean().sort_values(ascending=False).iloc[:10]

key                 0.005554
time_signature      0.005554
valence             0.005554
liveness            0.005554
instrumentalness    0.005554
acousticness        0.005554
speechiness         0.005554
mode                0.005554
loudness            0.005554
energy              0.005554
dtype: float64

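Given the extremely low proportion of missing values (about 0.56% per column) and the absence of a clear, domain-justified similarity metric among songs, median imputation was chosen over KNN-based methods to avoid introducing unnecessary model-driven assumptions and potential noise. Note that every audio feature has the same number of missing values, which suggests that the same rows lack all audio features; for those rows every audio feature is replaced by its column median, so they should be interpreted with caution in later comparisons.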

More sophisticated imputation methods, such as KNN-based imputation, were considered but not adopted, as their assumptions were not aligned with the exploratory focus of this project.

In [16]:
# Median-impute every numeric column in a single step: `fillna` matches each
# median to its column by label, and non-numeric columns are not touched.
# NOTE(review): all audio features share one non-null count (453376), so the
# rows without an `id` appear to get every audio feature replaced by a median
# at once; the ones displayed further down are all hits. Confirm this is
# acceptable for the hit vs. non-hit comparisons.
num_cols = analysis_df.select_dtypes(include="number").columns
analysis_df[num_cols] = analysis_df[num_cols].fillna(analysis_df[num_cols].median())


In [17]:
# NaN count per column after the numeric imputation step.
analysis_df.isna().sum(axis=0)

year                   0
title                  7
id                  2532
popularity             0
duration_ms            0
explicit               0
artists             2532
id_artists          2532
danceability           0
energy                 0
key                    0
loudness               0
mode                   0
speechiness            0
acousticness           0
instrumentalness       0
liveness               0
valence                0
tempo                  0
time_signature         0
hit                    0
dtype: int64

A small number of missing values remain in identifier and textual fields such as title, artists, id, and id_artists. These variables serve as descriptive identifiers rather than analytical features. Since imputing such fields would introduce artificial or misleading information, missing values in these columns were retained and excluded from numerical preprocessing.

### Question: How do Spotify audio features differ between hit and non-hit songs, and how have these patterns changed over time?

In [None]:
# Share of rows for each value of `hit`.
analysis_df.loc[:, "hit"].value_counts(normalize=True)


hit
0    0.991165
1    0.008835
Name: proportion, dtype: float64

The highly imbalanced distribution of the hit variable is expected rather than anomalous. By definition, only a small fraction of released songs achieve chart success, while the vast majority do not enter mainstream rankings. The observed hit rate of approximately 0.9% therefore reflects the competitive structure of the music industry and confirms that the dataset aligns with real-world dynamics.

In [19]:
# Summary statistics of the release year (range, quartiles, mean, std).
analysis_df.loc[:, "year"].describe()


count    455908.000000
mean       1988.785898
std          17.241933
min        1950.000000
25%        1976.000000
50%        1991.000000
75%        2003.000000
max        2015.000000
Name: year, dtype: float64

The dataset spans from 1950 to 2015, covering multiple musical eras and enabling temporal exploration of trends in audio features.

In [20]:
# Summary statistics for four of the core audio features.
(
    analysis_df
    .loc[:, ["danceability", "energy", "valence", "tempo"]]
    .describe()
)


Unnamed: 0,danceability,energy,valence,tempo
count,455908.0,455908.0,455908.0,455908.0
mean,0.557691,0.548353,0.562697,118.674952
std,0.162908,0.247616,0.257979,29.515984
min,0.0,0.0,0.0,0.0
25%,0.45,0.359,0.356,96.002
50%,0.571,0.553,0.577,117.3965
75%,0.677,0.75,0.782,136.34
max,0.991,1.0,1.0,246.381


Core audio features exhibit substantial variability, reflecting the diversity of musical styles present in the dataset.

In [21]:
# Mean danceability, energy and valence within each `hit` group.
(
    analysis_df[["hit", "danceability", "energy", "valence"]]
    .groupby("hit")
    .mean()
)


Unnamed: 0_level_0,danceability,energy,valence
hit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.557431,0.548097,0.56247
1,0.586868,0.577132,0.588089


Hit songs tend to have slightly higher average danceability, energy, and valence compared to non-hit songs, suggesting a preference for rhythmically engaging and emotionally positive tracks, although the magnitude of differences is modest.

In [22]:
# Yearly mean of danceability and energy, one row per year.
analysis_df.groupby(by="year")[["danceability", "energy"]].agg("mean")


Unnamed: 0_level_0,danceability,energy
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1950,0.511720,0.299945
1951,0.479943,0.291473
1952,0.456095,0.267213
1953,0.451998,0.269252
1954,0.462541,0.264381
...,...,...
2011,0.588194,0.657504
2012,0.592452,0.671569
2013,0.591345,0.673437
2014,0.600379,0.666494


Over time, both danceability and energy show a clear upward trend, indicating a shift toward more energetic and rhythm-driven music in recent decades.

In [23]:
# Pairwise Pearson correlations among five audio features and the `hit` flag.
(
    analysis_df
    .loc[:, ["danceability", "energy", "valence", "tempo", "loudness", "hit"]]
    .corr(method="pearson")
)


Unnamed: 0,danceability,energy,valence,tempo,loudness,hit
danceability,1.0,0.235792,0.557182,-0.067141,0.222241,0.016909
energy,0.235792,1.0,0.411871,0.220986,0.764812,0.010973
valence,0.557182,0.411871,1.0,0.13706,0.29324,0.009293
tempo,-0.067141,0.220986,0.13706,1.0,0.17741,-0.001782
loudness,0.222241,0.764812,0.29324,0.17741,1.0,0.021938
hit,0.016909,0.010973,0.009293,-0.001782,0.021938,1.0


Correlation analysis shows that while several audio features are moderately correlated with each other, their individual correlations with hit status are weak, suggesting that song success cannot be explained by any single audio attribute alone.

In [None]:
# Decade-level feature built from `year` by removing the last digit's
# contribution (e.g. 2015 -> 2010), so temporal shifts in musical
# characteristics can be examined per decade.
analysis_df["decade"] = analysis_df["year"] - analysis_df["year"] % 10

In [None]:
# Interaction feature: element-wise product of energy and danceability.
# Single audio features are only weakly associated with hit status, so this
# combination is meant to better capture rhythm-driven song characteristics.
analysis_df["energy_dance"] = analysis_df["energy"].mul(analysis_df["danceability"])

In [None]:
from sklearn.preprocessing import StandardScaler

# Audio features to standardise so they are comparable across their different
# measurement scales; each one gets a companion "<name>_z" column.
scale_cols = [
    "danceability",
    "energy",
    "valence",
    "tempo",
    "loudness",
    "acousticness",
    "speechiness",
]

# Fit the scaler on the selected columns, then store the transformed values
# next to the originals (the raw columns are kept unchanged).
scaler = StandardScaler().fit(analysis_df[scale_cols])
analysis_df[[f"{name}_z" for name in scale_cols]] = scaler.transform(
    analysis_df[scale_cols]
)

In [29]:
# Preview the engineered frame explicitly (first and last five rows) instead
# of evaluating the whole 455k-row frame and relying on the notebook's
# display truncation, which depends on the current pandas display options.
pd.concat([analysis_df.head(), analysis_df.tail()])

Unnamed: 0,year,title,id,popularity,duration_ms,explicit,artists,id_artists,danceability,energy,...,hit,decade,energy_dance,danceability_z,energy_z,valence_z,tempo_z,loudness_z,acousticness_z,speechiness_z
0,1950,Mona Lisa,,27.0,220053.0,0.0,,,0.571,0.553,...,1,1950,0.315763,0.081695,0.018766,0.055444,-0.043314,0.167974,-0.065699,-0.323623
1,1950,I Wanna Be Loved,,27.0,220053.0,0.0,,,0.571,0.553,...,1,1950,0.315763,0.081695,0.018766,0.055444,-0.043314,0.167974,-0.065699,-0.323623
2,1950,Tennessee Waltz,,27.0,220053.0,0.0,,,0.571,0.553,...,1,1950,0.315763,0.081695,0.018766,0.055444,-0.043314,0.167974,-0.065699,-0.323623
3,1950,I'll Never Be Free,6TvGgRU1UtOBFevicFLWLI,6.0,191613.0,0.0,['Ella Fitzgerald and Louis Jordan'],['5TzSUelHY0wl9WbEXMVv1Q'],0.562,0.109,...,1,1950,0.061258,0.026449,-1.774334,-1.095815,-0.465814,-0.833646,1.451618,-0.335199
4,1950,All My Love,,27.0,220053.0,0.0,,,0.571,0.553,...,1,1950,0.315763,0.081695,0.018766,0.055444,-0.043314,0.167974,-0.065699,-0.323623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455903,2015,One Singapore,7evFdDqiQUkRlo5yICNeIW,13.0,236773.0,0.0,['Sing A Nation Choir'],['6o3knWLBfKrjDdmuHvfnww'],0.550,0.921,...,0,2010,0.506550,-0.047212,1.504938,0.241506,0.447929,1.108884,-0.892786,0.952415
455904,2015,Shine For Singapore,30QbxMta1DGgfcHUc9GfKi,13.0,180733.0,0.0,['Hady Mirza'],['2PpD3uapjVwWbLJI2qbG27'],0.231,0.745,...,0,2010,0.172095,-2.005374,0.794160,-0.762454,-1.135791,1.172843,-0.024047,-0.306536
455905,2015,Shine On Me,1PBxQWuUn6RFzsg30gNH0H,11.0,236680.0,0.0,['Jai'],['2cYD6DhwZpdrFhnhLLAmyL'],0.583,0.484,...,0,2010,0.282172,0.155356,-0.259892,-0.882619,0.109434,0.611830,0.276441,-0.378193
455906,2015,就在這裡,4G3RE7nMKl4Yklg1aYCvPY,5.0,226960.0,0.0,['Kelvin Tan'],['0Ys3jWqZrvyzwdQ7udcvOu'],0.460,0.689,...,0,2010,0.316940,-0.599672,0.568003,-0.847732,0.616651,0.698327,-1.131986,-0.351735
