Question: How do Spotify audio features differ between hit and non-hit songs, and how have these patterns changed over time?

In [66]:
import pandas as pd

# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact location exists. Consider a project-relative, configurable data dir.
file_path = "/Users/shielawu/Desktop/5243/1950big_data.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv(file_path, low_memory=False)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455908 entries, 0 to 455907
Data columns (total 36 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   year                455908 non-null  int64  
 1   position            4028 non-null    float64
 2   title               455901 non-null  object 
 3   artist              4028 non-null    object 
 4   pos_sentiment       4028 non-null    float64
 5   neg_sentiment       4028 non-null    float64
 6   neut_sentiment      4028 non-null    float64
 7   compound_sentiment  4028 non-null    float64
 8   f_k_grade           4028 non-null    float64
 9   flesch_index        4028 non-null    float64
 10  fog_index           4028 non-null    float64
 11  num_syllables       4028 non-null    float64
 12  difficult_words     4028 non-null    float64
 13  num_dupes           4028 non-null    float64
 14  num_words           4028 non-null    float64
 15  num_lines           4028 non-null 

CLEANING DATA

In [67]:
# Spotify audio feature columns.
audio_cols = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "time_signature",
]

# Remaining numeric fields: position, popularity, duration, sentiment scores,
# readability indices and text-length counts.
other_numeric = [
    "position", "popularity", "duration_ms",
    "pos_sentiment", "neg_sentiment", "neut_sentiment", "compound_sentiment",
    "f_k_grade", "flesch_index", "fog_index",
    "num_syllables", "difficult_words", "num_dupes", "num_words", "num_lines",
]

# Keep only the candidate columns that actually exist in the loaded frame.
num_cols = [
    name
    for name in [*audio_cols, *other_numeric, "year", "hit", "explicit"]
    if name in df.columns
]

# Convert every selected column to a numeric dtype in one pass; values that
# cannot be parsed become NaN.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Store the hit flag as a plain integer (0/1) column.
df["hit"] = df["hit"].astype(int)

df[num_cols].dtypes

danceability          float64
energy                float64
key                   float64
loudness              float64
mode                  float64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
time_signature        float64
position              float64
popularity            float64
duration_ms           float64
pos_sentiment         float64
neg_sentiment         float64
neut_sentiment        float64
compound_sentiment    float64
f_k_grade             float64
flesch_index          float64
fog_index             float64
num_syllables         float64
difficult_words       float64
num_dupes             float64
num_words             float64
num_lines             float64
year                    int64
hit                     int64
explicit              float64
dtype: object

In [68]:
# Per-column number of missing values, most incomplete columns first.
na_count = df.isna().sum().sort_values(ascending=False)

# Same information as a share of all rows (a fraction between 0 and 1,
# despite the "percent" in the name).
na_percent = na_count.div(len(df)).sort_values(ascending=False)

# One table with both measures, ordered like na_count.
missing = na_count.to_frame(name="na_count").assign(na_percent=na_percent)
missing.head(20)

Unnamed: 0,na_count,na_percent
genre_tags,453236,0.994139
difficult_words,451880,0.991165
f_k_grade,451880,0.991165
num_lines,451880,0.991165
num_words,451880,0.991165
num_dupes,451880,0.991165
num_syllables,451880,0.991165
fog_index,451880,0.991165
flesch_index,451880,0.991165
compound_sentiment,451880,0.991165


In [69]:
# Share of missing values per column, reported separately for the audio
# features and for the other numeric fields (largest share first).
for cols in (audio_cols, other_numeric):
    print(df[cols].isna().mean().sort_values(ascending=False))

danceability        0.005554
energy              0.005554
key                 0.005554
loudness            0.005554
mode                0.005554
speechiness         0.005554
acousticness        0.005554
instrumentalness    0.005554
liveness            0.005554
valence             0.005554
tempo               0.005554
time_signature      0.005554
dtype: float64
position              0.991165
pos_sentiment         0.991165
neg_sentiment         0.991165
neut_sentiment        0.991165
compound_sentiment    0.991165
f_k_grade             0.991165
flesch_index          0.991165
fog_index             0.991165
num_syllables         0.991165
difficult_words       0.991165
num_dupes             0.991165
num_words             0.991165
num_lines             0.991165
popularity            0.005554
duration_ms           0.005554
dtype: float64


In [70]:
# Columns missing in more than this share of rows are excluded from analysis.
thr = 0.95

too_sparse = missing["na_percent"].gt(thr)
drop_cols = missing.index[too_sparse].tolist()

# Working copy of the data without the overly sparse columns; df stays intact.
analysis_df = df.drop(labels=drop_cols, axis="columns")
print("analysis_df shape:", analysis_df.shape)
analysis_df.info()

analysis_df shape: (455908, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455908 entries, 0 to 455907
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   year              455908 non-null  int64  
 1   title             455901 non-null  object 
 2   id                453376 non-null  object 
 3   popularity        453376 non-null  float64
 4   duration_ms       453376 non-null  float64
 5   explicit          453376 non-null  float64
 6   artists           453376 non-null  object 
 7   id_artists        453376 non-null  object 
 8   danceability      453376 non-null  float64
 9   energy            453376 non-null  float64
 10  key               453376 non-null  float64
 11  loudness          453376 non-null  float64
 12  mode              453376 non-null  float64
 13  speechiness       453376 non-null  float64
 14  acousticness      453376 non-null  float64
 15  instrumentalness  453376 non-null  f

In [71]:
# Fill the gaps in every numeric column with that column's median.
# NOTE(review): the numeric columns and `id` each show 2532 missing entries;
# if these are the same rows, the values imputed here belong to records that
# are dropped later for lacking an id -- TODO confirm.
num_cols = analysis_df.select_dtypes(include="number").columns
col_medians = analysis_df[num_cols].median()
analysis_df[num_cols] = analysis_df[num_cols].fillna(col_medians)

# Remaining missing values per column (only non-numeric columns can be > 0).
analysis_df.isna().sum()

year                   0
title                  7
id                  2532
popularity             0
duration_ms            0
explicit               0
artists             2532
id_artists          2532
danceability           0
energy                 0
key                    0
loudness               0
mode                   0
speechiness            0
acousticness           0
instrumentalness       0
liveness               0
valence                0
tempo                  0
time_signature         0
hit                    0
dtype: int64

In [72]:
# List the rows whose Spotify id occurs more than once.
# Rows without an id are excluded first: duplicated() treats missing values as
# equal to each other, so every NaN id would otherwise be reported as a
# "duplicate" and mixed into this listing.
has_id = df["id"].notna()
dup_id_rows = df[has_id & df["id"].duplicated(keep=False)]

# The raw frame is used (not analysis_df) because it still has the artist column.
dup_id_rows.sort_values("id")[["id", "title", "artist", "year"]].head(20)

Unnamed: 0,id,title,artist,year
816,1Gv6f7m5ZNDcInxGA8PsLS,Superstar,Murray Head,1971
817,1Gv6f7m5ZNDcInxGA8PsLS,Superstar,Carpenters,1971
2478,2RChe0r2cMoyOvuKobZy44,Always,Erasure,1994
2492,2RChe0r2cMoyOvuKobZy44,Always,Bon Jovi,1994
3924,2bL2gyO6kBdLkNSkxXNh6x,Animals,Maroon 5,2014
3932,2bL2gyO6kBdLkNSkxXNh6x,Animals,Martin Garrix,2014
2144,4VZDv8sASBS8UruUBGTFdk,Hold On,Wilson Phillips,1990
2151,4VZDv8sASBS8UruUBGTFdk,Hold On,En Vogue,1990
2050,5KG4OVGxSrFmNHGZBezJJn,Don't Be Cruel,Cheap Trick,1988
2059,5KG4OVGxSrFmNHGZBezJJn,Don't Be Cruel,Bobby Brown,1988


In [73]:
# Show every raw row that carries this one Spotify id, with all columns.
df.loc[df["id"].eq("1Gv6f7m5ZNDcInxGA8PsLS")]

Unnamed: 0,year,position,title,artist,pos_sentiment,neg_sentiment,neut_sentiment,compound_sentiment,f_k_grade,flesch_index,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,hit
816,1971,27.0,Superstar,Murray Head,0.09,0.122,0.788,-0.9792,0.9,99.23,...,-11.177,1.0,0.0294,0.728,4.9e-05,0.138,0.171,83.175,4.0,1
817,1971,30.0,Superstar,Carpenters,0.196,0.047,0.757,0.9853,2.1,90.77,...,-11.177,1.0,0.0294,0.728,4.9e-05,0.138,0.171,83.175,4.0,1


In [74]:
# Disabled inspection of the key / mode / time_signature columns; nothing runs in this cell.
#df[["key","mode","time_signature"]]

In [75]:
# Discard records that have no Spotify id and report how many were removed.
before = len(analysis_df)

analysis_df = analysis_df.dropna(subset=["id"]).copy()

removed_na_id = before - len(analysis_df)
print("Removed rows with NA id:", removed_na_id)

Removed rows with NA id: 2532


In [76]:
# Number of rows whose Spotify id already appeared in an earlier row.
analysis_df.duplicated(subset="id").sum()

np.int64(5)

In [77]:
# For each Spotify id, count the distinct values of every audio feature.
check = analysis_df.groupby("id")[audio_cols].agg("nunique")

# A maximum of 1 per feature means rows sharing an id never disagree on it.
check.max(axis=0)

danceability        1
energy              1
key                 1
loudness            1
mode                1
speechiness         1
acousticness        1
instrumentalness    1
liveness            1
valence             1
tempo               1
time_signature      1
dtype: int64

In [78]:
# Keep only the first row for each Spotify id and report how many were removed.
before = len(analysis_df)

repeated_id = analysis_df["id"].duplicated(keep="first")
analysis_df = analysis_df[~repeated_id].copy()

removed_dup_id = before - len(analysis_df)
print("Removed duplicated ids:", removed_dup_id)

Removed duplicated ids: 5


During data integration, duplicate Spotify IDs were identified: five IDs each appeared twice. Inspection confirmed that rows sharing an ID had identical core audio features, while discrepancies occurred only in the externally merged metadata (for example, two different artists' songs with the same title and year matched to a single Spotify track). This indicates a many-to-one join artifact rather than distinct audio records. To preserve dataset integrity, the 2,532 rows without a valid Spotify ID were removed, and duplicate records were eliminated by retaining the first occurrence of each unique ID.