In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("song_recommendation_data.csv")

In [3]:
df = df.dropna(subset=["genre", "user_age"])

In [4]:
df.loc[df["listen_time_min"] < 0, "listen_time_min"] = np.nan

In [5]:
df["listen_time_min"].fillna(df["listen_time_min"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["listen_time_min"].fillna(df["listen_time_min"].median(), inplace=True)


In [6]:
df["genre"].fillna(df["genre"].mode()[0], inplace=True)

df["user_age"].fillna(df["user_age"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["genre"].fillna(df["genre"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["user_age"].fillna(df["user_age"].median(), inplace=True)


In [7]:
num_cols = ["duration_sec", "artist_popularity", "acousticness",
            "danceability", "energy", "valence", "tempo",
            "listen_time_min", "user_age", "user_previous_plays"]

for col in num_cols:
    q_low = df[col].quantile(0.01)
    q_high = df[col].quantile(0.99)
    df[col] = df[col].clip(lower=q_low, upper=q_high)


In [8]:
cat_cols = [
    "genre", "language", "user_country", "user_platform",
    "user_subscription", "network_quality"
]
for col in cat_cols:
    df[col] = df[col].astype("category").cat.codes

In [9]:
binary_cols = ["explicit", "liked", "added_to_playlist"]

for col in binary_cols:
    df[col] = df[col].astype(int)

In [10]:
missing = df.isnull().sum()
print("\nMissing after cleaning:\n", missing[missing > 0])


Missing after cleaning:
 Series([], dtype: int64)


In [11]:
df.to_csv("song_recommendation_data_clean.csv", index=False)