In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned listening data that every later cell derives features from.
INPUT_CSV = "song_recommendation_data_clean.csv"
df = pd.read_csv(INPUT_CSV)

In [3]:
# Convert the listening time from minutes to seconds (60 seconds per minute).
df["total_listen_time_sec"] = df["listen_time_min"].mul(60)

In [4]:
# Listening time divided by track duration, capped at 1.0 from above.
# NOTE: there is no lower bound, and a zero `duration_sec` with positive
# listening time divides to inf, which the cap then turns into 1.0.
played_fraction = df["total_listen_time_sec"].div(df["duration_sec"])
df["listen_ratio"] = played_fraction.clip(upper=1.0)

In [5]:
# 1 when strictly more than 70% of the track was played, otherwise 0.
# Missing ratios compare as False and therefore map to 0.
df["high_engagement"] = df["listen_ratio"].gt(0.7).astype(int)

In [6]:
# Bucket the listening hour into four named day parts. pd.cut intervals are
# right-inclusive by default, so the buckets are
# (-1, 5], (5, 12], (12, 18] and (18, 24].
hour_edges = [-1, 5, 12, 18, 24]
day_part_names = ["Night", "Morning", "Afternoon", "Evening"]
df["time_of_day"] = pd.cut(df["listen_hour"], bins=hour_edges, labels=day_part_names)

In [7]:
# Swap the day-part labels for their integer category codes.
# NOTE: pandas uses code -1 for missing values, so rows that did not fall
# into any bucket end up as -1 rather than NaN.
time_of_day_categories = df["time_of_day"].astype("category")
df["time_of_day"] = time_of_day_categories.cat.codes

In [8]:
# Flag rows whose day-of-week code is 5 or 6.
# NOTE(review): presumably Monday=0, making 5/6 Saturday/Sunday - confirm
# against how `listen_day_of_week` was produced.
weekend_day_codes = [5, 6]
df["is_weekend"] = df["listen_day_of_week"].isin(weekend_day_codes).astype(int)

In [9]:
# Bucket users by prior play count. Intervals are right-inclusive:
# (-1, 50] -> Low, (50, 200] -> Medium, (200, inf) -> High.
play_count_edges = [-1, 50, 200, np.inf]
activity_names = ["Low", "Medium", "High"]
df["user_activity_level"] = pd.cut(
    df["user_previous_plays"], bins=play_count_edges, labels=activity_names
)

In [10]:
# Swap the activity labels for their integer category codes.
# NOTE: pandas uses code -1 for missing values, so rows that did not fall
# into any bucket end up as -1 rather than NaN.
activity_categories = df["user_activity_level"].astype("category")
df["user_activity_level"] = activity_categories.cat.codes

In [11]:
# 1 when the song is more than 5 years old relative to a fixed reference
# year of 2025 (hard-coded, not the current date), otherwise 0.
reference_year = 2025
song_age_years = reference_year - df["song_release_year"]
df["song_old"] = song_age_years.gt(5).astype(int)

In [12]:
# Interaction feature: element-wise product of danceability and energy.
df["dance_energy_product"] = df["danceability"].mul(df["energy"])

In [13]:
# Sum `liked` per song, then flag every row whose song totals strictly more
# than 20. The totals are computed over this same frame.
song_like_counts = df.groupby("song_id")["liked"].sum()
over_like_threshold = song_like_counts.gt(20)
popular_songs = song_like_counts[over_like_threshold].index.tolist()
df["song_popular_flag"] = df["song_id"].isin(popular_songs).astype(int)

In [14]:
# Per-row mean listening time of the row's genre, aligned to df's index,
# followed by each row's deviation from that genre mean (in minutes).
genre_avg_listen = df["listen_time_min"].groupby(df["genre"]).transform("mean")
df["genre_avg_listen_diff"] = df["listen_time_min"].sub(genre_avg_listen)


In [15]:
# Persist the feature-engineered frame; the row index is not written out.
OUTPUT_CSV = "song_recommendation_data_fe.csv"
df.to_csv(OUTPUT_CSV, index=False)