In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


In [None]:

# ---------------------------------------------------------------------------
# Load the K-pop track table and build the model inputs.
#
# Names this cell leaves in the namespace:
#   kpop_top1000_tracks - cleaned DataFrame (name/link columns and
#                         'release_date' removed, calendar columns added)
#   label_encoder       - LabelEncoder fitted on the 'explicit' column
#   features, target    - column names used to build X and y
#   X, y                - feature frame / target series from the cleaned frame
#   scaler, X_scaled    - fitted StandardScaler and the standardized features
# ---------------------------------------------------------------------------

# Read the dataset
kpop_top1000_tracks = pd.read_csv("kpop_top1000_tracks.csv")

# Drop the name and link columns; they are not used as model inputs
kpop_top1000_tracks = kpop_top1000_tracks.drop(
    ['song_name', 'album_name', 'album_link', 'artist_name', 'song_link'],
    axis=1,
)

# Encode the 'explicit' column as integer class codes
label_encoder = LabelEncoder()
kpop_top1000_tracks['explicit'] = label_encoder.fit_transform(kpop_top1000_tracks['explicit'])

# Parse 'release_date'. errors='coerce' turns unparseable values into NaT
# instead of raising, so bad dates surface as missing values below.
kpop_top1000_tracks['release_date'] = pd.to_datetime(kpop_top1000_tracks['release_date'], errors='coerce')

# Derive calendar features from the parsed date (dayofweek: Monday=0 ... Sunday=6).
# Rows whose date could not be parsed get NaN in all three columns.
kpop_top1000_tracks['release_year'] = kpop_top1000_tracks['release_date'].dt.year
kpop_top1000_tracks['release_month'] = kpop_top1000_tracks['release_date'].dt.month
kpop_top1000_tracks['release_dayofweek'] = kpop_top1000_tracks['release_date'].dt.dayofweek

# Columns that feed the model
features = ['duration_ms', 'explicit', 'release_year', 'release_month', 'release_dayofweek']
target = 'popularity'

# Drop the original 'release_date' column, then drop only the rows that are
# missing a value in a column we actually model (this covers invalid dates via
# the derived release_* columns). A bare dropna() would also discard rows whose
# only missing value sits in an unrelated, unused column.
kpop_top1000_tracks = (
    kpop_top1000_tracks
    .drop(['release_date'], axis=1)
    .dropna(subset=features + [target])
)

# Split the data into features (X) and target (y)
X = kpop_top1000_tracks[features]
y = kpop_top1000_tracks[target]

# Standardize the numerical features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row. If a train/test split
# follows, fit it on the training rows only and merely transform the test
# rows, otherwise test-set statistics leak into the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
