In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score


In [2]:

# Load the raw track table from the CSV that sits next to the notebook.
kpop_top1000_tracks = pd.read_csv("kpop_top1000_tracks.csv")

# Remove the name/link text columns; they are not part of the feature list
# used for modelling further down.
kpop_top1000_tracks = kpop_top1000_tracks.drop(
    columns=['song_name', 'album_name', 'album_link', 'artist_name', 'song_link']
)

# Replace 'explicit' with integer codes. The fitted encoder stays available as
# `label_encoder`, so the codes can be mapped back with `inverse_transform`.
label_encoder = LabelEncoder()
kpop_top1000_tracks['explicit'] = label_encoder.fit_transform(kpop_top1000_tracks['explicit'])

# Parse 'release_date'; errors='coerce' turns every value that cannot be
# parsed into NaT instead of raising.
kpop_top1000_tracks['release_date'] = pd.to_datetime(kpop_top1000_tracks['release_date'], errors='coerce')

# Discard rows whose date could not be parsed, derive the calendar features,
# and then remove the raw date column.
# The derived columns are built with .assign() on the filtered frame rather
# than by item assignment: writing columns into the result of dropna() can emit
# SettingWithCopyWarning on pandas versions without copy-on-write, whereas
# .assign() always works on a fresh copy.
kpop_top1000_tracks = (
    kpop_top1000_tracks
    .dropna(subset=['release_date'])
    .assign(
        release_year=lambda frame: frame['release_date'].dt.year,
        release_month=lambda frame: frame['release_date'].dt.month,
        # pandas encodes the day of the week as Monday=0 ... Sunday=6.
        release_dayofweek=lambda frame: frame['release_date'].dt.dayofweek,
    )
    .drop(columns='release_date')
)

# Supervised-learning inputs: the model features and the column to predict.
features = ['duration_ms', 'explicit', 'release_year', 'release_month', 'release_dayofweek']
target = 'popularity'
X_supervised = kpop_top1000_tracks.loc[:, features]
y_supervised = kpop_top1000_tracks.loc[:, target]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
(
    X_train_supervised,
    X_test_supervised,
    y_train_supervised,
    y_test_supervised,
) = train_test_split(
    X_supervised,
    y_supervised,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so the test rows do not influence it.
scaler_supervised = StandardScaler()
scaler_supervised.fit(X_train_supervised)
X_train_scaled_supervised = scaler_supervised.transform(X_train_supervised)
X_test_scaled_supervised = scaler_supervised.transform(X_test_supervised)

