In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load features and annotation data
# The per-song features live in a single CSV; the song-level annotations are
# shipped as two separate CSV files that are stacked into one table below.
feature_df = pd.read_csv('features_sorted.csv')
annotation_df_1 = pd.read_csv('annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv')
annotation_df_2 = pd.read_csv('annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_2000_2058.csv')
# ignore_index=True gives the stacked table a fresh 0..n-1 index. Without it
# each file keeps its own row numbers, so the combined frame would contain
# duplicate index labels and any label-based lookup/alignment becomes ambiguous.
annotation_df = pd.concat([annotation_df_1, annotation_df_2], ignore_index=True)

In [None]:
# Clean column names
# Strip leading/trailing whitespace from every header of both tables so that
# later lookups by column name do not depend on stray spaces in the CSV headers.
for frame in (feature_df, annotation_df):
    frame.columns = frame.columns.str.strip()

In [None]:
# Create a binary label for classification using the median valence_mean
# Songs at or above the median get label 1, everything else (including rows
# where the comparison is False because the value is missing) gets label 0.
median_valence = annotation_df['valence_mean'].median()
is_high_valence = annotation_df['valence_mean'].ge(median_valence)
annotation_df['valence_label'] = is_high_valence.astype('int64')

In [None]:
# Check the label distribution
# Count how many songs fall into each of the two valence classes.
label_counts = annotation_df['valence_label'].value_counts()
print('Label distribution:\n', label_counts)

In [None]:
# Train/test split
# Pair every feature row with its label by song_id rather than by row position:
# the features and the annotations are loaded from different files, so equal
# length and identical ordering are not guaranteed. A positional pairing would
# either raise on a length mismatch or silently train on mislabeled rows.
# NOTE(review): assumes annotation_df also carries a 'song_id' column — confirm.
labelled_df = feature_df.merge(
    annotation_df[['song_id', 'valence_label']],
    on='song_id',
    how='inner',            # keep only songs present in both tables
    validate='one_to_one',  # fail loudly if a song_id is duplicated on either side
)
# song_id is an identifier, not a feature, and the label must not leak into X.
X_train, X_test, y_train, y_test = train_test_split(
    labelled_df.drop(columns=['song_id', 'valence_label']),
    labelled_df['valence_label'],
    test_size=0.2, random_state=42)

In [None]:
# Standardize the features
# The scaler statistics are learned from the training split only and then
# applied unchanged to both splits, so nothing about the test set influences
# the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train Random Forest model
# Hyperparameters are collected in one place; random_state fixes the seed so
# repeated runs build the same forest.
rf_params = {'n_estimators': 150, 'max_depth': 3, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Make predictions
# Predict on the training split (to gauge fit) and on the held-out test split.
y_train_pred, y_test_pred = (
    rf_model.predict(X_train_scaled),
    rf_model.predict(X_test_scaled),
)

In [None]:
# Evaluate model
# Accuracy on both splits shows how far training and held-out performance
# diverge; the confusion matrix breaks the test predictions down per class.
train_acc, test_acc = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)
test_confusion = confusion_matrix(y_test, y_test_pred)

print(f'Random Forest - Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}')
print(f'Random Forest - Confusion Matrix (Test): \n{test_confusion}')