# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

# Identifier / free-text columns that are removed before modelling.
# Names that are absent from the CSV are skipped silently (errors='ignore').
columns_to_drop = ['Unnamed: 0', 'track_id', 'album_name', 'track_name', 'artists']

# Load the dataset, strip the columns above, then discard every row that
# still contains a missing value.
df = (
    pd.read_csv("/content/processed_spotify_data.csv")
    .drop(columns=columns_to_drop, errors='ignore')
    .dropna()
)

# Remove redundant numeric features: whenever two features have an absolute
# Pearson correlation above the threshold, the later one (in column order)
# is dropped and the earlier one is kept.
#
# The target column 'mood' is excluded from the scan. If it is numeric it
# would otherwise take part in the comparison, so either the target itself
# or a feature that merely predicts it well could be dropped.
numerical_features = (
    df.select_dtypes(include=np.number).columns.drop('mood', errors='ignore')
)
corr_matrix = df[numerical_features].corr()

CORR_THRESHOLD = 0.85  # High correlation threshold

# Strict lower triangle: entry (i, j) with j < i compares column i against an
# earlier column j. NaN correlations (e.g. constant columns) compare False.
exceeds = (corr_matrix.abs() > CORR_THRESHOLD).to_numpy()
redundant_rows = np.tril(exceeds, k=-1).any(axis=1)
high_corr_features = set(corr_matrix.columns[redundant_rows])

# errors='ignore' keeps this cell safe to re-run after the columns are gone.
df = df.drop(columns=list(high_corr_features), errors='ignore')

# Split the frame into the target (y) and the numeric feature matrix (X).
# Fail early with an explicit message when the target column is missing.
if 'mood' not in df.columns:
    raise KeyError("The 'mood' column is not found in the DataFrame. Please check your data or previous steps.")

y = df['mood']

# Only numeric columns are kept, because the scaler downstream cannot handle
# non-numeric data.
X = df.drop(columns=['mood']).select_dtypes(include=np.number)

# Train-Test Split
# The split happens BEFORE any fitting so that the scaler and PCA never see
# the held-out rows. Fitting them on the full dataset leaks test-set
# statistics into training and inflates the reported scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: learn mean/variance on the training rows only, then
# apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply PCA to reduce dimensionality (components fitted on training rows only)
pca = PCA(n_components=0.95)  # Preserve 95% variance
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-dataset projections, kept under their original names for any later
# cell that uses them. They are produced with transform() only, so nothing
# is re-fitted on test rows.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Train Random Forest Model
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Macro-averaged one-vs-rest ROC AUC, computed from predicted class
# probabilities. Using one-hot encoded hard predictions instead collapses
# every class to a single operating point, and fails with a shape mismatch
# whenever some class is never predicted.
y_proba = rf_model.predict_proba(X_test)
# One-hot truth matrix with columns forced into the same order as the
# probability columns (rf_model.classes_).
y_test_onehot = (
    pd.get_dummies(y_test)
    .reindex(columns=rf_model.classes_, fill_value=False)
    .to_numpy(dtype=int)
)
roc_auc = roc_auc_score(y_test_onehot, y_proba, average='macro')

# Print Results
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)
# Tick labels for the confusion-matrix axes.
# NOTE(review): these are hard-coded; this assumes the matrix has four
# classes in exactly this order - confirm against the values in 'mood'.
class_labels = ["Calm", "Energetic", "Happy", "Sad"]

# Draw the confusion matrix as an annotated heatmap on its own figure.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)

# Axis captions and title
cm_ax.set_xlabel("Predicted Labels")
cm_ax.set_ylabel("Actual Labels")
cm_ax.set_title("Confusion Matrix")

# Render the figure
plt.show()
# Plot ROC Curve
# Micro-averaged ROC: every (sample, class) pair is treated as one binary
# decision, scored with the predicted class probability. Scoring with
# one-hot hard predictions instead yields a single operating point rather
# than a curve, and breaks when a class is never predicted.
roc_scores = rf_model.predict_proba(X_test)
# One-hot truth matrix, columns aligned with the probability columns.
roc_truth = (
    pd.get_dummies(y_test)
    .reindex(columns=rf_model.classes_, fill_value=False)
    .to_numpy(dtype=int)
)
fpr, tpr, _ = roc_curve(roc_truth.ravel(), roc_scores.ravel())
# The legend reports the AUC of the curve actually drawn (micro-average),
# not a macro-average computed elsewhere.
micro_auc = roc_auc_score(roc_truth.ravel(), roc_scores.ravel())

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Micro-average ROC (AUC = {micro_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

# Bar chart of how many rows fall into each mood class.
dist_fig, dist_ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=y, palette='viridis', ax=dist_ax)
dist_ax.set_title("Mood Distribution in Dataset")
plt.show()
