# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Install a blanket "ignore" filter: every warning category, from every
# module, is suppressed from this point on.
warnings.simplefilter("ignore")

# Read the pre-processed Spotify tracks into a DataFrame.
df = pd.read_csv("/content/processed_spotify_data.csv")

# Apply the seaborn theme before any axes are created.
sns.set_style("darkgrid")

# One row, two panels: class balance on the left, valence/energy map on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_counts, ax_scatter = axes

# Panel 1: number of tracks per mood label (counted once, reused for both axes).
mood_counts = df['mood'].value_counts()
sns.barplot(
    x=mood_counts.index,
    y=mood_counts.values,
    palette="viridis",
    ax=ax_counts,
)
ax_counts.set(
    title="Mood Distribution in Spotify Dataset",
    xlabel="Mood",
    ylabel="Count",
)

# Panel 2: each track placed by valence (x) and energy (y), coloured by mood.
sns.scatterplot(
    data=df,
    x="valence",
    y="energy",
    hue="mood",
    palette="deep",
    alpha=0.7,
    ax=ax_scatter,
)
ax_scatter.set(
    title="Mood Classification based on Valence & Energy",
    xlabel="Valence (Happiness)",
    ylabel="Energy (Liveliness)",
)

# Tighten spacing so the two panels do not overlap, then render.
plt.tight_layout()
plt.show()

# Target: the mood label of every track.
y = df['mood']

# Predictors: drop the target and the track identifier, then keep only the
# numeric columns that remain.
X = df.drop(columns=['mood', 'track_id']).select_dtypes(include=['number'])

# Hold out 20% of the rows for testing; stratifying on y keeps the mood
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest hyper-parameters: 50 trees, depth capped at 10, with minimum
# split/leaf sizes of 5 and 3 samples.
rf_params = dict(
    n_estimators=50,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
# 3. Feature Importance Plot
# The forest was fitted on the columns of X, so importances and X.columns
# correspond one-to-one. Both names below stay in original column order.
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# Pair each score with its column name through a Series: pandas raises a
# ValueError if the two lengths ever disagree, instead of silently truncating
# (and possibly mislabelling) the names as a slice would. The sorted copy is
# used only for plotting, so the most important features appear at the top.
ranked_importances = pd.Series(feature_importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(12, 6))
# Numeric x with categorical y yields horizontal bars, one per feature.
sns.barplot(x=ranked_importances.values, y=ranked_importances.index, palette='coolwarm')
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.title("Feature Importance in Mood Classification")
plt.xticks(rotation=45)  # Rotate the importance-score tick labels
plt.show()

# 4. Feature Distributions
# One histogram per numeric column of the dataset.
num_features = df.select_dtypes(include='number')

# DataFrame.hist opens its own figure (sized via `figsize`), so no separate
# plt.figure() call is made here: a pre-created figure would stay empty and
# be displayed as a blank canvas.
num_features.hist(bins=30, color='skyblue', edgecolor='black', figsize=(14, 10))
dist_fig = plt.gcf()  # the figure hist() just created is now the current one

# Title and axis labels are set at figure level so they describe the whole
# grid; plt.xlabel / plt.ylabel would label only the last-created subplot.
# NOTE: Figure.supxlabel / supylabel require matplotlib >= 3.4.
dist_fig.suptitle("Audio Feature Distributions", fontsize=16, fontweight='bold')
dist_fig.supxlabel("Feature Value")
dist_fig.supylabel("Frequency")
dist_fig.tight_layout()  # Fix overlapping labels
plt.show()
