<a href="https://colab.research.google.com/github/DomenicaJ19/FinalProject-MusicMoodClassifier/blob/main/MusicMoodClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Mood Classifier - Final Project

Members: Domenica Jaramillo and Monique Villadiego.

This project classifies music into different mood categories using machine learning.

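**Mood Categories:**
- Happy
- Sad
- Energetic
- Calm
- Angry (planned category; `data_moods.csv` contains no songs with this label, so the trained model only predicts the four moods above)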

## Project Overview
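1. Load and preprocess the data (pre-extracted audio features from `data_moods.csv`; no raw audio files are processed in this notebook)
2. Select the audio features used for training (This notebook trains a mood classifier using the `data_moods.csv` dataset.)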
3. Train machine learning models
4. Evaluate and classify new songs

## Step 1: Install Required Packages

In [None]:
# Install required packages (uncomment if needed).
# %pip installs into the environment of the running kernel.
# %pip install -q pandas numpy scikit-learn

## Step 2: Import Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Silence every warning category for the rest of the session so cell outputs
# stay compact.
# NOTE(review): this blanket filter also hides scikit-learn/pandas warnings
# that may point at real problems — consider narrowing it.
warnings.filterwarnings('ignore')

# Title banner: a 60-character rule above and below the notebook title.
print("\n".join(("=" * 60, "MUSIC MOOD CLASSIFIER", "=" * 60)))

MUSIC MOOD CLASSIFIER


## Step 3: Load Dataset from CSV

In [None]:
# Read the song table from the working directory.
# On Colab, data_moods.csv has to be uploaded before this cell is run.
df = pd.read_csv('data_moods.csv')

# Report how many songs were read and how many carry each mood label.
n_songs = df.shape[0]
print(f"✓ Loaded {n_songs} songs from data_moods.csv")
print("\nMood distribution:")
print(df['mood'].value_counts())

✓ Loaded 686 songs from data_moods.csv

Mood distribution:
mood
Sad          197
Calm         195
Energetic    154
Happy        140
Name: count, dtype: int64


## Step 4: Prepare Features for Machine Learning

In [None]:
# Columns of data_moods.csv that are fed to the model as input features
feature_cols = [
    'danceability', 'energy', 'key', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature'
]

# Use 'length' as 'duration_ms' for consistency
# (presumably the track length in milliseconds — TODO confirm against the dataset docs)
if 'length' in df.columns:
    df['duration_ms'] = df['length']
    feature_cols.append('duration_ms')

# Extract features and labels
X = df[feature_cols]
y = df['mood']

# Keep only rows that have every feature AND a mood label.
# Checking the features alone would let a row with a missing label through to
# LabelEncoder, where (depending on the scikit-learn version) it either raises
# or silently becomes an extra "nan" class.
complete_rows = X.notna().all(axis=1) & y.notna()
if not complete_rows.all():
    print("Removing rows with missing values...")
    X = X[complete_rows]
    y = y[complete_rows]
    print(f"✓ Remaining samples: {len(X)}")

# Encode the string mood labels as integers (le.classes_ holds the mapping back)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 80/20 split, stratified so both parts keep the overall mood proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Fit the scaler on the training part only, then apply the same transform to
# the test part, so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n✓ Training set: {X_train_scaled.shape[0]} samples")
print(f"✓ Test set: {X_test_scaled.shape[0]} samples")
print(f"✓ Number of features: {X_train_scaled.shape[1]}")
print(f"✓ Classes: {list(le.classes_)}")


✓ Training set: 548 samples
✓ Test set: 138 samples
✓ Number of features: 12
✓ Classes: ['Calm', 'Energetic', 'Happy', 'Sad']


## Step 5: Train Model

In [None]:
# Build and fit the Random Forest in one expression; fit() returns the
# estimator itself, so rf_model ends up bound to the trained model.
# Settings: 100 trees, depth capped at 20, fixed seed, n_jobs=-1 for parallel fitting.
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=20, random_state=42, n_jobs=-1
).fit(X_train_scaled, y_train)
print("✓ Model trained successfully!")

Training Random Forest Classifier...
✓ Model trained successfully!


## Step 6: Evaluate Model

In [None]:
# Predicted (encoded) moods for both splits
y_train_pred = rf_model.predict(X_train_scaled)
y_test_pred = rf_model.predict(X_test_scaled)

# Share of correctly classified songs on each split; a large gap between the
# two numbers indicates the forest memorised the training data
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

for split_name, split_acc in (("Training", train_acc), ("Test", test_acc)):
    print(f"{split_name} Accuracy: {split_acc:.4f} ({split_acc*100:.2f}%)")

# Per-mood precision / recall / F1 on the held-out songs
print("\nClassification Report:")
report_text = classification_report(y_test, y_test_pred, target_names=le.classes_)
print(report_text)

# Rank features by the forest's importance scores, highest first, and list
# at most the first ten
print("\nTop 10 Most Important Features:")
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1][:10]
for rank, feature_pos in enumerate(indices, start=1):
    feature_name = feature_cols[feature_pos]
    print(f"  {rank}. {feature_name}: {importances[feature_pos]:.4f}")

Training Accuracy: 1.0000 (100.00%)
Test Accuracy: 0.7971 (79.71%)

Classification Report:
              precision    recall  f1-score   support

        Calm       0.86      0.95      0.90        39
   Energetic       0.69      0.81      0.75        31
       Happy       0.70      0.57      0.63        28
         Sad       0.89      0.80      0.84        40

    accuracy                           0.80       138
   macro avg       0.78      0.78      0.78       138
weighted avg       0.80      0.80      0.79       138


Top 10 Most Important Features:
  1. instrumentalness: 0.1952
  2. energy: 0.1740
  3. loudness: 0.1590
  4. acousticness: 0.1458
  5. valence: 0.0743
  6. speechiness: 0.0661
  7. duration_ms: 0.0483
  8. danceability: 0.0449
  9. liveness: 0.0394
  10. tempo: 0.0323


## Step 7: Example Predictions

In [None]:
# Predict on a few test samples.
# A seeded generator keeps the sampled songs (and therefore this cell's
# output) identical on every Restart & Run All; an unseeded np.random.choice
# would draw a different sample on each run.
rng = np.random.default_rng(42)
test_indices = rng.choice(len(X_test), size=min(5, len(X_test)), replace=False)
test_samples = X_test_scaled[test_indices]
test_labels = y_test[test_indices]

predictions = rf_model.predict(test_samples)
probabilities = rf_model.predict_proba(test_samples)

# predict_proba columns follow rf_model.classes_ (the encoded labels), so
# decode those rather than assuming they line up with le.classes_.
proba_class_names = le.inverse_transform(rf_model.classes_)

# Row labels of the sampled songs in the original dataframe, used to look up
# the song title and artist.
sampled_rows = X_test.index[test_indices]

for i, (pred, prob, true_label, test_idx) in enumerate(
        zip(predictions, probabilities, test_labels, sampled_rows), 1):
    mood = le.inverse_transform([pred])[0]
    true_mood = le.inverse_transform([true_label])[0]
    max_prob = prob.max()

    # Get song info from original dataframe
    song_name = df.loc[test_idx, 'name']
    artist = df.loc[test_idx, 'artist']

    print(f"\nSong {i}: {song_name} by {artist}")
    print(f"  True Mood: {true_mood.upper()}")
    print(f"  Predicted Mood: {mood.upper()} (Confidence: {max_prob:.2%})")
    print(f"  All Probabilities:")
    for class_name, class_prob in zip(proba_class_names, prob):
        print(f"    - {class_name}: {class_prob:.2%}")


Song 1: Superstition by Stevie Wonder
  True Mood: HAPPY
  Predicted Mood: HAPPY (Confidence: 92.00%)
  All Probabilities:
    - Calm: 0.00%
    - Energetic: 7.00%
    - Happy: 92.00%
    - Sad: 1.00%

Song 2: When You Left by Rouge Haven
  True Mood: CALM
  Predicted Mood: CALM (Confidence: 98.00%)
  All Probabilities:
    - Calm: 98.00%
    - Energetic: 0.00%
    - Happy: 0.00%
    - Sad: 2.00%

Song 3: To Build A Home by The Cinematic Orchestra
  True Mood: SAD
  Predicted Mood: SAD (Confidence: 82.00%)
  All Probabilities:
    - Calm: 18.00%
    - Energetic: 0.00%
    - Happy: 0.00%
    - Sad: 82.00%

Song 4: A Place for My Head by Linkin Park
  True Mood: ENERGETIC
  Predicted Mood: ENERGETIC (Confidence: 65.00%)
  All Probabilities:
    - Calm: 0.00%
    - Energetic: 65.00%
    - Happy: 33.00%
    - Sad: 2.00%

Song 5: Freya by Dave Pad
  True Mood: HAPPY
  Predicted Mood: HAPPY (Confidence: 46.00%)
  All Probabilities:
    - Calm: 8.00%
    - Energetic: 18.00%
    - Happy: 46.00%
    - Sad: 28.00%

## Step 8: Helper Function - Predict Mood for New Songs

In [None]:
def predict_mood_from_features(features_dict):
    """
    Predict mood for a new song given its audio features.

    Relies on the notebook globals created in the earlier steps:
    `feature_cols`, `scaler`, `rf_model` and `le`.

    Parameters:
    - features_dict: Dictionary with feature values, keyed by the names in
      `feature_cols`.
      Example: {'danceability': 0.8, 'energy': 0.7, 'valence': 0.8, ...}
      Keys that are not in `feature_cols` are ignored. A feature that is
      missing from the dictionary is filled with its training-set mean
      (`scaler.mean_`), i.e. it is treated as "average" rather than as a raw
      0, which would be far outside the training range for features such as
      tempo, loudness or duration.

    Returns:
    - predicted_mood: String with predicted mood
    - confidence_scores: Dictionary with confidence for each mood
    """
    # One row in the exact column order used for training; omitted features
    # fall back to the mean the scaler learned for that column.
    row = [
        features_dict.get(col, col_mean)
        for col, col_mean in zip(feature_cols, scaler.mean_)
    ]

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names instead of a bare array (avoids the feature-name
    # mismatch that the global warnings filter would otherwise hide).
    feature_frame = pd.DataFrame([row], columns=feature_cols)
    feature_vector_scaled = scaler.transform(feature_frame)

    # A single pass through the forest: the predicted class is the one with
    # the highest probability, which is what rf_model.predict() returns too.
    prediction_proba = rf_model.predict_proba(feature_vector_scaled)[0]

    # predict_proba columns follow rf_model.classes_ (encoded labels); decode
    # them so each probability is paired with the right mood name.
    class_names = le.inverse_transform(rf_model.classes_)
    mood = class_names[int(np.argmax(prediction_proba))]

    confidence_scores = {
        class_name: float(class_prob)
        for class_name, class_prob in zip(class_names, prediction_proba)
    }

    return mood, confidence_scores

# Confirm the helper is defined and show a usage sketch (printed as text only;
# nothing is predicted in this cell).
print("✓ Helper function created!")
print("\nExample usage:")
print(
    "  mood, confidence = predict_mood_from_features({",
    "      'danceability': 0.8,",
    "      'energy': 0.7,",
    "      'valence': 0.8,",
    "      'tempo': 120,",
    "      ...",
    "  })",
    sep="\n",
)

✓ Helper function created!

Example usage:
  mood, confidence = predict_mood_from_features({
      'danceability': 0.8,
      'energy': 0.7,
      'valence': 0.8,
      'tempo': 120,
      ...
  })


## Step 9: Test the Helper Function

In [None]:
# Hand-written feature values for a made-up song, covering all twelve
# features the model was trained on.
new_song_features = dict(
    danceability=0.8,
    energy=0.7,
    key=5,
    loudness=-5.0,
    speechiness=0.05,
    acousticness=0.2,
    instrumentalness=0.0,
    liveness=0.1,
    valence=0.8,  # High valence = happy
    tempo=120,
    time_signature=4,
    duration_ms=200000,
)

mood, confidence = predict_mood_from_features(new_song_features)

# Headline result first, then every mood ordered from most to least likely
print(f"Predicted Mood: {mood.upper()}")
print(f"Confidence: {confidence[mood]:.2%}")
print("\nAll Probabilities:")
for mood_class in sorted(confidence, key=confidence.get, reverse=True):
    print(f"  {mood_class}: {confidence[mood_class]:.2%}")

## Step 10: Save the Model (just in case)

In [None]:
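# Save the trained model for later use
# Uncomment to save:

# import joblib
# joblib.dump(rf_model, 'mood_classifier_model.pkl')
# joblib.dump(scaler, 'scaler.pkl')
# joblib.dump(le, 'label_encoder.pkl')
# print("✓ Model saved successfully!")

# To load later (in a fresh session joblib must be imported again; joblib
# files are pickle-based, so only load files you created yourself):
# import joblib
# rf_model = joblib.load('mood_classifier_model.pkl')
# scaler = joblib.load('scaler.pkl')
# le = joblib.load('label_encoder.pkl')
# Note: predict_mood_from_features also needs the `feature_cols` list, so
# re-run Step 4 (or save and reload that list as well) before calling it.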