# Machine Learning

**Aim:**

- **Task 1: Genre Classification** — predict `track_genre` from audio features (supervised ML).
- **Task 2: Song Recommendation** — recommend similar songs based on audio similarity (unsupervised ML).

## Step 1: Import Libraries

In [2]:
# Import libraries

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ML models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Recommendation
from sklearn.neighbors import NearestNeighbors

## Step 2: Load Dataset

In [3]:
# Load the cleaned Spotify dataset; the path is relative to the notebook's working directory.
dataset_path = '../data/03_spotify_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)

Inspecting the data types of the candidate feature columns (the columns that `X` is built from in Step 3)

In [7]:
# `X` is only created in Step 3.1, so inspecting it here fails on a fresh kernel
# (Restart & Run All). Look at the same candidate feature columns straight from
# `df` instead: everything except the free-text columns and the target.
# NOTE(review): assumes `df` holds no other non-feature columns - confirm against the CSV.
df.drop(columns=['artists', 'album_name', 'track_name', 'track_genre']).dtypes


popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
popularity_bin       object
danceability_bin     object
energy_bin           object
dtype: object

## Step 3: Prepare Data for Genre Classification

- X will contain only the raw numeric features (e.g., popularity, danceability, energy, etc.)
- y will contain encoded genre labels (e.g., 0 = "pop", 1 = "rock", ...)
- X_scaled is the input to feed into your ML model

Step 3.1: Select features (X) and drop irrelevant columns

In [None]:
# Columns that are left out of the feature matrix:
# - text_columns:   free text (would need NLP to be useful)
# - target_column:  the variable we are trying to predict
# - binned_columns: manually created categorical bins (not useful for raw ML modeling)
text_columns = ['artists', 'album_name', 'track_name']
target_column = ['track_genre']
binned_columns = ['popularity_bin', 'danceability_bin', 'energy_bin']

X = df.drop(columns=text_columns + target_column + binned_columns)

Step 3.2: Prepare target variable (y)

In [9]:
# ML models need numeric targets, so map every genre name to an integer class id.
# The fitted encoder stays available as `le` so class ids can be decoded back to names.
le = LabelEncoder()
le.fit(df['track_genre'])
y = le.transform(df['track_genre'])

Step 3.3: Normalise Numeric Feature Values

In [None]:
# Standardise the feature columns so they share a similar range; this matters
# most for algorithms that are sensitive to feature magnitude.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the scaling. Fit on the training
# split only if a scale-sensitive model is ever evaluated on X_test.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Step 4: Train-Test Split

In [None]:
# Hold out 20% of the tracks for evaluation. Stratifying on y keeps the genre
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


## Step 5: Train Classifier (Random Forest)

In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted (still label-encoded) genre for every held-out track.
y_pred = clf.predict(X_test)

## Step 6: Evaluate Classification Performance

In [None]:
# Turn the encoded class ids back into genre names so the report is readable.
y_test_labels, y_pred_labels = (le.inverse_transform(ids) for ids in (y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

# Confusion matrix heatmap (rows = actual genre, columns = predicted genre).
genre_names = le.classes_
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=genre_names)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    xticklabels=genre_names,
    yticklabels=genre_names,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


## Step 7: Song Recommendation System (Using Audio Features)

We'll use K-Nearest Neighbors to find songs that are similar based on features like danceability, energy, valence, etc.

In [None]:
# Use the same features and scaling as the classifier: reuse the scaler that was
# already fitted on X instead of re-fitting it (re-fitting silently overwrites the
# state of a shared, already-fitted transformer).
audio_features = scaler.transform(X)

# Fit the nearest-neighbour index using cosine distance.
# n_neighbors=6 leaves room for the query track plus 5 recommendations. The query
# is usually, but not necessarily, the first neighbour returned: tracks with
# identical feature vectors tie at distance 0.
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(audio_features)


Recommendation Function

In [None]:
def recommend_song(song_index, df_original, model, features_scaled):
    """Print the tracks most similar to the track at ``song_index``.

    Parameters
    ----------
    song_index : int
        Positional (``iloc``) index of the query track. Negative values count
        from the end, as with normal Python indexing.
    df_original : pandas.DataFrame
        Frame providing the 'track_name', 'artists' and 'track_genre' columns
        used for display. Its row positions must line up with ``features_scaled``.
    model : object
        Fitted neighbours model exposing ``kneighbors(X)`` that returns
        ``(distances, indices)``.
    features_scaled : array-like
        Feature rows, indexable by position and supporting ``len()``.

    Returns
    -------
    None
        The selected song and its recommendations are printed.
    """
    _, indices = model.kneighbors([features_scaled[song_index]])
    neighbour_ids = indices[0]

    # Resolve a negative index to its absolute position so it can be compared
    # with the (non-negative) positions returned by the model.
    query_pos = song_index if song_index >= 0 else song_index + len(features_scaled)

    # Do not assume the query is the first neighbour: rows with identical feature
    # vectors tie at distance 0 and may be returned ahead of it. Remove the query
    # explicitly, then keep the same number of recommendations as before
    # (all returned neighbours minus one slot reserved for the query).
    max_recommendations = max(len(neighbour_ids) - 1, 0)
    recommended_ids = [idx for idx in neighbour_ids if idx != query_pos][:max_recommendations]

    selected = df_original.iloc[song_index]
    print(f"\nSelected Song: {selected['track_name']} - {selected['artists']}\n")
    print("Recommended Songs:\n")
    for idx in recommended_ids:
        row = df_original.iloc[idx]
        print(f"{row['track_name']} - {row['artists']} (Genre: {row['track_genre']})")


In [None]:
# Example song index to recommend
# Recommend songs similar to the track at positional index 10 (the 11th row) of the dataset
recommend_song(10, df, knn, audio_features)
