## Recommendation System

### Creating voting ensemble on 4 method: cosine similarity, truncated svd, autoencoder, and centroid distance-based similarity

In [29]:
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Model
from keras.layers import Input, Dense
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Read data
data = pd.read_csv(f'GTZAN/features_30_sec.csv', index_col='filename')

# Extract labels
labels = data[['label']]

# Drop labels from original dataframe
data = data.drop(columns=['length', 'label'])
data.head()

# Scale the data
data_scaled = preprocessing.scale(data)

# --- Step 1: K-Means Clustering ---
# Fit KMeans and assign clusters
kmeans = KMeans(n_clusters=10, n_init='auto', random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Assign clusters to the songs
data['cluster'] = clusters

# Compute cluster centroids
centroids = kmeans.cluster_centers_

# Function to normalize similarity scores using Min-Max scaling
def normalize_similarities(similarity_matrix):
    return minmax_scale(similarity_matrix, feature_range=(0, 1))

# --- Step 2: Cosine Similarity on Raw Data ---
cosine_sim = cosine_similarity(data_scaled)
cosine_sim_normalized = normalize_similarities(cosine_sim)

# --- Step 3: Truncated SVD ---
svd = TruncatedSVD(n_components=10)  # Reduce to 10 latent factors
data_svd = svd.fit_transform(data_scaled)
svd_sim = cosine_similarity(data_svd)
svd_sim_normalized = normalize_similarities(svd_sim)

# --- Step 4: Autoencoder ---
input_dim = data_scaled.shape[1]
encoding_dim = 64  # Latent space dimension for autoencoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(data_scaled, data_scaled, epochs=50, batch_size=256, shuffle=True)

# Get latent features from encoder
data_encoded = encoder.predict(data_scaled)
autoencoder_sim = cosine_similarity(data_encoded)
autoencoder_sim_normalized = normalize_similarities(autoencoder_sim)

# --- Step 5: Centroid Distance-Based Similarity ---
# Compute Euclidean distance from each song to its cluster centroid
centroid_distances = np.zeros(data_scaled.shape[0])
for i, cluster_label in enumerate(clusters):
    centroid_distances[i] = np.linalg.norm(data_scaled[i] - centroids[cluster_label])

# Convert centroid distances to similarity scores (closer songs have higher similarity)
# We'll invert the distances by subtracting from the max distance, and then normalize.
max_distance = centroid_distances.max()
centroid_similarity = max_distance - centroid_distances
centroid_similarity_normalized = normalize_similarities(centroid_similarity.reshape(-1, 1))

# Create a pairwise similarity matrix where songs in the same cluster get similarity based on centroid distance
# Ensure the matrix is initialized with float dtype to avoid assignment issues
centroid_sim_matrix = pd.DataFrame(0.0, index=labels.index, columns=labels.index)  # Initialize with 0.0 (float)

for cluster_label in data['cluster'].unique():
    cluster_songs = data[data['cluster'] == cluster_label].index
    for song_a in cluster_songs:
        for song_b in cluster_songs:
            centroid_sim_matrix.loc[song_a, song_b] = (
                centroid_similarity_normalized[labels.index.get_loc(song_a)] +
                centroid_similarity_normalized[labels.index.get_loc(song_b)]
            ) / 2  # Average the similarity between two songs based on their centroid distance

centroid_sim_matrix_normalized = normalize_similarities(centroid_sim_matrix.values)

# --- Step 6: Voting and Aggregation ---
# Combine the normalized similarities including the centroid distance similarity
combined_sim = (cosine_sim_normalized + svd_sim_normalized + autoencoder_sim_normalized + centroid_sim_matrix_normalized) / 4

# Convert into a dataframe with proper labels
combined_sim_df = pd.DataFrame(combined_sim, index=labels.index, columns=labels.index)

# --- Step 7: Find and Rank Similar Songs ---
def find_similar_songs_ensemble_with_centroid(name, top_n=5):
    # Rank based on the combined similarity
    series = combined_sim_df[name].sort_values(ascending=False)
    
    # Drop the song itself from the recommendations
    series = series.drop(name)
    
    # Return top N recommendations
    print(f"\n*******\nTop {top_n} similar songs to {name}:")
    print(series.head(top_n))


Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2137  
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1739
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.0756 
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.0375 
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step - loss: 0.9483
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 806us/step - loss: 0.9283
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.8593
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.8361 
Epoch 9/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - loss: 0.8334
Epoch 10/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.8070
Epoch 11/50
[1m4/

In [30]:
find_similar_songs_ensemble_with_centroid('pop.00019.wav', top_n=5)


*******
Top 5 similar songs to pop.00019.wav:
filename
pop.00023.wav    0.962249
pop.00078.wav    0.953471
pop.00034.wav    0.946122
pop.00088.wav    0.939833
pop.00016.wav    0.937507
Name: pop.00019.wav, dtype: float64
