# Music Recommendation System

## 1. Data Loading and Pre-processing

In [None]:
import pandas as pd

# Load the raw dataset, then discard rows containing any missing value and
# rows that are exact duplicates of an earlier row, in one chained pipeline.
df = (
    pd.read_csv("spotify_dataset.csv")
    .dropna()
    .drop_duplicates()
)

# Report the (rows, columns) that remain after cleaning.
print("Data loaded and pre-processed. Shape: ", df.shape)

## 2. Correlation Matrix of Features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the float64 / int64 columns; these are the features used for the
# correlation matrix here and for scaling in the next section.
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between every pair of numerical columns.
corr = numerical_df.corr()

# Draw the annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

## 3. Finding Optimal Number of Clusters (Elbow Method)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise the numerical features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(numerical_df)
scaled_features = scaler.transform(numerical_df)

# Fit K-Means for k = 1..10 and record the inertia of each fit so the
# "elbow" can be read off the curve below.
K = range(1, 11)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Inertia versus number of clusters.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

## 4. K-Means Clustering and Visualization

In [None]:
from sklearn.decomposition import PCA

# Perform K-Means clustering with the chosen number of clusters and store
# the label of every song on the main DataFrame.
k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(scaled_features)

# Reduce the scaled features to two dimensions for plotting.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# Create a DataFrame with the principal components and cluster labels.
# `df` keeps its original (gapped) index after dropna()/drop_duplicates(),
# so the PCA frame must reuse that index; otherwise the column assignment
# below aligns on mismatched labels and yields wrong or NaN clusters.
pca_df = pd.DataFrame(
    data=principal_components,
    columns=['principal component 1', 'principal component 2'],
    index=df.index,
)
pca_df['cluster'] = df['cluster']

# Plot the clusters in the 2-D PCA space, coloured by cluster label.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='principal component 1',
    y='principal component 2',
    hue='cluster',
    data=pca_df,
    palette='viridis',
)
plt.title('Clusters of Songs')
plt.show()

## 5. Cluster Analysis

In [None]:
# Analyze clusters by playlist genre: one row per cluster, one column per
# genre, each cell holding the number of songs (0 where a pair never occurs).
print("Cluster distribution by playlist genre:")
print(df.groupby(['cluster', 'playlist_genre']).size().unstack(fill_value=0))

# Analyze clusters by playlist name (top 5 per cluster).
# Iterate over the labels actually present in `df` rather than range(k):
# `k` is also the loop variable of the elbow-method cell, so its value
# depends on which cell ran last.
print("\nTop 5 playlist names per cluster:")
for i in sorted(df['cluster'].unique()):
    print(f"\nCluster {i}:")
    playlist_counts = df[df['cluster'] == i].groupby('playlist_name').size().nlargest(5)
    print(playlist_counts)

## 6. Recommendation Model

In [None]:
def recommend_songs(track_name, df, n=5):
    """Recommend other songs from the same cluster as the input song.

    Args:
        track_name: Exact title to look up in ``df['track_name']``.
        df: DataFrame with ``track_name`` and ``cluster`` columns.
        n: Maximum number of recommendations to return (default 5).

    Returns:
        A Series of up to ``n`` distinct track names, randomly sampled from
        the cluster of the first row matching ``track_name`` and excluding
        the input title itself. The Series is empty when the cluster holds
        no other title. Returns the string ``"Song not found."`` when no row
        matches ``track_name``.
    """
    matches = df.loc[df['track_name'] == track_name, 'cluster']
    if matches.empty:
        return "Song not found."
    cluster = matches.iloc[0]

    # Candidate titles: same cluster, not the query song, each title once.
    same_cluster = (df['cluster'] == cluster) & (df['track_name'] != track_name)
    candidates = df.loc[same_cluster, 'track_name'].drop_duplicates()

    # Cap the sample size so small clusters do not make sample() raise.
    count = min(n, len(candidates))
    if count <= 0:
        return candidates.iloc[:0]
    return candidates.sample(count)

# Demo: print a header, then the recommendations for one known track.
print("Recommendations for 'bad guy':")
demo_result = recommend_songs('bad guy', df)
print(demo_result)