In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import display

In [47]:
# Load the data
tracks = pd.read_csv("cleaned_data.csv")
playlists = pd.read_csv("playlist.csv")

# Preprocess the data
playlists['target'] = 1 # Add a target column (1: track is in a playlist, 0: track is not in a playlist)
merged_data = pd.merge(tracks, playlists[['track_id', 'target']], on='track_id', how='left')
merged_data['target'] = merged_data['target'].fillna(0) # Fill NaN values with 0 (track is not in a playlist)

# Encode the track_genre column
encoder = LabelEncoder()
merged_data['track_genre'] = encoder.fit_transform(merged_data['track_genre'])

# Select the feature columns and the target column
feature_columns = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
    'time_signature', 'track_genre'
]
features = merged_data[feature_columns]
target = merged_data['target']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))


Accuracy: 0.9900835928536306
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     24106
         1.0       0.58      0.67      0.62       298

    accuracy                           0.99     24404
   macro avg       0.79      0.83      0.81     24404
weighted avg       0.99      0.99      0.99     24404



In [50]:
# Remove tracks that are already in the playlist from X_test_scaled
X_test_scaled = X_test_scaled[y_test == 0]
y_test = y_test[y_test == 0]

# Calculate the probabilities of the tracks being in a playlist (high probability = high chance of being in a playlist)
probabilities = model.predict_proba(X_test_scaled)[:, 1]

# Get the top 10 tracks with the highest probabilities (top 10 recommended tracks)
top_10_indices = probabilities.argsort()[-10:][::-1]
recommended_tracks = X_test.iloc[top_10_indices]

recommended_details = tracks.loc[recommended_tracks.index, ['artists', 'album_name', 'track_name', 'track_genre', 'popularity']]
recommended_details['probability'] = probabilities[top_10_indices]
display(recommended_details)


# X_train_scaled = X_train_scaled[y_train == 0]
# y_train = y_train[y_train == 0]

# probabilities_train = model.predict_proba(X_train_scaled)[:, 1]
# top_10_indices_train = probabilities_train.argsort()[-10:][::-1]
# recommended_tracks_train = X_train.iloc[top_10_indices_train]

# recommended_details_train = tracks.loc[recommended_tracks_train.index, ['artists', 'album_name', 'track_name', 'track_genre', 'popularity']]
# recommended_details_train['probability'] = probabilities_train[top_10_indices_train]
# display(recommended_details_train)

Unnamed: 0,artists,album_name,track_name,track_genre,popularity,probability
16052,Muck Sticky,The Sticky Muck,Snuffaluppagus,comedy,23,0.81
80969,Michael W. Smith,Decades of Worship,Agnus Dei,world-music,42,0.76
17217,Bruno Mars,Doo-Wops & Hooligans,Talking to the Moon,dance,82,0.73
14414,Wolfgang Amadeus Mozart;Danielle Laval,Mozart: A Night of Classics,"12 Variations on an Allegretto in B Flat, K.50...",classical,25,0.73
15617,Jim Norton,Despicable - EP,"The Bridge, Lol",comedy,30,0.72
35887,Pop Evil,Breathe Again,Breathe Again,grunge,54,0.72
13158,itssvd;Shiloh Dynasty,tape for when i'm sad,i don't feel part of the world anymore,chill,68,0.71
37691,Dj Mad Dog,Till I Die Vol.1,XTC - Radio Edit,happy,47,0.71
64884,MAGIC!,Don't Kill the Magic,Rude,reggae,81,0.71
56604,Secret Garden,Songs From A Secret Garden,Song From A Secret Garden,new-age,58,0.69
