In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import display

In [16]:
# Load the data
tracks = pd.read_csv("cleaned_data.csv")
playlists = pd.read_csv("playlist.csv")

# Preprocess the data
playlists['target'] = 1 # Add a target column (1: track is in a playlist, 0: track is not in a playlist)
merged_data = pd.merge(tracks, playlists[['track_id', 'target']], on='track_id', how='left')
merged_data['target'] = merged_data['target'].fillna(0) # Fill NaN values with 0 (track is not in a playlist)

# Encode the track_genre column
encoder = LabelEncoder()
merged_data['track_genre'] = encoder.fit_transform(merged_data['track_genre'])

# Select the feature columns and the target column
feature_columns = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
    'time_signature', 'track_genre'
]
features = merged_data[feature_columns]
target = merged_data['target']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))


Accuracy: 0.9963120799868874
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     24313
         1.0       1.00      0.01      0.02        91

    accuracy                           1.00     24404
   macro avg       1.00      0.51      0.51     24404
weighted avg       1.00      1.00      0.99     24404



In [17]:
# Calculate the probabilities of the tracks being in a playlist (high probability = high chance of being in a playlist)
probabilities = model.predict_proba(X_test_scaled)[:, 1]

# Get the top 10 tracks with the highest probabilities (top 10 recommended tracks)
top_10_indices = probabilities.argsort()[-10:][::-1]
recommended_tracks = X_test.iloc[top_10_indices]

recommended_details = tracks.loc[recommended_tracks.index, ['artists', 'album_name', 'track_name', 'track_genre', 'popularity']]
display(recommended_details)

Unnamed: 0,artists,album_name,track_name,track_genre,popularity
25704,Calvin Harris;Justin Timberlake;Halsey;Pharrel...,"Stay With Me (with Justin Timberlake, Halsey, ...","Stay With Me (with Justin Timberlake, Halsey, ...",electro,80
17359,Nicki Minaj,Super Freaky Girl,Super Freaky Girl,dance,92
48542,Ado,狂言,踊,j-pop,66
13004,SwitchOTR;A1 x J1,Coming for You (feat. A1 x J1),Coming for You (feat. A1 x J1),chill,74
70346,Jessica Keenan Wynn;Alice Lee;Elle McLemore,Heathers: The Musical (World Premiere Cast Rec...,Candy Store,show-tunes,67
73844,Erpeche,Doblando la Dosis,Aire Ke Respiro,spanish,59
675,Melvin Taylor;Lucky Peterson;Titus Williams;Ra...,"Plays the Blues for You (feat. Lucky Peterson,...",Cadillac Assembly Line,acoustic,55
41569,50 Cent,Get Rich Or Die Tryin',In Da Club,hip-hop,83
17179,Shawn Mendes,Illuminate (Deluxe),There's Nothing Holdin' Me Back,dance,86
39058,The Notorious B.I.G.,NOTORIOUS Music From and Inspired by the Origi...,Kick in the Door - 2008 Remaster,hardcore,59
