Import the Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

Data Preparation

In [None]:
# load the CSV File
df = pd.read_csv('data/spotify_songs.csv')

# Remove Columns the are Irrelevant
cols_to_drop = ['track_id', 'track_name', 'track_artist', 'track_album_id', 
                'track_album_name', 'track_album_release_date', 'playlist_name', 
                'playlist_id', 'playlist_subgenre']

# Drop the Columns and Remove the Null
new_df = df.drop(columns=cols_to_drop).dropna()

# Label Encoder
le = LabelEncoder()
new_df['playlist_genre'] = le.fit_transform(new_df['playlist_genre'])



       track_popularity  playlist_genre  danceability  energy  key  loudness  \
0                    66               2         0.748   0.916    6    -2.634   
1                    67               2         0.726   0.815   11    -4.969   
2                    70               2         0.675   0.931    1    -3.432   
3                    60               2         0.718   0.930    7    -3.778   
4                    69               2         0.650   0.833    1    -4.672   
...                 ...             ...           ...     ...  ...       ...   
32828                42               0         0.428   0.922    2    -1.814   
32829                20               0         0.522   0.786    0    -4.462   
32830                14               0         0.529   0.821    6    -4.899   
32831                15               0         0.626   0.888    2    -3.361   
32832                27               0         0.603   0.884    5    -4.571   

       mode  speechiness  acousticness 

Scale the Data

In [None]:
X = new_df.drop('playlist_genre', axis=1)
y = new_df['playlist_genre']

# Split into Training and testing Dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


Train the Model

In [None]:
# Import the Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Scale the Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

model = RandomForestClassifier(n_estimators=100, 
                               random_state=42)

model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

         edm       0.68      0.68      0.68      1218
       latin       0.51      0.41      0.46      1033
         pop       0.37      0.33      0.35      1081
         r&b       0.45      0.47      0.46      1031
         rap       0.57      0.67      0.62      1168
        rock       0.70      0.72      0.71      1036

    accuracy                           0.55      6567
   macro avg       0.54      0.55      0.54      6567
weighted avg       0.55      0.55      0.55      6567



Let's try XGBoost


In [17]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimator = 100,
    learning_rate = 0.1,
    max_depth = 6,
    objective='multi:softprob', 
    random_state=42
)

# Train the Model
xgb_model.fit(X_train_scaled, y_train)

# Predict
y_pred2 = xgb_model.predict(X_test_scaled)

# Classification Report
print(classification_report(y_test, y_pred2, target_names=le.classes_))

              precision    recall  f1-score   support

         edm       0.69      0.67      0.68      1218
       latin       0.49      0.40      0.44      1033
         pop       0.37      0.37      0.37      1081
         r&b       0.44      0.49      0.46      1031
         rap       0.56      0.68      0.61      1168
        rock       0.71      0.61      0.66      1036

    accuracy                           0.54      6567
   macro avg       0.54      0.54      0.54      6567
weighted avg       0.55      0.54      0.54      6567

