In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pre-processing phase
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Features Importance
from sklearn.inspection import permutation_importance

# Model
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Hyper-Parameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score

# Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import tqdm

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Kaggle input location of the dataset.
path = "../input/spotify-track-beta/dataframe_24972.csv"

# The literal string "nan" is additionally treated as a missing value while parsing.
spotify_tracks = pd.read_csv(
    path,
    na_values=["nan"],
    low_memory=False,
)
spotify_tracks.head()

In [None]:
# Distinct genre labels, in order of first appearance in the frame.
genres = spotify_tracks["track_genre"].unique()
genres

In [None]:
# Distinct values present in the track_explicit column.
spotify_tracks.track_explicit.unique()

In [None]:
# Number of missing values in each column.
spotify_tracks.isna().sum()

In [None]:
# Summary statistics of the loaded frame (describe() with its default settings).
spotify_tracks.describe()

In [None]:
# Expand the stringified pitch vector of every track ("[v0, v1, ...]") into 12 columns.
n = spotify_tracks.shape[0]

# Strip the surrounding brackets, split on commas and convert each piece to float so the
# new columns are numeric instead of strings. Iterating over the column values (rather
# than looking rows up by label with [i]) also works when the index is not 0..n-1.
list_pitches = [
    [float(value) for value in raw.strip('][').split(',')]
    for raw in spotify_tracks['audio_avg_pitches']
]

# Reuse the source index so a later column-wise concat aligns row by row.
split_pitch = pd.DataFrame(
    list_pitches,
    columns=["pitch" + str(i) for i in range(12)],
    index=spotify_tracks.index,
)
split_pitch

In [None]:
# Append the pitch columns side by side and discard the original stringified vector.
spotify_tracks = (
    pd.concat([spotify_tracks, split_pitch], axis=1)
    .drop(columns='audio_avg_pitches')
)

spotify_tracks

In [None]:
# Expand the stringified timbre vector of every track ("[v0, v1, ...]") into 12 columns.
n = spotify_tracks.shape[0]

# Strip the surrounding brackets, split on commas and convert each piece to float so the
# new columns are numeric instead of strings. Iterating over the column values (rather
# than looking rows up by label with [i]) also works when the index is not 0..n-1.
list_timbre = [
    [float(value) for value in raw.strip('][').split(',')]
    for raw in spotify_tracks['audio_avg_timbre']
]

# Reuse the source index so a later column-wise concat aligns row by row.
split_timbre = pd.DataFrame(
    list_timbre,
    columns=["timbre" + str(i) for i in range(12)],
    index=spotify_tracks.index,
)
split_timbre

In [None]:
# Append the timbre columns side by side and discard the original stringified vector.
spotify_tracks = (
    pd.concat([spotify_tracks, split_timbre], axis=1)
    .drop(columns='audio_avg_timbre')
)

spotify_tracks

In [None]:
# Total number of missing cells across the whole frame.
spotify_tracks.isna().sum().sum()

In [None]:
# Remove exact duplicate rows and report how many were dropped.
before_sp = spotify_tracks.copy()
# drop_duplicates keeps the original row labels, leaving gaps in the index. A later cell
# concatenates a freshly built 0..n-1 frame column-wise, which would then misalign rows
# and introduce NaNs, so the index is made contiguous again here.
spotify_tracks = spotify_tracks.drop_duplicates().reset_index(drop=True)
after_sp = spotify_tracks.copy()
print("DUPLICATES")
print("Old Shape - New Shape: ", before_sp.shape[0] - after_sp.shape[0])

In [None]:
# Preview the first rows of the de-duplicated frame.
spotify_tracks.head()

In [None]:
# Distinct values present in the album_release_date_precision column.
spotify_tracks["album_release_date_precision"].unique()

In [None]:
# One-hot encode the album_release_date_precision column.
encoder = OneHotEncoder(handle_unknown='ignore')

encoded_precision = encoder.fit_transform(spotify_tracks[["album_release_date_precision"]]).toarray()

# The encoder emits one column per entry of encoder.categories_[0], in that (sorted) order,
# so the column names are derived from it instead of being hard-coded in a fixed order
# that may not match. The source index is reused so the column-wise concat that follows
# aligns row by row even when the index is not 0..n-1.
album_release_date_precision_df = pd.DataFrame(
    encoded_precision,
    columns=["album_release_date_precision_" + str(category) for category in encoder.categories_[0]],
    index=spotify_tracks.index,
)
album_release_date_precision_df

In [None]:
# Append the one-hot columns side by side and discard the original categorical column.
spotify_tracks = (
    pd.concat([spotify_tracks, album_release_date_precision_df], axis=1)
    .drop(columns='album_release_date_precision')
)

spotify_tracks

In [None]:
# Count the missing cells, then drop every row that has at least one missing value.
# NOTE: the count on the next line is not the cell's last expression, so its value is discarded.
spotify_tracks.isna().sum().sum()
spotify_tracks = spotify_tracks.dropna(how="any")


In [None]:
# Total number of missing cells across the whole frame (checked again after dropna).
spotify_tracks.isna().sum().sum()
# Per-column alternative: spotify_tracks.isnull().sum()

In [None]:
# Parse the release date, expose its year / month / day as separate columns,
# then drop the raw date column.
release_date = pd.to_datetime(spotify_tracks['album_release_date'])

spotify_tracks = spotify_tracks.assign(
    year=release_date.dt.year,
    month=release_date.dt.month,
    day=release_date.dt.day,
).drop(columns='album_release_date')

spotify_tracks

In [None]:
# Label-encode the target and the text columns: each distinct string becomes an integer id.
# NOTE: the same LabelEncoder instance is refit for every column, so after this cell
# `encoder` only holds the mapping of the last column fitted (track_name).
encoder = LabelEncoder()

for column in ["track_genre", "album_name", "artist_name", "track_name"]:
    spotify_tracks[column] = encoder.fit_transform(spotify_tracks[column])

# Report the cardinality of each encoded text column; every line reads its own column
# (previously all three lines printed the album_name count).
for column in ["album_name", "artist_name", "track_name"]:
    print(column + " - Unique Value: ", len(spotify_tracks[column].unique()), "Total Value: ", spotify_tracks.shape[0])

spotify_tracks

In [None]:
# Cast the track_explicit column to integers.
spotify_tracks = spotify_tracks.assign(
    track_explicit=spotify_tracks["track_explicit"].astype(int)
)
spotify_tracks

In [None]:
# Remove the track_uri and id columns in a single call.
spotify_tracks = spotify_tracks.drop(columns=['track_uri', 'id'])

DISCRETIZATION

In [None]:
# CLASS BALANCING: oversample with SMOTE (default sampling strategy, fixed seed).
# NOTE(review): the resampling runs on the full dataset, before the train/test split done
# later in the notebook, so synthetic rows may be derived from rows that end up in the
# test set. Consider resampling the training split only.
spotify_tracks = spotify_tracks.copy()

y = spotify_tracks['track_genre']
X = spotify_tracks.drop(columns='track_genre')

smote_enc = SMOTE(random_state=0)
X_res, y_res = smote_enc.fit_resample(X, y)

# Stitch the resampled features and target back into a single frame.
resampled_parts = [pd.DataFrame(X_res), pd.DataFrame(y_res)]
spotify_tracks = pd.concat(resampled_parts, axis=1)
spotify_tracks['track_genre'].value_counts()

In [None]:
# SCALE
spotify_tracks = spotify_tracks.copy()

quantile_transformer = preprocessing.QuantileTransformer(n_quantiles=112, output_distribution='normal', random_state=0)


def _show_duration_hist(title):
    """Plot a histogram of the current audio_duration_ms column under the given title."""
    plt.hist(spotify_tracks["audio_duration_ms"])
    plt.title(title)
    plt.show()


_show_duration_hist('audio_duration_ms - Before scaling')

# The transformer is refit on each feature column separately; the target is left untouched.
# fit_transform() needs a 2D input, hence the reshape into a single-column array.
feature_columns = [name for name in spotify_tracks.columns if name != 'track_genre']
for el in feature_columns:
    column_2d = spotify_tracks[el].values.reshape(-1, 1)
    spotify_tracks[el] = quantile_transformer.fit_transform(column_2d)

_show_duration_hist('audio_duration_ms - After scaling')

In [None]:
# NORMALIZE
spotify_tracks = spotify_tracks.copy()

print("Before Normalization: \n", spotify_tracks['audio_duration_ms'])

# Rescale every feature column (the target is skipped) with an L2 norm.
# normalize() expects a 2D array, so the column is wrapped in a list to form a single
# row, and [0] takes the normalized row back out. The previously computed (and never
# used) reshaped copy of the column has been removed.
for el in spotify_tracks.columns:
    if el != 'track_genre':
        spotify_tracks[el] = preprocessing.normalize([spotify_tracks[el]], norm='l2')[0]

print("After Normalization: \n", spotify_tracks['audio_duration_ms'])

In [None]:
# SPLIT: 30% of the rows are held out for testing, with a fixed seed.
spotify_tracks = spotify_tracks.copy()
X = spotify_tracks.drop(columns='track_genre')
y = spotify_tracks['track_genre']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=112,
)

In [None]:
%%time

rfc_clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
rfc_clf.fit(X_train, y_train)

r = permutation_importance(rfc_clf, X, y, n_repeats=10, random_state=0)

# Dict to store features and their importance
less_important = {}

for i in r.importances_mean.argsort()[::-1]:
    less_important[spotify_tracks.columns[i]] = r.importances_mean[i]
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{r.importances_mean[i]:.3f}" f" +/- {r.importances_std[i]:.3f} " f"{spotify_tracks.columns[i]}")

In [None]:
# Identical call (same arguments and seed) to the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 112)

# Class names of the target.
# track_genre was label-encoded, and LabelEncoder assigns integer ids following the sorted
# order of the class names. classification_report pairs target_names with the sorted
# labels, so the names must be sorted as well: `genres` comes from unique(), which
# returns values in order of appearance and would attach names to the wrong classes.
target_names = sorted(pd.Series(genres).dropna())

In [None]:
%%time

dtc_parameter = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 10, 25, 50, 100],
    'max_depth': [5, 10, 50, 100]}

dtc = GridSearchCV(tree.DecisionTreeClassifier(), dtc_parameter, n_jobs = -1) 
dtc.fit(X_train, y_train) 
dtc_best_params = dtc.best_params_
print("Best parameters:", dtc_best_params)

In [None]:
# Evaluate the tuned decision tree on the held-out split.
dtc_predicted = dtc.predict(X_test)

dtc_confusion = confusion_matrix(y_test, dtc_predicted)
dtc_report = classification_report(y_test, dtc_predicted, target_names=target_names)

print("Confusion Matrix: \n", dtc_confusion, "\n")
print("Classification Report: \n", dtc_report)

In [None]:
%%time

dtc_clf = tree.DecisionTreeClassifier(criterion = dtc_best_params['criterion'],
                                    min_samples_leaf = dtc_best_params['min_samples_leaf'],
                                    max_depth = dtc_best_params['max_depth'])

dtc_scores = cross_val_score(dtc_clf, X, y, cv=5, n_jobs=-1)
print("Max Accuracy:", "{:.2f}".format(max(dtc_scores)), " Mean Accuracy:", "{:.2f}".format(dtc_scores.mean()), "+/-", "{:.2f}".format(dtc_scores.std()*2))

In [None]:
%%time

# I avoided using the "rbf" kernel because trying it took too long to run (over an afternoon).
svm_parameters = {
    'kernel': ['linear'],
    'gamma': [1, 1e-1, 1e-2],
    'C': [100, 1000, 100000]}

svc = GridSearchCV(svm.SVC(), svm_parameters, n_jobs=-1)
svc.fit(X_train, y_train)  

svc_best_params = svc.best_params_
print("Best parameters:", svc_best_params)

In [None]:
# Evaluate the tuned SVM on the held-out split.
svc_predicted = svc.predict(X_test)

svc_confusion = confusion_matrix(y_test, svc_predicted)
svc_report = classification_report(y_test, svc_predicted, target_names=target_names)

print("Confusion Matrix: \n", svc_confusion, "\n")
print("Classification Report: \n", svc_report)

In [None]:
%%time

svc_clf = svm.SVC(kernel = svc_best_params['kernel'], gamma = svc_best_params['gamma'], C = svc_best_params['C'])

svc_scores = cross_val_score(svc_clf, X, y, cv=5, n_jobs=-1)

print("Max Accuracy:", "{:.2f}".format(max(svc_scores)), " Mean Accuracy:", "{:.2f}".format(svc_scores.mean()), "+/-", "{:.2f}".format(svc_scores.std()*2))

In [None]:
%%time

parameter_RF = {
    'min_samples_leaf': [1, 3],
    'n_estimators': [500, 1000],
    'max_depth': [50, 100]}

rf = GridSearchCV(RandomForestClassifier(), parameter_RF, n_jobs=-1)
rf.fit(X_train, y_train) 

rf_best_params = rf.best_params_
print("Best parameters:", rf_best_params)

In [None]:
# Evaluate the tuned random forest on the held-out split.
rf_predicted = rf.predict(X_test)

rf_confusion = confusion_matrix(y_test, rf_predicted)
rf_report = classification_report(y_test, rf_predicted, target_names=target_names)

print("Confusion Matrix: \n", rf_confusion, "\n")
print("Classification Report: \n", rf_report)

In [None]:
%%time

rf_clf = RandomForestClassifier(n_estimators = rf_best_params['n_estimators'], 
                                min_samples_leaf = rf_best_params['min_samples_leaf'],
                                max_depth = rf_best_params['max_depth'])

rf_scores = cross_val_score(rf_clf, X, y, cv=5, n_jobs=-1)

print("Max Accuracy: ", "{:.2f}".format(max(rf_scores)), " Mean Accuracy:", "{:.2f}".format(rf_scores.mean()), "+/-", "{:.2f}".format(rf_scores.std()*2))

In [None]:
%%time

parameter_MLP = {
    'hidden_layer_sizes' : [(10, 5, 2), (20, 10, 5), (30, 15, 10)],
    'alpha' : [1e-2, 1e-3, 1e-4, 1e-5],
    'solver' : ['lbfgs', 'sgd', 'adam'],
    'tol': [1e-2, 1e-3, 1e-4]}

mlpn = RandomizedSearchCV(MLPClassifier(), parameter_MLP, n_jobs=-1)
mlpn.fit(X_train, y_train)

mlp_best_params = mlpn.best_params_
print("Best parameters:", mlp_best_params)

In [None]:
# Evaluate the tuned MLP on the held-out split.
mpl_predicted = mlpn.predict(X_test)

mlp_confusion = confusion_matrix(y_test, mpl_predicted)
mlp_report = classification_report(y_test, mpl_predicted, target_names=target_names)

print("Confusion Matrix: \n", mlp_confusion, "\n")
print("Classification Report: \n", mlp_report)

In [None]:
%%time

mlpn_clf = MLPClassifier(solver = mlp_best_params['solver'],
                    hidden_layer_sizes = mlp_best_params['hidden_layer_sizes'],
                    alpha = mlp_best_params['alpha'],
                    tol = mlp_best_params['tol'])

mpl_scores = cross_val_score(mlpn_clf, X, y, cv=5, n_jobs=-1)
print("Max Accuracy: ", "{:.2f}".format(max(mpl_scores)), " Mean Accuracy:", "{:.2f}".format(mpl_scores.mean()), "+/-", "{:.2f}".format(mpl_scores.std()*2))