In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import classification_report
import lightgbm as lgbm
import xgboost as xgb

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from imblearn.over_sampling import SMOTE, ADASYN

In [5]:
from sklearn.model_selection import train_test_split

df_data = pd.read_csv('../data/recleaned_data_stdscle_v2_wartists.csv')


In [6]:
labels = df_data.decade

df_data = df_data.drop(['decade','year'], axis=1)
df_data.head()

Unnamed: 0,acousticness,danceability,duration_ms,explicit,instrumentalness,key,liveness,loudness,mode,popularity,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1.290829,0.34309,-0.586049,-0.276314,-0.578414,5,0.929594,-0.152122,0,-0.63573,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.277648,-1.888467,-0.431375,-0.276314,1.401622,8,0.123664,-0.645386,1,-1.184737,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.298737,-0.83563,-0.620337,-0.276314,-0.117816,2,-0.630575,-0.608907,1,-1.184737,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.293465,-1.110283,-0.554578,-0.276314,-0.566669,0,-0.304312,-0.388445,1,-1.184737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.027206,-0.732635,-0.434248,3.619065,-0.579976,0,-0.671149,-0.372584,0,-1.184737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_data.columns

Index(['acousticness', 'danceability', 'duration_ms', 'explicit',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity',
       'speechiness', 'tempo', 'valence', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11'],
      dtype='object')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_data, labels, test_size=0.20)

In [9]:
X_train.shape, y_train.shape, y_test.shape

((125683, 25), (125683,), (31421,))

## SMOTE

In [10]:
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [11]:
X_resampled.shape, y_resampled.shape

((175186, 25), (175186,))

### KNN Classifier

In [12]:
#KNN classifier
classifier = KNeighborsClassifier(n_neighbors=27)
classifier.fit(X_resampled, y_resampled)
y_pred = classifier.predict(X_test)
# from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.24      0.58      0.34      1051
        1930       0.37      0.46      0.41      2056
        1940       0.39      0.39      0.39      2731
        1950       0.42      0.27      0.33      3674
        1960       0.41      0.45      0.43      3645
        1970       0.37      0.39      0.38      3629
        1980       0.39      0.33      0.36      3636
        1990       0.39      0.30      0.34      3671
        2000       0.33      0.43      0.37      2524
        2010       0.64      0.31      0.41      3977
        2020       0.20      0.56      0.30       827

    accuracy                           0.37     31421
   macro avg       0.38      0.41      0.37     31421
weighted avg       0.41      0.37      0.37     31421



### XGB and LGBM

In [13]:
model_lgbm = lgbm.LGBMClassifier()

model_lgbm.fit(X_resampled, y_resampled)

model_lgbm.score(X_test, y_test)

y_pred = model_lgbm.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.54      0.60      0.57      1051
        1930       0.54      0.50      0.52      2056
        1940       0.49      0.55      0.52      2731
        1950       0.55      0.44      0.49      3674
        1960       0.49      0.55      0.52      3645
        1970       0.44      0.43      0.44      3629
        1980       0.47      0.42      0.44      3636
        1990       0.47      0.47      0.47      3671
        2000       0.41      0.49      0.44      2524
        2010       0.67      0.56      0.61      3977
        2020       0.31      0.55      0.40       827

    accuracy                           0.49     31421
   macro avg       0.49      0.51      0.49     31421
weighted avg       0.50      0.49      0.50     31421



In [14]:
model_lgbm.score(X_test, y_test)

0.4941281308678909

In [15]:
model_xgb = xgb.XGBClassifier()

model_xgb.fit(X_resampled, y_resampled)

model_xgb.score(X_test, y_test)

y_pred = model_xgb.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        1920       0.57      0.61      0.59      1051
        1930       0.53      0.50      0.52      2056
        1940       0.51      0.58      0.54      2731
        1950       0.55      0.45      0.49      3674
        1960       0.50      0.55      0.52      3645
        1970       0.44      0.44      0.44      3629
        1980       0.47      0.41      0.44      3636
        1990       0.47      0.48      0.47      3671
        2000       0.42      0.47      0.44      2524
        2010       0.67      0.60      0.63      3977
        2020       0.33      0.50      0.40       827

    accuracy                           0.50     31421
   macro avg       0.50      0.51      0.50     31421
weighted avg       0.51      0.50      0.50     31421



### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(verbose=1)

rfc.fit(X_resampled, y_resampled)


rfc.score(X_test, y_test)

y_pred = rfc.predict(X_test)

print(classification_report(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   49.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

        1920       0.61      0.61      0.61      1051
        1930       0.58      0.51      0.54      2056
        1940       0.50      0.60      0.55      2731
        1950       0.56      0.48      0.52      3674
        1960       0.49      0.57      0.53      3645
        1970       0.44      0.44      0.44      3629
        1980       0.46      0.42      0.44      3636
        1990       0.47      0.46      0.47      3671
        2000       0.44      0.47      0.45      2524
        2010       0.67      0.61      0.64      3977
        2020       0.37      0.46      0.41       827

    accuracy                           0.51     31421
   macro avg       0.51      0.51      0.51     31421
weighted avg       0.51      0.51      0.51     31421



[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished


## ADASYN

In [18]:
X_resampled, y_resampled = ADASYN(sampling_strategy='minority').fit_resample(X_train, y_train)

In [19]:
X_resampled.shape, y_resampled.shape

((139191, 25), (139191,))

### KNN Classifier

In [20]:
#KNN classifier
classifier = KNeighborsClassifier(n_neighbors=27)
classifier.fit(X_resampled, y_resampled)
y_pred = classifier.predict(X_test)
# from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.50      0.30      0.38      1051
        1930       0.50      0.40      0.44      2056
        1940       0.41      0.52      0.46      2731
        1950       0.41      0.45      0.43      3674
        1960       0.41      0.47      0.44      3645
        1970       0.36      0.38      0.37      3629
        1980       0.39      0.35      0.37      3636
        1990       0.39      0.37      0.38      3671
        2000       0.40      0.22      0.28      2524
        2010       0.59      0.37      0.45      3977
        2020       0.18      0.64      0.29       827

    accuracy                           0.40     31421
   macro avg       0.41      0.41      0.39     31421
weighted avg       0.42      0.40      0.40     31421



### XGB and LGBM

In [21]:
model_lgbm = lgbm.LGBMClassifier()

model_lgbm.fit(X_resampled, y_resampled)

model_lgbm.score(X_test, y_test)

y_pred = model_lgbm.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.72      0.47      0.57      1051
        1930       0.63      0.44      0.52      2056
        1940       0.49      0.65      0.56      2731
        1950       0.54      0.49      0.51      3674
        1960       0.50      0.55      0.52      3645
        1970       0.45      0.45      0.45      3629
        1980       0.47      0.42      0.45      3636
        1990       0.45      0.52      0.48      3671
        2000       0.48      0.37      0.42      2524
        2010       0.64      0.61      0.62      3977
        2020       0.29      0.56      0.39       827

    accuracy                           0.50     31421
   macro avg       0.52      0.50      0.50     31421
weighted avg       0.52      0.50      0.50     31421



In [22]:
model_lgbm.score(X_test, y_test)

0.5044397059291557

In [23]:
model_xgb = xgb.XGBClassifier()

model_xgb.fit(X_resampled, y_resampled)

model_xgb.score(X_test, y_test)

y_pred = model_xgb.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        1920       0.72      0.50      0.59      1051
        1930       0.61      0.44      0.51      2056
        1940       0.50      0.65      0.56      2731
        1950       0.54      0.49      0.52      3674
        1960       0.50      0.55      0.52      3645
        1970       0.44      0.45      0.44      3629
        1980       0.47      0.42      0.44      3636
        1990       0.46      0.51      0.48      3671
        2000       0.47      0.37      0.41      2524
        2010       0.65      0.64      0.64      3977
        2020       0.31      0.53      0.39       827

    accuracy                           0.51     31421
   macro avg       0.51      0.50      0.50     31421
weighted avg       0.51      0.51      0.51     31421



### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(verbose=1)

rfc.fit(X_resampled, y_resampled)


rfc.score(X_test, y_test)

y_pred = rfc.predict(X_test)

print(classification_report(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   34.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

        1920       0.81      0.48      0.60      1051
        1930       0.64      0.44      0.52      2056
        1940       0.49      0.65      0.56      2731
        1950       0.53      0.51      0.52      3674
        1960       0.49      0.56      0.52      3645
        1970       0.44      0.45      0.44      3629
        1980       0.46      0.44      0.45      3636
        1990       0.46      0.50      0.48      3671
        2000       0.49      0.34      0.40      2524
        2010       0.64      0.64      0.64      3977
        2020       0.34      0.50      0.41       827

    accuracy                           0.51     31421
   macro avg       0.53      0.50      0.50     31421
weighted avg       0.52      0.51      0.51     31421



[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished
