In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import classification_report
import lightgbm as lgbm
import xgboost as xgb

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from imblearn.over_sampling import SMOTE, ADASYN

In [39]:
from sklearn.model_selection import train_test_split

df_data = pd.read_csv('recleaned_data_stdscle1.csv')


In [40]:
df_data = df_data.loc[df_data.decade !=2020]

In [41]:
labels = df_data.decade

df_data = df_data.drop(['decade','year'], axis=1)
df_data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1.290829,0.34309,-0.586049,-0.946776,-0.276314,-0.578414,5,0.929594,-0.152122,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.277648,-1.888467,-0.431375,-0.909987,-0.276314,1.401622,8,0.123664,-0.645386,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.298737,-0.83563,-0.620337,-1.299951,-0.276314,-0.117816,2,-0.630575,-0.608907,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.293465,-1.110283,-0.554578,-1.049785,-0.276314,-0.566669,0,-0.304312,-0.388445,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.027206,-0.732635,-0.434248,-0.681894,3.619065,-0.579976,0,-0.671149,-0.372584,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df_data = df_data[['acousticness', 'energy', 'loudness', 'speechiness',
                   'tempo','danceability','explicit']]

In [43]:
df_data.columns

Index(['acousticness', 'energy', 'loudness', 'speechiness', 'tempo',
       'danceability', 'explicit'],
      dtype='object')

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df_data, labels, test_size=0.20)

In [45]:
X_train.shape, y_train.shape, y_test.shape

((122267, 7), (122267,), (30567,))

## KNN Classifier

In [46]:
#KNN classifier
classifier = KNeighborsClassifier(n_neighbors=27)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.50      0.34      0.40       999
        1930       0.46      0.39      0.42      2066
        1940       0.35      0.44      0.39      2862
        1950       0.29      0.39      0.33      3664
        1960       0.27      0.32      0.30      3636
        1970       0.28      0.29      0.28      3616
        1980       0.35      0.32      0.34      3623
        1990       0.33      0.27      0.30      3710
        2000       0.35      0.19      0.25      2445
        2010       0.50      0.49      0.49      3946

    accuracy                           0.35     30567
   macro avg       0.37      0.34      0.35     30567
weighted avg       0.35      0.35      0.35     30567



## XGB and LGBM

In [47]:
model_lgbm = lgbm.LGBMClassifier()

model_lgbm.fit(X_train, y_train)

model_lgbm.score(X_test, y_test)

y_pred = model_lgbm.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        1920       0.66      0.42      0.52       999
        1930       0.56      0.39      0.46      2066
        1940       0.42      0.46      0.44      2862
        1950       0.32      0.49      0.39      3664
        1960       0.30      0.37      0.33      3636
        1970       0.31      0.32      0.31      3616
        1980       0.39      0.36      0.37      3623
        1990       0.36      0.27      0.31      3710
        2000       0.36      0.20      0.26      2445
        2010       0.51      0.54      0.53      3946

    accuracy                           0.39     30567
   macro avg       0.42      0.38      0.39     30567
weighted avg       0.40      0.39      0.38     30567



In [48]:
model_lgbm.score(X_test, y_test)

0.3851539241665849

In [49]:
model_xgb = xgb.XGBClassifier()

model_xgb.fit(X_train, y_train)

model_xgb.score(X_test, y_test)

y_pred = model_xgb.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        1920       0.67      0.42      0.52       999
        1930       0.56      0.39      0.46      2066
        1940       0.43      0.46      0.44      2862
        1950       0.32      0.47      0.38      3664
        1960       0.30      0.36      0.33      3636
        1970       0.31      0.33      0.32      3616
        1980       0.39      0.36      0.37      3623
        1990       0.37      0.28      0.32      3710
        2000       0.35      0.21      0.27      2445
        2010       0.53      0.56      0.54      3946

    accuracy                           0.39     30567
   macro avg       0.42      0.39      0.40     30567
weighted avg       0.40      0.39      0.39     30567



## Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(verbose=1)

rfc.fit(X_train, y_train)


rfc.score(X_test, y_test)

y_pred = rfc.predict(X_test)

print(classification_report(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   24.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

        1920       0.65      0.44      0.53       999
        1930       0.54      0.39      0.46      2066
        1940       0.41      0.46      0.43      2862
        1950       0.32      0.44      0.37      3664
        1960       0.30      0.35      0.32      3636
        1970       0.29      0.30      0.30      3616
        1980       0.36      0.34      0.35      3623
        1990       0.34      0.28      0.31      3710
        2000       0.34      0.21      0.26      2445
        2010       0.50      0.52      0.51      3946

    accuracy                           0.37     30567
   macro avg       0.41      0.37      0.38     30567
weighted avg       0.38      0.37      0.37     30567



[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.0s finished
