In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import category_encoders as ce
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn diagnostics (e.g. chained
# assignment or convergence warnings) -- consider narrowing the filter.
warnings.simplefilter("ignore")

# Render figures at high resolution.
plt.rcParams.update({"figure.dpi": 300})

In [38]:
def _parse_list_column(column):
    """Turn a column of stringified lists into real Python lists of strings.

    The first and last characters of each cell are dropped, single quotes are
    removed, and the remainder is split on commas. Whitespace around each
    comma is consumed by the split so that elements do not keep the leading
    space that a plain ``split(",")`` would leave behind.

    Assumes each cell looks like a Python list repr such as "['A', 'B']"
    -- TODO confirm against the raw CSV.

    Parameters
    ----------
    column : pd.Series
        String column to parse.

    Returns
    -------
    pd.Series
        Series whose non-null cells are lists of strings.
    """
    return (
        column.str[1:-1]
        .str.replace("'", "", regex=False)
        # A multi-character pattern is interpreted as a regular expression.
        .str.split(r"\s*,\s*")
    )


def _load_split(path):
    """Read one gzip-compressed CSV split and parse its list-valued columns.

    The first column of the file is discarded (presumably a saved index --
    verify against the CSV), then ``artists`` and ``artist_ids`` are parsed
    into lists.
    """
    frame = pd.read_csv(path, compression="gzip").iloc[:, 1:]
    for list_col in ("artists", "artist_ids"):
        frame[list_col] = _parse_list_column(frame[list_col])
    return frame


train = _load_split("data/train.csv.gz")
test = _load_split("data/test.csv.gz")

In [39]:
# Targets: the "decade" column shifted down by one
# (presumably to make the class labels zero-based -- verify against the data).
y_train, y_test = (frame["decade"] - 1 for frame in (train, test))

# Features: everything except the two target-derived columns.
target_cols = ["year", "decade"]
X_train = train.drop(columns=target_cols)
X_test = test.drop(columns=target_cols)

In [40]:
# Numeric feature columns fed to the models.
# NOTE(review): despite the name, several of these (explicit, key, mode,
# time_signature) look discrete rather than continuous.
continuous_cols = [
    "explicit",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "time_signature",
    "num_artists",
]
feature_cols = continuous_cols + ["primary_artist"]

# Take explicit copies: later cells assign columns on these frames, and
# assigning into a bare column subset is what triggers pandas'
# SettingWithCopyWarning.
X_train = X_train[feature_cols].copy()
X_test = X_test[feature_cols].copy()

In [41]:
# Target-encode the primary artist against the release year, fitting on the
# training rows only and applying the fitted mapping to both splits.
# NOTE(review): the encoder is fitted on all of X_train before the dev/val
# split in a later cell, so validation rows contribute to the encoding, and
# "year" is the column the decade label is dropped alongside. Validation
# scores may therefore be optimistic -- consider fitting on X_dev only.
te = ce.TargetEncoder().fit(X_train["primary_artist"], train["year"])

for features in (X_train, X_test):
    features["primary_artist"] = te.transform(features["primary_artist"])
    # Cast the explicit flag to an integer column.
    features["explicit"] = features["explicit"].astype(int)

In [42]:
# Hold out 20% of the training rows as a validation set; the fixed seed keeps
# the split reproducible across runs.
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=88,
)

In [43]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep the neighbour count and report test accuracy for each setting.
# NOTE(review): k is being compared on the test set even though X_dev/X_val
# exist, and the features are not scaled before the distance computation --
# both are worth revisiting.
for k in (3, 6, 9, 12):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_test, y_test)
    print("The accuracy for knn when k =", k,"is:", accuracy)
  

The accuracy for knn when k = 3 is: 0.4081936202741572
The accuracy for knn when k = 6 is: 0.4338213573112902
The accuracy for knn when k = 9 is: 0.4396465497136638
The accuracy for knn when k = 12 is: 0.4396310020471094


In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Fit Gaussian naive Bayes on the full training set and predict the test set.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_predictions = gnb.predict(X_test)

# Mean accuracy on the test split.
accuracy = gnb.score(X_test, y_test)
print ("accuracy for the naive bayes classifier is:", accuracy)

# Confusion matrix of true vs. predicted labels; it is computed here but not
# displayed in this cell.
cm = confusion_matrix(y_true=y_test, y_pred=gnb_predictions)

accuracy for the naive bayes classifier is: 0.6204348164079708


In [48]:
from sklearn.svm import LinearSVC

# Linear SVM fitted on the dev split; dual=False asks the solver for the
# primal formulation.
# NOTE(review): features are passed in unscaled -- linear SVMs are usually
# sensitive to feature scale, which may explain the low scores.
primal_svm = LinearSVC(dual=False)
primal_svm.fit(X_dev, y_dev)

# accuracy_score expects (y_true, y_pred). Accuracy happens to be symmetric,
# so the numbers are unchanged, but the documented order keeps this safe to
# adapt to asymmetric metrics.
acc_train = accuracy_score(y_dev, primal_svm.predict(X_dev))
acc_val = accuracy_score(y_val, primal_svm.predict(X_val))
acc_test = accuracy_score(y_test, primal_svm.predict(X_test))

print("Accuracy score for train dataset:", acc_train)
print("Accuracy score for validation dataset:", acc_val)
print("Accuracy score for test dataset:", acc_test)

Accuracy score for train dataset: 0.40869117375545383
Accuracy score for validation dataset: 0.4092599310720116
Accuracy score for test dataset: 0.4111424943639709
