In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import category_encoders as ce

# Global matplotlib setting: render every figure at 300 DPI.
plt.rcParams.update({"figure.dpi": 300})

In [12]:
import gzip

# Open the gzipped training CSV as a binary stream and let pandas parse the
# decompressed bytes into a DataFrame.
train_archive_path = 'data/train.csv.gz'
with gzip.open(train_archive_path, mode='rb') as fio:
    df = pd.read_csv(fio)

PermissionError: [Errno 1] Operation not permitted

In [11]:
import ast


def _parse_list_cell(cell):
    """Turn one stringified list cell into a real Python list.

    Assumes string cells hold the repr of a Python list such as
    "['a', 'b']" -- TODO confirm against the CSV. Parsing with
    ``ast.literal_eval`` keeps commas and apostrophes that are part of an
    item and does not leave stray leading spaces on later items, which a
    plain ``replace("'", "")`` + ``split(",")`` would.

    Parameters
    ----------
    cell : object
        Raw cell value from the CSV column.

    Returns
    -------
    object
        A list for string cells; any non-string cell (e.g. NaN for a
        missing value) is returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to a comma split of the text
        # between the outer brackets, trimming whitespace and quote marks.
        return [part.strip().strip("'\"") for part in cell[1:-1].split(",")]
    return list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]


def _load_tracks(path):
    """Read one gzipped CSV split and parse its list-valued columns.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed CSV file.

    Returns
    -------
    pandas.DataFrame
        The file's contents without its first column, with "artists" and
        "artist_ids" converted from strings to lists.
    """
    tracks = pd.read_csv(path, compression="gzip")
    # Drop the first column (presumably a saved index -- verify) and copy so
    # the column assignments below write to an independent frame instead of
    # a slice of the raw one (avoids SettingWithCopyWarning).
    tracks = tracks.iloc[:, 1:].copy()
    for list_col in ("artists", "artist_ids"):
        tracks[list_col] = tracks[list_col].map(_parse_list_cell)
    return tracks


train = _load_tracks("data/train.csv.gz")
test = _load_tracks("data/test.csv.gz")

PermissionError: [Errno 1] Operation not permitted

In [None]:
# Pull the prediction target ("year") out of each split.
y_train, y_test = train["year"], test["year"]

# The feature frames are everything except the target and the "decade" column.
X_train = train.drop(columns=["year", "decade"])
X_test = test.drop(columns=["year", "decade"])

In [None]:
# Numeric / boolean audio features kept for modelling.
continuous_cols = [
    "explicit",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "time_signature",
    "num_artists",
]

# Model inputs: the columns above plus the artist column that is encoded in a
# later cell.
feature_cols = continuous_cols + ["primary_artist"]

# Take explicit copies: later cells assign into these frames column by column,
# and assigning into a column subset of another frame raises pandas'
# SettingWithCopyWarning.
X_train = X_train[feature_cols].copy()
X_test = X_test[feature_cols].copy()

In [None]:
# Fit a target encoder for the artist column on the training features/target.
# NOTE(review): the encoder is fit on all of X_train before the dev/validation
# split performed further down, so validation rows contribute to the encoding
# (target leakage) -- consider splitting first.
te = ce.TargetEncoder()
te.fit(X_train["primary_artist"], y_train)

# Apply the same two conversions to the training frame first, then the test
# frame: replace the artist with its encoded value and cast "explicit" to int.
for _frame in (X_train, X_test):
    _frame["primary_artist"] = te.transform(_frame["primary_artist"])
    _frame["explicit"] = _frame["explicit"].astype(int)

In [None]:
# Hold out 20% of the training rows as a validation split; the fixed seed
# makes the partition repeatable.
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the gradient-boosting model on the development split. The seed is
# fixed (same value as the train/validation split) so repeated runs of the
# notebook give the same model.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_dev, y_dev)

# Predict on the held-out validation split (not the test set).
gb_preds = gb_model.predict(X_val)

# Fraction of validation rows whose predicted label matches the true label.
gb_accuracy = accuracy_score(y_val, gb_preds)
print("Gradient Boosting accuracy: {:.2f}%".format(gb_accuracy * 100))

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true validation labels (rows) against the gradient-boosting
# predictions (columns).
gb_cm = confusion_matrix(y_val, gb_preds)

# Show the heading and the matrix on separate lines.
print("Confusion Matrix:", gb_cm, sep="\n")

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Class-membership probabilities for the validation split; predict_proba
# returns one column per entry of gb_model.classes_.
gb_class_probs = gb_model.predict_proba(X_val)

if gb_class_probs.shape[1] == 2:
    # Binary target: score each row with the probability of the second class
    # and tell roc_curve explicitly which label is the positive one.
    gb_probs = gb_class_probs[:, 1]
    fpr, tpr, thresholds = roc_curve(y_val, gb_probs, pos_label=gb_model.classes_[1])
else:
    # roc_curve only accepts binary labels, so for a multiclass target build a
    # one-vs-rest indicator matrix and flatten it together with the
    # probability matrix. The resulting curve is the micro-average over all
    # (sample, class) pairs.
    y_val_onehot = label_binarize(y_val, classes=gb_model.classes_)
    gb_probs = gb_class_probs.ravel()
    fpr, tpr, thresholds = roc_curve(y_val_onehot.ravel(), gb_probs)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal.
plt.plot(fpr, tpr, color='darkorange', label='GB (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Train the random-forest model on the development split. Forest construction
# is randomised, so the seed is fixed (same value as the train/validation
# split) to make the reported numbers repeatable.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_dev, y_dev)

# Predict on the held-out validation split (not the test set).
rf_preds = rf_model.predict(X_val)

# Fraction of validation rows whose predicted label matches the true label.
rf_accuracy = accuracy_score(y_val, rf_preds)
print("Random Forest accuracy: {:.2f}%".format(rf_accuracy * 100))

In [None]:
# Cross-tabulate true validation labels (rows) against the random-forest
# predictions (columns).
rf_cm = confusion_matrix(y_val, rf_preds)

# Show the heading and the matrix on separate lines.
print("Confusion Matrix:", rf_cm, sep="\n")

In [None]:
from sklearn.preprocessing import label_binarize

# Class-membership probabilities for the validation split; predict_proba
# returns one column per entry of rf_model.classes_.
rf_class_probs = rf_model.predict_proba(X_val)

if rf_class_probs.shape[1] == 2:
    # Binary target: score each row with the probability of the second class
    # and tell roc_curve explicitly which label is the positive one.
    y_pred_proba = rf_class_probs[:, 1]
    fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba, pos_label=rf_model.classes_[1])
else:
    # roc_curve only accepts binary labels, so for a multiclass target build a
    # one-vs-rest indicator matrix and flatten it together with the
    # probability matrix. The resulting curve is the micro-average over all
    # (sample, class) pairs.
    y_val_onehot = label_binarize(y_val, classes=rf_model.classes_)
    y_pred_proba = rf_class_probs.ravel()
    fpr, tpr, thresholds = roc_curve(y_val_onehot.ravel(), y_pred_proba)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal.
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (AUC = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Forest-level feature importances, plus the standard deviation of each
# feature's importance across the individual trees (used as error bars).
importances = rf_model.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in rf_model.estimators_]
)
std = per_tree_importances.std(axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = X_dev.shape[1]
bar_positions = range(n_features)

# Bar chart of the importances in descending order, labelled by column name.
fig, ax = plt.subplots()
ax.set_title("Feature Importances")
ax.bar(bar_positions, importances[indices], yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(X_dev.columns[indices], rotation=90)
ax.set_xlim([-1, n_features])
plt.show()