In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import display
import joblib

In [None]:
# Hyperparameter search for the Random Forest genre classifier.
tracks = pd.read_csv("cleaned_data_f.csv")

# Columns used as model inputs; `genre` is the class label to predict.
features = tracks[['danceability', 'energy', 'key', 'loudness', 'speechiness',
                    'acousticness', 'instrumentalness', 'valence', 'tempo']]
target = tracks['genre']

# Tune on the training portion only. The split uses the same test_size and
# random_state as the evaluation split further down, so the rows held out
# there never influence which hyperparameters get selected.
X_search, _, y_search, _ = train_test_split(features, target, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
# Fixed seed so repeated runs of the search select the same parameters.
model = RandomForestClassifier(random_state=42)
# Create a GridSearchCV object: 5-fold cross-validation, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training portion of the data
grid_search.fit(X_search, y_search)

# Get the best parameters
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Train the Random Forest genre classifier and evaluate it on a held-out split.

# Load the data
tracks = pd.read_csv("cleaned_data_f.csv")

# Build inputs and labels from the frame loaded above, so this cell runs on
# its own instead of relying on variables left behind by an earlier cell.
features = tracks[['danceability', 'energy', 'key', 'loudness', 'speechiness',
                    'acousticness', 'instrumentalness', 'valence', 'tempo']]
target = tracks['genre']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows so no test information reaches the training step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the Random Forest Classifier.
# random_state fixes the forest's internal randomness so the metrics printed
# below are repeatable from run to run.
model = RandomForestClassifier(n_estimators=300, min_samples_split=5, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, predictions))
# zero_division=0 reports 0 (instead of warning) for classes with no predicted samples.
print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))

In [None]:
# Confusion matrix
import numpy
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Single source of truth for the class order: the same list drives the matrix
# rows/columns and both sets of tick labels, so they cannot drift apart.
# NOTE(review): samples whose genre is not in this list are left out of the
# matrix -- confirm the list covers every genre in the data.
genre_labels = ['alt-rock', 'classical', 'country', 'pop', 'edm', 'punk-rock', 'rock', 'hip-hop', 'jazz']

#predicted genres
predicted = predictions
#actual genres in test set
actual = y_test
#creating confusion matrix (rows = actual genre, columns = predicted genre)
conf_matrix = metrics.confusion_matrix(actual, predicted, labels=genre_labels)

#visualizing confusion matrix
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='PiYG',
            xticklabels=genre_labels, yticklabels=genre_labels, ax=ax)
# The classes are genres, so the axes are titled accordingly.
ax.set_xlabel('Predicted Genre', fontsize=12)
ax.set_ylabel('Actual Genre', fontsize=12)
plt.show()


In [None]:
# Using XGBoost
# NOTE: this cell rebinds X_train, X_test, y_train, y_test, scaler, model and
# predictions, which the Random Forest cell also assigns. After it runs, those
# names refer to the XGBoost split/model and y_test/predictions hold integer
# class codes rather than genre strings.
import xgboost as xgb

# Encode the target labels to numerical values
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# Standardize the features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the XGBoost Classifier.
# Labels are already integer-encoded above, so the deprecated
# `use_label_encoder` option is not passed; random_state makes the fit repeatable.
model = xgb.XGBClassifier(n_estimators=300, eval_metric='mlogloss', random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)

# Decode the predictions back to original labels
predictions_decoded = label_encoder.inverse_transform(predictions)
y_test_decoded = label_encoder.inverse_transform(y_test)

print("Accuracy:", accuracy_score(y_test_decoded, predictions_decoded))
# zero_division=0 reports 0 (instead of warning) for classes with no predicted samples.
print("Classification Report:\n", classification_report(y_test_decoded, predictions_decoded, zero_division=0))


The false positive (FP), false negative (FN), true positive (TP) and true negative (TN) counts for each genre can be read from the confusion matrix (rows = actual genre, columns = predicted genre) as follows:
TP = the diagonal cell where both the actual and the predicted genre are that genre
FN = the sum of the values across that genre's row, excluding the TP cell
FP = the sum of the values down that genre's column, excluding the TP cell
TN = the sum of all remaining cells, i.e. every cell that lies in neither that genre's row nor its column