In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import display
import joblib

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Hyperparameter search for the random-forest genre classifier.
tracks = pd.read_csv("cleaned_data_f.csv")

features = tracks[['danceability', 'energy', 'key', 'loudness', 'speechiness',
                    'acousticness', 'instrumentalness', 'valence', 'tempo']]
# Tune against the `genre` column, which is the target every model cell in
# this notebook trains on; the previous `track_genre` target did not match.
target = tracks['genre']

# Hold out the same seeded 80/20 split that the model cells evaluate on, so the
# rows used for the final accuracy report never influence the parameter choice.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
# Seed the forest so repeated searches rank the candidates identically.
model = RandomForestClassifier(random_state=42)
# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the training portion only
grid_search.fit(X_train, y_train)

# Report the best parameter combination found
best_params = grid_search.best_params_
print(best_params)

{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}


In [49]:
# Train and evaluate a random-forest genre classifier on the audio features.
tracks = pd.read_csv("cleaned_data_f.csv")

features = tracks[['danceability', 'energy', 'key', 'loudness', 'speechiness',
                    'acousticness', 'instrumentalness', 'valence', 'tempo']]
target = tracks['genre']

# Seeded 80/20 split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the Random Forest Classifier.
# random_state pins the bootstrap/feature sampling so the reported metrics are
# reproducible; without it every run yields slightly different numbers.
model = RandomForestClassifier(n_estimators=300, min_samples_split=5, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out split
predictions = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, predictions))
# zero_division=0 reports 0 instead of warning for classes with no predictions.
print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))

Accuracy: 0.47935103244837757
Classification Report:
               precision    recall  f1-score   support

    alt-rock       0.38      0.34      0.36       365
   classical       0.82      0.81      0.82       155
     country       0.34      0.11      0.17       108
         edm       0.65      0.73      0.69       637
     hip-hop       0.39      0.24      0.30       134
        jazz       0.50      0.16      0.24        44
       latin       0.48      0.32      0.38       103
         pop       0.31      0.35      0.33       400
   punk-rock       0.40      0.47      0.43       474
        rock       0.45      0.47      0.46       292

    accuracy                           0.48      2712
   macro avg       0.47      0.40      0.42      2712
weighted avg       0.47      0.48      0.47      2712



In [48]:
# Using XGBoost
import xgboost as xgb


# Load the data
tracks = pd.read_csv("cleaned_data_f.csv")

# Define features and target
features = tracks[['danceability', 'energy', 'key', 'loudness', 'speechiness',
                    'acousticness', 'instrumentalness', 'valence', 'tempo']]
target = tracks['genre']

# Encode the target labels to numerical values (XGBoost needs integer classes)
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Split the data into training and testing sets (seeded 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using the XGBoost Classifier.
# `use_label_encoder` is intentionally not passed: the labels are already
# integer-encoded above, and the argument is deprecated/removed in current
# XGBoost releases. random_state makes the reported metrics reproducible.
model = xgb.XGBClassifier(n_estimators=300, eval_metric='mlogloss', random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)

# Decode the predictions back to original labels so the report shows genre names
predictions_decoded = label_encoder.inverse_transform(predictions)
y_test_decoded = label_encoder.inverse_transform(y_test)

print("Accuracy:", accuracy_score(y_test_decoded, predictions_decoded))
print("Classification Report:\n", classification_report(y_test_decoded, predictions_decoded, zero_division=0))


Accuracy: 0.4476401179941003
Classification Report:
               precision    recall  f1-score   support

    alt-rock       0.35      0.34      0.35       365
   classical       0.84      0.80      0.82       155
     country       0.30      0.15      0.20       108
         edm       0.65      0.69      0.67       637
     hip-hop       0.28      0.19      0.22       134
        jazz       0.45      0.23      0.30        44
       latin       0.42      0.28      0.34       103
         pop       0.28      0.32      0.30       400
   punk-rock       0.36      0.42      0.39       474
        rock       0.40      0.41      0.41       292

    accuracy                           0.45      2712
   macro avg       0.43      0.38      0.40      2712
weighted avg       0.44      0.45      0.44      2712



In [3]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate a multinomial logistic-regression genre classifier.
# Load the data
tracks = pd.read_csv("cleaned_data_f.csv")

# Continuous audio descriptors are used as plain numeric columns; only `key`
# is one-hot encoded.
# NOTE(review): assumes `key` holds discrete pitch-class codes - confirm against the dataset.
numeric_cols = ['danceability', 'energy', 'loudness', 'speechiness',
                'acousticness', 'instrumentalness', 'valence', 'tempo']
categorical_cols = ['key']

features = tracks[numeric_cols + categorical_cols]
target = tracks['genre']

# Create an instance of the encoder and expand `key` into indicator columns.
# The encoder receives a 2-D column selection (not a column name), and its
# sparse result is densified so it can be stacked with the numeric block.
encoder = OneHotEncoder()
key_encoded = encoder.fit_transform(features[categorical_cols]).toarray()

# Concatenate the numeric features and the encoded `key` columns horizontally
X_encoded = np.hstack([features[numeric_cols].to_numpy(dtype=float), key_encoded])

# Show only the shape; dumping the full matrix adds noise without insight.
print("X_encoded shape:", X_encoded.shape)

# Encode the target labels to numerical values
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Split the encoded dataset into train and test sets [80:20]
X_train, X_test, y_train, y_test = train_test_split(X_encoded, target_encoded, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression model; the iteration cap is raised above the default
# so the solver has room to converge on this multi-class problem.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = model.predict(X_test_scaled)

# Decode back to genre names so the report is readable and matches the
# label-decoded report printed for the XGBoost model.
predictions_decoded = label_encoder.inverse_transform(predictions)
y_test_decoded = label_encoder.inverse_transform(y_test)

print("Accuracy:", accuracy_score(y_test_decoded, predictions_decoded))
print("Classification Report:\n", classification_report(y_test_decoded, predictions_decoded, zero_division=0))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 5614: character maps to <undefined>