In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
# Load the dataset. It was cleaned beforehand (outside this notebook) because of
# the large volume of raw data collected.
df = pd.read_csv("datasets/cleaned_data.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.head())




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   genre               1200 non-null   object 
 1   tempo               1200 non-null   float64
 2   beats               1200 non-null   int64  
 3   chroma_stft         1200 non-null   float64
 4   rmse                1200 non-null   float64
 5   spectral_centroid   1200 non-null   float64
 6   spectral_bandwidth  1200 non-null   float64
 7   rolloff             1200 non-null   float64
 8   zero_crossing_rate  1200 non-null   float64
 9   mfcc1               1200 non-null   float64
 10  mfcc2               1200 non-null   float64
 11  mfcc3               1200 non-null   float64
 12  mfcc4               1200 non-null   float64
 13  mfcc5               1200 non-null   float64
 14  mfcc6               1200 non-null   float64
 15  mfcc7               1200 non-null   float64
 16  mfcc8 

Preprocessing Summary:
Missing Values: No missing values in the dataset.
Encoded genre: Added a new column genre_encoded for the categorical genre variable using Label Encoding.
Feature Scaling: Applied standard scaling to the numerical feature columns (the target column label and the genre_encoded column are excluded).
Train-Test Split:
Training Set: 960 rows, 28 features.
Testing Set: 240 rows, 28 features.
Next steps:

Proceed with feature selection if necessary.
Build and train a machine learning model using the preprocessed data.
The next code cell tunes and trains a Random Forest model on the preprocessed data.

In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split


# Step 1: Count missing values per column (shown in the cell output).
missing_values = df.isnull().sum()

# Step 2: Encode categorical variables
# Label-encode 'genre' into an integer column. Note that 'genre_encoded' is
# excluded from the feature matrix below, so it is kept for reference only.
if 'genre' in df.columns:
    label_encoder = LabelEncoder()
    df['genre_encoded'] = label_encoder.fit_transform(df['genre'])

# Step 3: Select the feature columns: every float64/int64 column except the
# target ('label') and the encoded genre.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.drop(['label', 'genre_encoded'], errors='ignore')

# Step 4: Train-Test Split
# Split BEFORE scaling so that no statistics from the test rows leak into the
# preprocessing that the model is trained on.
X = df[numerical_columns]
y = df['label']  # Using 'label' as the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Feature scaling (StandardScaler)
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. The results are wrapped back into DataFrames so that column
# names and row indices are preserved; `df` itself is left unmodified, which
# keeps this cell idempotent on re-run.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=numerical_columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=numerical_columns, index=X_test.index)

# Displaying results
missing_values, X_train.shape, X_test.shape

(genre                 0
 tempo                 0
 beats                 0
 chroma_stft           0
 rmse                  0
 spectral_centroid     0
 spectral_bandwidth    0
 rolloff               0
 zero_crossing_rate    0
 mfcc1                 0
 mfcc2                 0
 mfcc3                 0
 mfcc4                 0
 mfcc5                 0
 mfcc6                 0
 mfcc7                 0
 mfcc8                 0
 mfcc9                 0
 mfcc10                0
 mfcc11                0
 mfcc12                0
 mfcc13                0
 mfcc14                0
 mfcc15                0
 mfcc16                0
 mfcc17                0
 mfcc18                0
 mfcc19                0
 mfcc20                0
 label                 0
 dtype: int64,
 (960, 28),
 (240, 28))

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Base estimator whose hyperparameters are tuned below (seeded for repeatability).
# NOTE(review): a regressor is used on 'label'; if that column holds class ids
# rather than a continuous quantity, a classifier may be the better fit - confirm.
rf = RandomForestRegressor(random_state=42)

# Candidate values for each hyperparameter; 'auto' is intentionally not offered
# for max_features.
param_grid = dict(
    n_estimators=[50, 100, 200, 300, 500],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Randomized search: 50 sampled settings, each scored by 5-fold cross-validated
# negative MSE, run on all available processors. fit() returns the fitted
# search object, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning configuration; the score is negated to report it as a positive MSE.
best_params, best_score = random_search.best_params_, -random_search.best_score_

# Score the best estimator found by the search on the held-out test set.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse, test_r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

best_params, best_score, test_mse, test_r2


Fitting 5 folds for each of 50 candidates, totalling 250 fits


({'n_estimators': 100,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'max_features': 'log2',
  'max_depth': 20},
 4.636819209606159,
 4.021015190517326,
 0.5317595120212721)

Validation MSE: 4.6368

This is the mean squared error averaged over the 5 cross-validation folds for the best hyperparameter combination found by the search.
Test MSE: 4.0210

The mean squared error on the test dataset, showing how well the model generalizes to unseen data.
Test R² Score: 0.5318

The proportion of variance in the target variable explained by the model on the test set. A value of 0.5318 suggests the model explains ~53.18% of the variability in the target.

Next Steps
Performance Improvement:

Feature Engineering: Explore additional transformations or feature combinations.
Hyperparameter Refinement: Use a finer grid or Bayesian Optimization for further tuning.
Ensemble Methods: Combine predictions from multiple models to improve performance.
Model Analysis:

Evaluate feature importance to understand which features contribute most to the predictions.
Plot residuals to assess the fit quality.
Deployment and Use:

Save the trained model using joblib or pickle for later use.
Any of these steps can be taken up next, depending on the goal of the analysis.