In [123]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [237]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset and draw a random sample of 500 rows.
# random_state pins the sample so that "Restart Kernel -> Run All" selects the
# same rows (and therefore reproduces the same metrics) on every run.
df = pd.read_csv('dataset.csv').sample(n=500, random_state=42)

# Drop identifier/text columns (and 'mode') that are not used for modelling
df_cleaned = df.drop(columns=['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'mode'])

# Handle missing values: drop rows with missing values
df_cleaned = df_cleaned.dropna()

# Remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

# Convert 'danceability' to binary labels (0 for not danceable, 1 for danceable).
# A track counts as danceable when its score is strictly greater than 0.5.
# The vectorized comparison replaces a row-by-row apply(); missing values were
# already dropped above, so the resulting labels are the same.
df_cleaned['danceability_binary'] = (df_cleaned['danceability'] > 0.5).astype('int64')

# Select relevant columns.
# Wider candidate feature set, kept for reference (currently unused):
# features = ['duration_ms', 'popularity', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'explicit']
features = ['energy', 'loudness', 'acousticness', 'valence', 'time_signature']

In [238]:
# Feature matrix (an independent copy of the selected columns) and the
# continuous regression target.
X = df_cleaned.loc[:, features].copy()
y = df_cleaned['danceability']


# Standardize the feature columns.
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split, so rows that later land in a test set contribute to the scaling
# statistics.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


# Preview the first rows of the cleaned DataFrame
df_cleaned.head()


Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,danceability_binary
69423,30,84532,False,0.694,0.731,9,-5.604,0.0333,0.0565,1e-06,0.0688,0.867,95.044,4,malay,1
89462,1,205791,False,0.755,0.767,1,-4.446,0.0535,0.00459,8e-06,0.0618,0.321,93.959,4,reggaeton,1
60526,22,130000,True,0.458,0.619,4,-7.553,0.335,0.257,0.0,0.275,0.278,193.092,4,j-dance,0
14142,0,81035,False,0.751,0.379,2,-3.406,0.123,0.698,0.0,0.0464,0.801,109.521,4,children,1
61595,20,251146,False,0.608,0.984,7,-1.519,0.138,0.0402,0.0,0.334,0.733,97.488,4,j-idol,1


In [241]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 69423 to 23816
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   popularity           500 non-null    int64  
 1   duration_ms          500 non-null    int64  
 2   explicit             500 non-null    bool   
 3   danceability         500 non-null    float64
 4   energy               500 non-null    float64
 5   key                  500 non-null    int64  
 6   loudness             500 non-null    float64
 7   speechiness          500 non-null    float64
 8   acousticness         500 non-null    float64
 9   instrumentalness     500 non-null    float64
 10  liveness             500 non-null    float64
 11  valence              500 non-null    float64
 12  tempo                500 non-null    float64
 13  time_signature       500 non-null    int64  
 14  track_genre          500 non-null    object 
 15  danceability_binary  500 non-null    in

In [243]:
# Review the X variable DataFrame: first rows, dtypes/null counts,
# summary statistics and column labels.
print(X.head(10))
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
X.info()
print(X.describe())
print(X.columns)

       energy  loudness  acousticness  valence  time_signature
69423  0.7310    -5.604       0.05650    0.867               4
89462  0.7670    -4.446       0.00459    0.321               4
60526  0.6190    -7.553       0.25700    0.278               4
14142  0.3790    -3.406       0.69800    0.801               4
61595  0.9840    -1.519       0.04020    0.733               4
23495  0.7510    -6.867       0.01770    0.230               4
76003  0.2330   -10.632       0.88400    0.167               3
97143  0.5420    -6.378       0.55800    0.178               4
45681  0.0317   -23.894       0.91700    0.398               4
77085  0.7720    -6.530       0.65900    0.561               4
<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 69423 to 23816
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   energy          500 non-null    float64
 1   loudness        500 non-null    float64
 2   acousticness    5

In [245]:
# Split the data into training and testing datasets by using train_test_split

# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the *unscaled* features first.
# Assign a random_state of 1 to the function so the split is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=1)

# Fit the scaler on the training rows only and apply it to both subsets, so
# that no statistics from the test rows leak into preprocessing.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

Linear Regression

In [248]:
# Import the LinearRegression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model.
# LinearRegression has no random_state parameter, so there is nothing to seed
# here. n_jobs is left at its default: scikit-learn documents a speed-up from
# n_jobs only for multi-target problems (sparse X or positive=True), and this
# model is fitted on a single target column.
lin_reg = LinearRegression()

# Fit the model using training data
lin_reg.fit(X_train, y_train)

In [250]:
# Predict the target for the rows of the testing set with the fitted model
y_predict = lin_reg.predict(X=X_test)

In [252]:
# Wider candidate feature set, kept for reference only (not used by the models in this notebook):
# features = ['duration_ms', 'popularity', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'explicit']

In [254]:
# Score the linear regression predictions on the test split:
# R^2, mean squared error, its square root, plus the spread of the target.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_predict)
# NOTE(review): this is the standard deviation of the full target column `y`
# (train and test rows together), not of y_test only - confirm that this is
# the intended reference value for the RMSE.
std = np.std(y)

# Report the metrics, one per line.
for metric_line in (
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
):
    print(metric_line)


The r2 is 0.26609858646985385.
The mean squared error is 0.021220190991999847.
The root mean squared error is 0.14567151743563272.
The standard deviation is 0.16005636405354207.


Logistic Regression Model


In [257]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Split the *unscaled* features and the binary label into training and
# testing sets (80/20, fixed seed for a reproducible split).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, df_cleaned['danceability_binary'], test_size=0.2, random_state=42)

# Standardize with statistics learned from the training rows only, so the
# test rows stay unseen during preprocessing.
clf_scaler = StandardScaler().fit(X_train_raw)
X_train = clf_scaler.transform(X_train_raw)
X_test = clf_scaler.transform(X_test_raw)

# Create a logistic regression model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model: reuse the predictions computed above instead of letting
# score() run a second prediction pass over the same rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.66


Random Forest Model

In [260]:
# Import the RandomForestClassifier estimator from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Split the *unscaled* features and the binary label into training and
# testing sets (80/20, fixed seed for a reproducible split).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, df_cleaned['danceability_binary'], test_size=0.2, random_state=42)

# Standardize with statistics learned from the training rows only, so the
# test rows stay unseen during preprocessing.
rf_scaler = StandardScaler().fit(X_train_raw)
X_train = rf_scaler.transform(X_train_raw)
X_test = rf_scaler.transform(X_test_raw)

# Create a random forest classifier model (100 trees, seeded for reproducibility)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model: reuse the predictions computed above instead of letting
# score() run a second prediction pass over the same rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.73
