In [28]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset into a DataFrame and select 500 rows at random
df = pd.read_csv('dataset.csv')

# Drop irrelevant columns
df_cleaned = df.drop(columns=['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'mode'])

# Handle missing values: drop rows with missing values
df_cleaned = df_cleaned.dropna()

# Remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

# Convert 'danceability' to binary labels (0 for not danceable, 1 for danceable)
df_cleaned['danceability_binary'] = df_cleaned['danceability'].apply(lambda x: 1 if x > 0.7 else 0)

# Select relevant columns
######features = ['duration_ms', 'popularity', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'explicit']
features = ['energy', 'loudness', 'acousticness', 'valence']

In [30]:
X = df_cleaned[features].copy()
y = df_cleaned['danceability']


# Normalize and standardize columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Display cleaned DataFrame info
df_cleaned.head()


Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,danceability_binary
0,73,230666,False,0.676,0.461,1,-6.746,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,0
1,55,149610,False,0.42,0.166,1,-17.235,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,0
2,57,210826,False,0.438,0.359,0,-9.734,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,0
3,71,201933,False,0.266,0.0596,0,-18.515,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,0
4,82,198853,False,0.618,0.443,2,-9.681,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,0


In [31]:
print(df_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
Index: 106907 entries, 0 to 113999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   popularity           106907 non-null  int64  
 1   duration_ms          106907 non-null  int64  
 2   explicit             106907 non-null  bool   
 3   danceability         106907 non-null  float64
 4   energy               106907 non-null  float64
 5   key                  106907 non-null  int64  
 6   loudness             106907 non-null  float64
 7   speechiness          106907 non-null  float64
 8   acousticness         106907 non-null  float64
 9   instrumentalness     106907 non-null  float64
 10  liveness             106907 non-null  float64
 11  valence              106907 non-null  float64
 12  tempo                106907 non-null  float64
 13  time_signature       106907 non-null  int64  
 14  track_genre          106907 non-null  object 
 15  danceability_binary  1

In [32]:
# Review the X variable DataFrame
print(X.head(10))
print(X.info())
print(X.describe())
print(X.columns)

   energy  loudness  acousticness  valence
0  0.4610    -6.746        0.0322   0.7150
1  0.1660   -17.235        0.9240   0.2670
2  0.3590    -9.734        0.2100   0.1200
3  0.0596   -18.515        0.9050   0.1430
4  0.4430    -9.681        0.4690   0.1670
5  0.4810    -8.807        0.2890   0.6660
6  0.1470    -8.822        0.8570   0.0765
7  0.4440    -9.331        0.5590   0.7120
8  0.4140    -8.700        0.2940   0.6690
9  0.6320    -6.770        0.4260   0.1960
<class 'pandas.core.frame.DataFrame'>
Index: 106907 entries, 0 to 113999
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   energy        106907 non-null  float64
 1   loudness      106907 non-null  float64
 2   acousticness  106907 non-null  float64
 3   valence       106907 non-null  float64
dtypes: float64(4)
memory usage: 4.1 MB
None
              energy       loudness   acousticness        valence
count  106907.000000  106907.000000  106907.000

In [33]:
# Split the data into training and testing datasets by using

# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X_scaled, 
                                                    y, 
                                                    random_state=1)

Linear Regression

In [34]:
# Import the LinearRegression module from SKLearn
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
# Assign a random_state parameter of 1 to the model (not necessary, but for reproducibility)
lin_reg = LinearRegression(n_jobs=-1)  # n_jobs=-1 uses all available CPU cores

# Fit the model using training data
lin_reg.fit(X_train, y_train)

In [35]:
# Make a prediction using the testing data
y_predict = lin_reg.predict(X_test)

In [36]:
###### features = ['duration_ms', 'popularity', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'explicit']

In [37]:
# Compute metrics for the linear regression model:  r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
r2 = r2_score(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")


The r2 is 0.2787623412340502.
The mean squared error is 0.021724783526632128.
The root mean squared error is 0.14739329539240287.
The standard deviation is 0.17372527276493077.


Logistic Regression Model


In [38]:
# Import the LogisticRegression module from SKLearn
# Create a logistic regression model
from sklearn.linear_model import LogisticRegression
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, df_cleaned['danceability_binary'], test_size=0.2, random_state=42)

# Create a logistic regression model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = log_reg.score(X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.777897296791694


Random Forest Model

In [39]:
# Import the RandomForestClassifier module from SKLearn
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, df_cleaned['danceability_binary'], test_size=0.2, random_state=42)

# Create a random forest classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = rf_model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8503881769712842


In [40]:
#Create y-Feature variable
y_variable = df_cleaned[['danceability_binary']].to_numpy().ravel()
y_variable[0:5]

array([0, 0, 0, 0, 0])

In [41]:
#Create X-Feature list "Variables", scale the data
X_variables = df_cleaned[['energy', 'loudness', 'valence', 'tempo']]

X_scaled = StandardScaler().fit_transform(X_variables)
#Create the X-Variable DataFrame
df_X_scaled = pd.DataFrame(
    X_scaled,
    columns=['energy', 'loudness', 'valence', 'tempo']
)
df_X_scaled.head()

Unnamed: 0,energy,loudness,valence,tempo
0,-0.720546,0.307522,0.946848,-1.151637
1,-1.889271,-1.765014,-0.780102,-1.500056
2,-1.124648,-0.282881,-1.346757,-1.538714
3,-2.310805,-2.017931,-1.258097,1.983166
4,-0.791858,-0.272409,-1.165582,-0.081387


In [42]:
#Create dummy variables for the 'track_genre"
df_dummies = pd.get_dummies(df_cleaned, columns=['track_genre'])
df_dropped = df_dummies.drop(columns=[
    'duration_ms', 'popularity', 'energy', 'loudness', 'speechiness', 
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
    'explicit', 'danceability', 'key', 'time_signature', 'danceability_binary'
])
df_int = df_dropped.astype(int)
#review DataFrame
df_int.head()

Unnamed: 0,track_genre_acoustic,track_genre_afrobeat,track_genre_alt-rock,track_genre_alternative,track_genre_ambient,track_genre_anime,track_genre_black-metal,track_genre_bluegrass,track_genre_blues,track_genre_brazil,...,track_genre_spanish,track_genre_study,track_genre_swedish,track_genre_synth-pop,track_genre_tango,track_genre_techno,track_genre_trance,track_genre_trip-hop,track_genre_turkish,track_genre_world-music
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
#Split the data into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_variable, random_state=42)


In [44]:
#Create a LINEAR REGRESSION MODEL
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

In [45]:
#fit the model
model.fit(X_train, y_train)

In [46]:
#Make Predictions
predictions = model.predict(X_test)

In [52]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")


The r2 is 0.09805419000476279.
The mean squared error is 0.1626320632893909.
The root mean squared error is 0.4032766584980972.
The standard deviation is 0.17372527276493077.


In [53]:
#Create a LOGISTITIC REGRESSION MODEL
from sklearn.linear_model import LogisticRegression
classifier_2 = LogisticRegression(random_state=42)
classifier_2

In [54]:
#fit the model
classifier_2.fit(X_train, y_train)

In [55]:
#make predictions
predictions_2 = classifier_2.predict(X_test)

In [56]:
#MODEL EVALUATION
# Calculating the confusion matrix
cm_2 = confusion_matrix(y_test, predictions_2)
cm_df_2 = pd.DataFrame(
    cm_2, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score_2 = accuracy_score(y_test, predictions_2)

# Displaying results
print("Confusion Matrix")
display(cm_df_2)
print(f"Accuracy Score : {acc_score_2}")
print("Classification Report")
print(classification_report(y_test, predictions_2))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,19870,549
Actual 1,5603,705


Accuracy Score : 0.7698207804841546
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.97      0.87     20419
           1       0.56      0.11      0.19      6308

    accuracy                           0.77     26727
   macro avg       0.67      0.54      0.53     26727
weighted avg       0.73      0.77      0.71     26727



In [57]:
#Create a RANDOM FORREST MODEL
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model

In [58]:
#Fit Model
rf_model = rf_model.fit(X_train, y_train)

In [59]:
#Make predictions
predictions_3 = rf_model.predict(X_test)

In [60]:
#MODEL EVALUATION
# Calculating the confusion matrix
cm_3 = confusion_matrix(y_test, predictions_3)
cm_df_3 = pd.DataFrame(
    cm_3, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score_3 = accuracy_score(y_test, predictions_3)

# Displaying results
print("Confusion Matrix")
display(cm_df_3)
print(f"Accuracy Score : {acc_score_3}")
print("Classification Report")
print(classification_report(y_test, predictions_3))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,19259,1160
Actual 1,2783,3525


Accuracy Score : 0.8524712837205822
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.94      0.91     20419
           1       0.75      0.56      0.64      6308

    accuracy                           0.85     26727
   macro avg       0.81      0.75      0.77     26727
weighted avg       0.85      0.85      0.84     26727



Index(['popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'track_genre',
       'danceability_binary'],
      dtype='object')