In [20]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


## Data Loading and Visualization

In [8]:
# Load the song-attributes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is replaced by a path relative to the
# project (location of the notebook relative to Resources/ to be confirmed).
file_path = Path(
    "C:/Users/Usuario/Documents/Projects py2/Project04/Resources/songAttributes_1999-2019.csv"
)
df_attributes = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_attributes.head(5)

Unnamed: 0.1,Unnamed: 0,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365
1,1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408
2,2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37
3,3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183
4,4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666


In [13]:
# Scatter plot of song duration against popularity.
# A title is set so the figure is identifiable on its own when the notebook
# is skimmed (axis labels default to the column names).
attributes_plot = df_attributes.hvplot.scatter(
    x="Duration",
    y="Popularity",
    title="Popularity vs. Duration",
)
attributes_plot

In [14]:
# Scatter plot of loudness against popularity.
# A title is set so the figure is identifiable on its own when the notebook
# is skimmed (axis labels default to the column names).
attributes_plot = df_attributes.hvplot.scatter(
    x="Loudness",
    y="Popularity",
    title="Popularity vs. Loudness",
)
attributes_plot

In [15]:
# Scatter plot of acousticness against popularity.
# A title is set so the figure is identifiable on its own when the notebook
# is skimmed (axis labels default to the column names).
attributes_plot = df_attributes.hvplot.scatter(
    x="Acousticness",
    y="Popularity",
    title="Popularity vs. Acousticness",
)
attributes_plot

In [17]:
# Scatter plot of speechiness against popularity.
# A title is set so the figure is identifiable on its own when the notebook
# is skimmed (axis labels default to the column names).
attributes_plot = df_attributes.hvplot.scatter(
    x="Speechiness",
    y="Popularity",
    title="Popularity vs. Speechiness",
)
attributes_plot

## Building the Multiple Regression Model

In [21]:
# Select the predictor columns and the target, then hold out 20% of the rows
# for testing. The fixed random_state keeps the split reproducible.
feature_columns = [
    'Acousticness',
    'Danceability',
    'Duration',
    'Energy',
    'Liveness',
    'Loudness',
    'Speechiness',
    'Tempo',
    'Valence',
]
X = df_attributes.loc[:, feature_columns]
y = df_attributes.loc[:, 'Popularity']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:

# Instantiate an ordinary least-squares regressor with scikit-learn's defaults
model = LinearRegression()

In [23]:
# Train the regressor on the training split only; the test split stays unseen
model.fit(X=X_train, y=y_train)

In [24]:
# Show the fitted coefficients: one value per feature, in the same order as
# the columns the model was trained on
print("Slope", model.coef_, sep=" ")

Slope [-2.50028002e+00 -5.81439828e-01  1.35931062e-06 -2.47300785e-01
 -3.32578773e+00  4.87662486e-01 -1.05781839e+01  7.10870098e-03
 -3.07586696e+00]


In [25]:
# Show the fitted intercept (the constant term of the regression)
print("y-intercept:", model.intercept_, sep=" ")


y-intercept: 27.825782181326637


In [29]:
# Display the fitted multiple-regression formula.
# The model has one coefficient per feature, so each coefficient is paired with
# the name of the column it multiplies (taken from X_train, the frame the model
# was fitted on, so order is guaranteed to match model.coef_).
# The sign is rendered explicitly so negative coefficients do not print "+-".
formula_terms = " ".join(
    f"{'-' if coef < 0 else '+'} {abs(coef)}*{name}"
    for coef, name in zip(model.coef_, X_train.columns)
)
print(f"Model´s formula: y = {model.intercept_} {formula_terms}")

Model´s formula: y = 27.825782181326637+-2.5002800161345475+-0.5814398280247978+1.3593106216871567e-06+-0.2473007853356227+-3.325787730184531+0.4876624855479087+-10.5781839357494+0.007108700980108832+-3.0758669631744655X


In [30]:
# Predict popularity for the held-out test rows
y_pred = model.predict(X=X_test)


## Multiple Regression Model Assessment

In [33]:
# Import the metric functions used below (mean squared error and r2) from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Evaluate the fitted model on the held-out test split.
# model.score() reports R^2 for regressors, so `score` and `r2` are expected to
# agree; both are kept because both are printed below.
score = model.score(X_test, y_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
# Spread of the actual test targets, printed next to the RMSE
# (presumably as a baseline for comparison; confirm intent).
std = np.std(y_test)

# Report each metric on its own line.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.03951816769403138.
The r2 is 0.03951816769403138.
The mean squared error is 262.37637060905905.
The root mean squared error is 16.1980360108582.
The standard deviation is 16.52790404705884.


In [36]:
# Build a copy of the original DataFrame with an extra "predicted" column.
# assign() returns a new frame, so df_attributes itself is left unmodified.
# Predictions cover every row of X (training and test rows alike).
df_attributes_copy = df_attributes.assign(predicted=model.predict(X))

# Preview the first rows, including the new column
df_attributes_copy.head()

Unnamed: 0.1,Unnamed: 0,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence,predicted
0,0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365,24.257743
1,1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408,23.920134
2,2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37,24.883457
3,3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183,24.7534
4,4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666,23.413821
