This notebook implements a ridge regression model trained only on the numerical features of the dataset.
The regularization hyperparameter is set to its optimal value, alpha=1000.
The dataset is split into 80% training data and 20% testing data.

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, validation_curve, learning_curve, cross_validate
from category_encoders import TargetEncoder

In [2]:
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
class My_Ridge:
    """Ridge (L2-penalised) linear regression fitted with the closed-form solution.

    The model solves ``(X^T X + alpha * P) w = X^T y`` where ``P`` is the
    identity matrix with the entry of the intercept column set to zero, so the
    bias term is not shrunk by the penalty.

    Parameters
    ----------
    alpha : float
        Regularisation strength that multiplies the penalty matrix.

    Attributes
    ----------
    coefficients : numpy.ndarray or None
        Fitted weights (one per design-matrix column, intercept included).
        ``None`` until :meth:`fit` has been called.
    """

    def __init__(self, alpha):
        self.alpha = alpha
        self.coefficients = None  # populated by fit()

    @staticmethod
    def _with_intercept(X):
        """Return a *copy* of ``X`` as a DataFrame that has an "intercept" column of ones."""
        # Copy first: inserting into the caller's frame would silently change
        # the data seen by every later cell that reuses it.
        design = pd.DataFrame(X).copy()
        if "intercept" not in design:
            design.insert(0, "intercept", 1)
        return design

    def fit(self, X, y):
        """Estimate the ridge coefficients from training data.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Training features. An "intercept" column of ones is added at
            position 0 when no column of that name exists. ``X`` itself is
            never modified.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Training targets.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the regularised normal-equation matrix is singular.
        """
        design = self._with_intercept(X)
        features = design.to_numpy(dtype=float)
        targets = np.asarray(y, dtype=float)

        # Penalise every weight except the intercept, wherever that column sits.
        penalty = np.eye(features.shape[1])
        intercept_pos = list(design.columns).index("intercept")
        penalty[intercept_pos, intercept_pos] = 0

        gram = features.T @ features + self.alpha * penalty
        moment = features.T @ targets
        # Solving the linear system directly is cheaper and numerically more
        # stable than forming the explicit inverse of the Gram matrix.
        self.coefficients = np.linalg.solve(gram, moment)

    def predict(self, X):
        """Return predictions for ``X`` using the fitted coefficients.

        ``X`` must have the same feature columns, in the same order, as the
        data passed to :meth:`fit`; columns are matched by position, not by
        name. ``X`` itself is never modified.
        """
        design = self._with_intercept(X)
        return design.to_numpy(dtype=float) @ self.coefficients

    def r2score(self, Y_pred, Y_act):
        """Return the coefficient of determination (R^2) of the predictions."""
        return r2_score(Y_act, Y_pred)

    def mse_score(self, Y_pred, Y_act):
        """Return the root mean squared error (RMSE) of the predictions.

        Despite its name, this method has always returned the *root* of the
        MSE. The square root is taken here rather than through
        ``squared=False``, which newer scikit-learn releases no longer accept.
        For multi-output targets the per-output RMSEs are averaged, matching
        what ``squared=False`` used to return.
        """
        per_output_mse = mean_squared_error(Y_act, Y_pred, multioutput="raw_values")
        return np.sqrt(per_output_mse).mean()

In [4]:
# Load the raw dataset; the path is resolved relative to the working directory.
spotify = pd.read_csv(filepath_or_buffer="dataset.csv")

In [5]:
# Drop, in a single pass, the four columns that are not used anywhere below.
spotify = spotify.drop(columns=["Unnamed: 0", "album_name", "track_name", "track_id"])

# Pairwise correlation between the numeric/boolean columns.
# The frame still contains string columns (e.g. "artists", "track_genre"), so
# numeric_only=True is needed: pandas >= 2.0 no longer skips non-numeric
# columns silently and raises on them instead.
cor = spotify.corr(numeric_only=True)

In [6]:
# Strength (sign ignored) of each column's correlation with the target.
cor_target = cor["popularity"].abs()

# Keep only the columns whose absolute correlation exceeds 0.02.
relevant_feat = cor_target[cor_target.gt(0.02)]
relevant_feat

popularity          1.000000
explicit            0.044082
danceability        0.035448
loudness            0.050423
speechiness         0.044927
acousticness        0.025472
instrumentalness    0.095139
valence             0.040534
time_signature      0.031073
Name: popularity, dtype: float64

In [7]:
# Numerical features fed to the model.
# NOTE(review): "explicit" and "acousticness" also pass the 0.02 correlation
# threshold above but are not listed here - confirm that this is intentional.
predictors = [
    "danceability",
    "loudness",
    "speechiness",
    "instrumentalness",
    "valence",
    "time_signature",
]

In [8]:
# Remove the numeric columns that are not kept as predictors. One drop() call
# with a list replaces seven sequential calls, each of which copied the frame.
spotify = spotify.drop(
    columns=[
        "acousticness",
        "tempo",
        "duration_ms",
        "energy",
        "key",
        "mode",
        "liveness",
    ]
)
spotify

Unnamed: 0,artists,popularity,explicit,danceability,loudness,speechiness,instrumentalness,valence,time_signature,track_genre
0,Gen Hoshino,73,False,0.676,-6.746,0.1430,0.000001,0.7150,4,acoustic
1,Ben Woodward,55,False,0.420,-17.235,0.0763,0.000006,0.2670,4,acoustic
2,Ingrid Michaelson;ZAYN,57,False,0.438,-9.734,0.0557,0.000000,0.1200,4,acoustic
3,Kina Grannis,71,False,0.266,-18.515,0.0363,0.000071,0.1430,3,acoustic
4,Chord Overstreet,82,False,0.618,-9.681,0.0526,0.000000,0.1670,4,acoustic
...,...,...,...,...,...,...,...,...,...,...
113995,Rainy Lullaby,21,False,0.172,-16.393,0.0422,0.928000,0.0339,5,world-music
113996,Rainy Lullaby,22,False,0.174,-18.318,0.0401,0.976000,0.0350,4,world-music
113997,Cesária Evora,22,False,0.629,-10.895,0.0420,0.000000,0.7430,4,world-music
113998,Michael W. Smith,41,False,0.587,-10.889,0.0297,0.000000,0.4130,4,world-music


In [9]:
# Target column first, followed by the selected predictors.
spotify_num = spotify[["popularity", *predictors]]
spotify_num

Unnamed: 0,popularity,danceability,loudness,speechiness,instrumentalness,valence,time_signature
0,73,0.676,-6.746,0.1430,0.000001,0.7150,4
1,55,0.420,-17.235,0.0763,0.000006,0.2670,4
2,57,0.438,-9.734,0.0557,0.000000,0.1200,4
3,71,0.266,-18.515,0.0363,0.000071,0.1430,3
4,82,0.618,-9.681,0.0526,0.000000,0.1670,4
...,...,...,...,...,...,...,...
113995,21,0.172,-16.393,0.0422,0.928000,0.0339,5
113996,22,0.174,-18.318,0.0401,0.976000,0.0350,4
113997,22,0.629,-10.895,0.0420,0.000000,0.7430,4
113998,41,0.587,-10.889,0.0297,0.000000,0.4130,4


In [10]:
# Regularisation strength; 1000 is the value the introduction reports as optimal.
RIDGE_ALPHA = 1000
ridge = My_Ridge(alpha=RIDGE_ALPHA)


In [11]:
# Separate the target from the features.
# NOTE: this cell rebinds spotify_num without the target column, so running it
# a second time raises KeyError; re-run the cell that builds spotify_num first.
y = spotify_num["popularity"].copy()  # labels
yt = np.array(y).reshape(-1, 1)  # column vector, one row per sample
spotify_num = spotify_num.drop(columns=["popularity"])

In [12]:
# 80% training set and 20% testing set; the fixed seed keeps the split reproducible.
train, test, y_train, y_test = train_test_split(
    spotify_num, yt, test_size=0.2, random_state=10
)

# Standardise the features (zero mean, unit variance) using statistics computed
# on the training set only, so no information from the test set leaks in.
x_mean = train.mean()
x_std = train.std()
X = train.sub(x_mean).div(x_std)
X_test = test.sub(x_mean).div(x_std)

In [13]:
# Fit on the standardised training features, then predict the held-out set.
ridge.fit(X=X, y=y_train)
pred = ridge.predict(X=X_test)

In [14]:
# Evaluate the test-set predictions against the actual popularity values.
rmse = ridge.mse_score(Y_pred=pred, Y_act=y_test)
r2 = ridge.r2score(Y_pred=pred, Y_act=y_test)

for label, score in (("RMSE score is:", rmse), ("R2 score is:", r2)):
    print(label, score)

RMSE score is: 21.891594107122977
R2 score is: 0.023829443201239364


In [15]:
pred  # predicted popularity for the test set, as returned by ridge.predict(X_test)

array([[34.71542782],
       [25.8205111 ],
       [37.12331792],
       ...,
       [35.30697808],
       [35.76777625],
       [26.86314572]])

In [16]:
y_test  # actual popularity values of the held-out 20% test split

array([[12],
       [ 0],
       [43],
       ...,
       [ 3],
       [61],
       [27]], dtype=int64)