In [138]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [139]:
# Load the Spotify 2023 track table from the working directory.
# NOTE(review): latin-1 decoding is presumably required because the file is
# not valid UTF-8 — confirm against the data source.
df = pd.read_csv("./spotify-2023.csv", encoding="latin-1")

In [140]:
# Preview the first rows to sanity-check the parsed columns and values.
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [141]:
# Discard the row labelled 574 before `streams` is cast to integers.
# NOTE(review): presumably this row holds a malformed `streams` value — confirm.
# errors='ignore' keeps the cell re-runnable: once the label is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(index=574, errors='ignore')

In [142]:
# Store stream counts as 64-bit integers so they can be used numerically.
df['streams'] = df['streams'].astype(np.int64)

In [143]:
# Encode the musical mode as an integer flag: Major -> 0, Minor -> 1.
# The result is assigned back to the frame rather than using
# replace(..., inplace=True) on the column selection: that chained in-place
# call warns on recent pandas and does not modify `df` under Copy-on-Write,
# which would make the int64 cast fail on the untouched strings.
df['mode'] = df['mode'].replace({'Major': 0, 'Minor': 1}).astype('int64')

In [144]:
# Columns used for modelling: the target (`streams`) followed by the features.
columns = [
    'streams',
    'released_year',
    'bpm',
    'mode',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

In [145]:
# Take an independent copy restricted to the modelling columns so later
# transformations do not touch `df`, then report dtypes and null counts.
df_copy = df.loc[:, columns].copy()
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 952 entries, 0 to 952
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   streams             952 non-null    int64
 1   released_year       952 non-null    int64
 2   bpm                 952 non-null    int64
 3   mode                952 non-null    int64
 4   danceability_%      952 non-null    int64
 5   valence_%           952 non-null    int64
 6   energy_%            952 non-null    int64
 7   acousticness_%      952 non-null    int64
 8   instrumentalness_%  952 non-null    int64
 9   liveness_%          952 non-null    int64
 10  speechiness_%       952 non-null    int64
dtypes: int64(11)
memory usage: 89.2 KB


In [146]:
def custom_standard_scaler(data):
    """Standardise ``data`` column-wise to zero mean and unit spread.

    Each column is centred on its mean and divided by its standard deviation,
    both computed along axis 0. The standard deviation is whatever
    ``data.std(axis=0)`` returns (numpy defaults to the population estimate,
    pandas to the sample estimate).

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame/Series
        Samples in rows, features in columns.

    Returns
    -------
    Same kind of container as ``data`` holding the standardised values.
    Columns with zero standard deviation (constant columns) come back as all
    zeros instead of NaN/inf.
    """
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    # A constant column has no spread; dividing by 0 would produce NaN/inf.
    # Dividing by 1 instead leaves the centred values, which are exactly 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

In [147]:
# Standardise (z-score) every modelling column, the `streams` target included.
df_copy[columns] = df_copy[columns].pipe(custom_standard_scaler)

In [148]:
# Target vector: the `streams` column; feature matrix: every other column.
y = df_copy['streams']
X = df_copy.drop(columns=['streams'])

In [149]:
def custom_train_test_split(X, y, train_size=0.7):
    """Split ``X`` and ``y`` into leading train rows and trailing test rows.

    The split is positional and performs no shuffling: the first
    ``int(n_samples * train_size)`` rows form the training set and the
    remaining rows form the test set.

    Parameters
    ----------
    X : sliceable with a ``shape`` attribute (e.g. numpy array, DataFrame)
        Feature rows; ``X.shape[0]`` is taken as the number of samples.
    y : sliceable
        Targets, sliced at the same position as ``X``.
    train_size : float, default 0.7
        Fraction of rows assigned to the training set; must lie in [0, 1].

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``train_size`` is outside [0, 1].
    """
    if not 0 <= train_size <= 1:
        raise ValueError(f"train_size must be between 0 and 1, got {train_size!r}")

    n_samples = X.shape[0]
    n_train_samples = int(n_samples * train_size)

    X_train, X_test = X[:n_train_samples], X[n_train_samples:]
    y_train, y_test = y[:n_train_samples], y[n_train_samples:]

    return X_train, X_test, y_train, y_test

In [150]:
class CustomLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit``, ``coefficients`` holds the intercept at position 0 followed
    by one weight per feature column. ``fit`` must be called before
    ``predict``.
    """

    def __init__(self):
        # Set by fit(); None until the model has been trained.
        self.coefficients = None

    def fit(self, X, y):
        """Estimate the intercept and feature weights from training data.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
            Training features.
        y : array-like, length n_samples
            Training targets.
        """
        # Add bias term to X
        X = np.c_[np.ones(X.shape[0]), X]
        y = np.asarray(y)

        # Solve the least-squares problem directly instead of inverting
        # X^T X: this is numerically more stable and still returns a
        # (minimum-norm) solution when features are collinear, where the
        # explicit inverse fails or produces meaningless values.
        self.coefficients, *_ = np.linalg.lstsq(X, y, rcond=None)

    def predict(self, X):
        """Return predictions for the rows of ``X`` using the fitted weights.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
            Features with the same column layout used in ``fit``.
        """
        # Add bias term to X
        X = np.c_[np.ones(X.shape[0]), X]

        # Make predictions
        return X.dot(self.coefficients)

In [151]:
# Positional split: leading rows train the model, trailing rows evaluate it.
X_train, X_test, y_train, y_test = custom_train_test_split(X=X, y=y)

In [152]:
# Create an unfitted regression model; its coefficients are set by fit().
model = CustomLinearRegression()

In [153]:
# Fit on the training rows, then predict the held-out rows.
model.fit(X=X_train, y=y_train)
predictions = model.predict(X=X_test)

In [156]:
# Model evaluation: mean of the squared residuals on the held-out rows.
# `streams` was standardised earlier, so the error is in standardised units.
squared_errors = np.square(y_test - predictions)
print('Mean squared error : ', squared_errors.mean())

Mean squared error :  0.6692974090010414
