*Importing libraries*

In [8]:
import pandas as pd
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
data = pd.read_csv(
    r"E:\Doniawy\AI_projects\MUSIC_RECOMMDATION_SYSTEM\code\Spotify-2000.csv"
)

*show data information*

In [10]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Index                   1994 non-null   int64 
 1   Title                   1994 non-null   object
 2   Artist                  1994 non-null   object
 3   Top_Genre               1994 non-null   object
 4   Year                    1994 non-null   int64 
 5   Beats_Per_Minute_(BPM)  1994 non-null   int64 
 6   Energy                  1994 non-null   int64 
 7   Danceability            1994 non-null   int64 
 8   Loudness_(dB)           1994 non-null   int64 
 9   Liveness                1994 non-null   int64 
 10  Valence                 1994 non-null   int64 
 11  Length_(Duration)       1994 non-null   object
 12  Acousticness            1994 non-null   int64 
 13  Speechiness             1994 non-null   int64 
 14  Popularity              1994 non-null   int64 
dtypes: i

**Data Cleaning**

*Drop index column*

In [11]:
data.drop("Index", axis=1, inplace=True)

In [12]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title                   1994 non-null   object
 1   Artist                  1994 non-null   object
 2   Top_Genre               1994 non-null   object
 3   Year                    1994 non-null   int64 
 4   Beats_Per_Minute_(BPM)  1994 non-null   int64 
 5   Energy                  1994 non-null   int64 
 6   Danceability            1994 non-null   int64 
 7   Loudness_(dB)           1994 non-null   int64 
 8   Liveness                1994 non-null   int64 
 9   Valence                 1994 non-null   int64 
 10  Length_(Duration)       1994 non-null   object
 11  Acousticness            1994 non-null   int64 
 12  Speechiness             1994 non-null   int64 
 13  Popularity              1994 non-null   int64 
dtypes: int64(10), object(4)
memory usage: 218.2+ KB
None


*check null entries existance*

In [13]:
print(data.isna().sum())

Title                     0
Artist                    0
Top_Genre                 0
Year                      0
Beats_Per_Minute_(BPM)    0
Energy                    0
Danceability              0
Loudness_(dB)             0
Liveness                  0
Valence                   0
Length_(Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64


*check duplicates*

In [14]:
data.duplicated().sum()

0

*perform label encoding on string data*

In [15]:
encoder = LabelEncoder()
data["Title"] = encoder.fit_transform(data["Title"])
data["Artist"] = encoder.fit_transform(data["Artist"])
data["Top_Genre"] = encoder.fit_transform(data["Top_Genre"])

*perform scalling on numeric columns*

In [16]:
data.Title = data.Title / 1000
data.Artist = data.Artist / 100
data.Year = data.Year / 1000
data.Top_Genre = data.Top_Genre / 100

In [17]:
data.Top_Genre.max()

1.48

In [18]:
data[["Length_(Duration)"]] = data[["Length_(Duration)"]].apply(
    pd.to_numeric, errors="coerce"
)

In [19]:
scaler = StandardScaler()
data.iloc[:, 4:13] = scaler.fit_transform(data.iloc[:, 4:13])

*checking null values*

In [20]:
data.isna().sum()

Title                     0
Artist                    0
Top_Genre                 0
Year                      0
Beats_Per_Minute_(BPM)    0
Energy                    0
Danceability              0
Loudness_(dB)             0
Liveness                  0
Valence                   0
Length_(Duration)         4
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

*dropping nan values*

In [21]:
data.dropna(inplace=True)

*separate the target from data*

In [22]:
x = data.iloc[:, 0:-1]
y = data.iloc[:, -1]

*check null values in x and y*

In [23]:
print(x.isna().sum())
print("*-" * 10)
print(y.isna().sum())

Title                     0
Artist                    0
Top_Genre                 0
Year                      0
Beats_Per_Minute_(BPM)    0
Energy                    0
Danceability              0
Loudness_(dB)             0
Liveness                  0
Valence                   0
Length_(Duration)         0
Acousticness              0
Speechiness               0
dtype: int64
*-*-*-*-*-*-*-*-*-*-
0


*Splitting data into train and test*

In [24]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=22
)

*Building the model*

In [25]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

*get prediction values*

In [26]:
y_pred = lr_model.predict(x_test)

*get the error*

In [27]:
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Mean Squared Error: 170.97013889803753


*Check fitting*

In [28]:
train_pred = lr_model.predict(x_train)
test_pred = lr_model.predict(x_test)
print(r2_score(y_train, train_pred))
print(r2_score(y_test, test_pred))

0.13482522793684115
0.16233185669508576
