In [1]:
import pandas as pd
import seaborn as sns
import random

In [2]:
df = pd.read_csv("Spotify - Data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            2000 non-null   object 
 1   song              2000 non-null   object 
 2   duration          2000 non-null   int64  
 3   year              2000 non-null   int64  
 4   popularity        2000 non-null   int64  
 5   danceability      2000 non-null   float64
 6   energy            2000 non-null   float64
 7   key               2000 non-null   int64  
 8   loudness          2000 non-null   float64
 9   mode              2000 non-null   int64  
 10  speechiness       2000 non-null   float64
 11  acousticness      2000 non-null   float64
 12  instrumentalness  2000 non-null   float64
 13  liveness          2000 non-null   float64
 14  valence           2000 non-null   float64
 15  tempo             2000 non-null   float64
dtypes: float64(9), int64(5), object(2)
memory 

In [4]:
df.isnull().values.any()

False

In [5]:
df = df.drop(columns = ['artist', 'song'])

In [6]:
corr = df.corr()

In [7]:
for column in corr:
    if not(abs(corr['popularity'][column]) > 0.02):
        df = df.drop(columns = column)

In [8]:
df

Unnamed: 0,duration,popularity,loudness,mode,speechiness,acousticness,instrumentalness
0,211160,77,-5.444,0,0.0437,0.3000,0.000018
1,167066,79,-4.918,1,0.0488,0.0103,0.000000
2,250546,66,-9.007,1,0.0290,0.1730,0.000000
3,224493,78,-4.063,0,0.0466,0.0263,0.000013
4,200560,65,-4.806,0,0.0516,0.0408,0.001040
...,...,...,...,...,...,...,...
1995,181026,79,-5.065,0,0.0588,0.0427,0.000000
1996,178426,78,-5.707,1,0.1570,0.1170,0.000021
1997,200593,69,-8.635,1,0.1090,0.0669,0.000000
1998,171029,75,-7.513,1,0.0656,0.4500,0.000002


In [9]:
def normalizingData (df):
    minMaxList = {}
    for column in df:
        minMaxList[column] = [df[column].max(), df[column].min()]
        df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())  
    return df, minMaxList

In [10]:
df, minMaxList = normalizingData (df)

In [11]:
def testTrainSplit (df):
    df = df.sample(frac = 1, ignore_index=True)
    length = len(df.index)
    splitPoint = round(length * 0.70)
    training = df.iloc[:splitPoint, :]
    testing = df.iloc[splitPoint:, :]
    return training, testing

In [12]:
training, testing = testTrainSplit (df)

In [13]:
def trainMRModel(trainDF, iterations):
    modelA = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    
    distanceA = calculateDistance(trainDF, modelA)
    print("First model - weights: ", modelA, " distance: ", distanceA)
  
    modelB = modelA[:] 
  
    for i in range(iterations):
        modelB = take_best_step(trainDF, modelB)
  
    distanceB = calculateDistance(trainDF, modelB)
    print("Final model - weights: ", modelB, " distance: ", distanceB)
    return modelB

In [14]:
def take_random_step(model):
    return [model[0] + (random.uniform(0, 1) - 0.5), model[1] + (random.uniform(0, 1) - 0.5), model[2] + (random.uniform(0, 1) - 0.5), model[3] + (random.uniform(0, 1) - 0.5), model[4] + (random.uniform(0, 1) - 0.5), model[5] + (random.uniform(0, 1) - 0.5), model[6] + (random.uniform(0, 1) - 0.5)]

def take_best_step(trainDF, model):
    old_distance = calculateDistance(trainDF, model)
    for i in range(50): 
        new_model = take_random_step(model)
        new_distance = calculateDistance(trainDF, new_model)
        if new_distance < old_distance:
            return new_model
    return model

def calculateDistance(trainDF, model):
    sum = 0
    for i in range(len(trainDF.index)):
        duration = trainDF.iat[i, 0]
        loudness = trainDF.iat[i, 2]
        instrumentalness = trainDF.iat[i, 6]
        mode = trainDF.iat[i, 3]
        speechiness = trainDF.iat[i, 4]
        acousticness = trainDF.iat[i, 5]
        popularity = trainDF.iat[i, 1]
        

    
        a = model[0]
        b = model[1]
        c = model[2]
        d = model[3]
        e = model[4]
        f = model[5]
        g = model[6]

        numerator = abs((a * duration) + (b * loudness) + (c * instrumentalness) + (d * mode) + (e * speechiness) + (f * acousticness) + (-1 * popularity) + g)
        denominator = ((a**2) + (b**2) + (c**2) + (d**2) + (e**2) + (f**2) + (-1**2))**0.5
        distance = numerator / denominator
        sum = sum + distance
 
    return sum

In [15]:
modelB = trainMRModel(training, 100)

First model - weights:  [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]  distance:  1545.6125753166243
Final model - weights:  [0.13405679712864327, 0.1492793575281286, 0.12663816114003335, -0.05718258910090357, -0.035298996590933274, 0.12739527157388297, 0.6187123431072269]  distance:  (1.4364963838664216e-14-234.59766274922146j)


In [16]:
def MSE (df, model):
    sum = 0
    a = model[0]
    b = model[1]
    c = model[2]
    d = model[3]
    e = model[4]
    f = model[5]
    g = model[6]
    for row in range (len(df.index)):
        duration = df.iat[row, 0]
        loudness = df.iat[row, 2]
        instrumentalness = df.iat[row, 6]
        mode = df.iat[row, 3]
        speechiness = df.iat[row, 4]
        acousticness = df.iat[row, 5]
        
        prediction = (a * duration) + (b * loudness) + (c * instrumentalness) + (d * mode) + (e * speechiness) + (f * acousticness) + g
        
        sum = ((prediction - df.iat[row, 1])**2) + sum
    return sum/len(df.index)

In [17]:
MSE = MSE (testing, modelB)

In [18]:
MSE

0.06256244971967988

In [19]:
RMSE = MSE**0.5

In [20]:
RMSE

0.2501248682551974

In [21]:
example = {'duration':[168602], 'popularity':[0], 'loudness':[-5.692], 'mode':[1], 'speechiness':[0.0493], 'acousticness':[0.299], 'instrumentalness':[0.0]}

In [22]:
example = pd.DataFrame.from_dict(example)

In [23]:
example

Unnamed: 0,duration,popularity,loudness,mode,speechiness,acousticness,instrumentalness
0,168602,0,-5.692,1,0.0493,0.299,0.0


In [24]:
def normalizeExample (df, minMaxList):
    for column in df:
        df[column] = (df[column] - minMaxList[column][1]) / (minMaxList[column][0] - minMaxList[column][1])  
    return df

In [25]:
example = normalizeExample (example, minMaxList)

In [26]:
def prediction (df, model):
    duration = df.iat[0, 0]
    loudness = df.iat[0, 2]
    instrumentalness = df.iat[0, 6]
    mode = df.iat[0, 3]
    speechiness = df.iat[0, 4]
    acousticness = df.iat[0, 5]
    
    a = model[0]
    b = model[1]
    c = model[2]
    d = model[3]
    e = model[4]
    f = model[5]
    g = model[6]
    predictedValue = (a * duration) + (b * loudness) + (c * mode) + (d * speechiness) + (e * acousticness) + (f* instrumentalness) + g
    df['popularity'][0] = predictedValue
    
    return df

In [27]:
example = prediction (example, modelB)

In [28]:
def denormalizeData (df, minMaxList):
    for column in df:
        df[column] = (df[column] * (minMaxList[column][0] - minMaxList[column][1])) + minMaxList[column][1]   
    return df

In [29]:
example = denormalizeData (example, minMaxList)

In [30]:
example

Unnamed: 0,duration,popularity,loudness,mode,speechiness,acousticness,instrumentalness
0,168602.0,76.651286,-5.692,1.0,0.0493,0.299,0.0


In [50]:
import numpy as np
import sklearn
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [51]:
df = pd.read_csv("Spotify - Data.csv")

In [52]:
df = df.drop(columns = ['artist', 'song'])

In [53]:
df.fillna(method ='ffill', inplace = True)

In [54]:
df.dropna(inplace = True)

In [55]:
df, minMaxList = normalizingData (df)

In [56]:
features = ['duration', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness']

In [57]:
X = df[features]
y = df['popularity']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [59]:
X = sklearn.preprocessing.normalize(X)

In [60]:
y = sklearn.preprocessing.normalize([y])

In [61]:
regr = LinearRegression()

In [62]:
regr.fit(X_train, y_train)

LinearRegression()

In [63]:
print(regr.intercept_)

0.5265321546802264


In [64]:
print(regr.coef_)

[ 0.11899099  0.14631809 -0.00921855  0.05338777  0.02206409 -0.13015106]


In [65]:
y_pred = regr.predict(X_test)

In [66]:
MSE = sklearn.metrics.mean_squared_error(y_test, y_pred)

In [67]:
MSE

0.0638201168930191

In [68]:
RMSE = MSE**0.5

In [69]:
RMSE

0.25262643743879837