In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


## Importing data

In [2]:
attributes_df = pd.read_csv("../Resources/songAttributes_1999-2019.csv", encoding='unicode_escape')
attributes_df.drop(columns=["X"], inplace=True)
attributes_df["Popularity"].describe()
attributes_df.head()

Unnamed: 0,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365
1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408
2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37
3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183
4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666


In [3]:
attributes_billboard = pd.read_csv("../Resources/attributesBillboard.csv")
attributes_billboard.drop(columns=["Unnamed: 0"], inplace=True)
attributes_billboard.head()

Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",Sucker \nWe go together\nBetter than birds of ...,...,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",You Need To Calm Down \nYou are somebody that ...,...,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...","Hey Look Ma, I Made It \nAll my life, been hus...",...,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",Rumor \nGirl you know I've known you forever\n...,...,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...","High Hopes \nHigh, high hopes\nHad to have hig...",...,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [4]:
print(attributes_df['Popularity'].describe())

print(attributes_billboard['Popularity'].describe())


count    154931.000000
mean         20.249111
std          16.506651
min           0.000000
25%           6.000000
50%          17.000000
75%          31.000000
max          91.000000
Name: Popularity, dtype: float64
count    4774.000000
mean       39.847926
std        21.823721
min         0.000000
25%        23.000000
50%        42.000000
75%        58.000000
max        89.000000
Name: Popularity, dtype: float64


We are using the 75% to make the cut as 0= popular 1=Most popular 

In [5]:
# Making popularity as a binary variable in both DF
attributes_df['Popularity'] = np.where((attributes_df['Popularity'] > 31) & attributes_df['Popularity']
                                            .notna(), 1, 0)

attributes_billboard['Popularity'] = np.where((attributes_billboard['Popularity'] > 58) & attributes_billboard['Popularity']
                                            .notna(), 1, 0)

# Keras Sequential Model

## Attributes DF 

In [6]:
# Creating a sample for the second model
attributes_sample_df = attributes_df.sample(frac=0.3, random_state=78)

In [7]:
X = attributes_sample_df[['Valence', 'Energy', "Danceability", "Acousticness", "Tempo", "Duration", "Instrumentalness","Liveness","Loudness"]]
y = attributes_sample_df['Popularity']

In [8]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=9))

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 20)                200       
                                                                 
 dense_1 (Dense)             (None, 20)                420       
                                                                 
 dense_2 (Dense)             (None, 1)                 21        
                                                                 
Total params: 641 (2.50 KB)
Trainable params: 641 (2.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Compile the model and train
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [11]:
# Evaluate the performance of model using the loss and predictive accuracy of the model on the test dataset.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

364/364 - 1s - loss: 0.5303 - accuracy: 0.7615 - 593ms/epoch - 2ms/step
Loss: 0.5303089022636414, Accuracy: 0.7615318298339844


## Attributes Billboard DF

In [12]:
X = attributes_billboard[['Valence', 'Energy', "Danceability", "Acousticness", "Tempo", "Duration", "Instrumentalness","Liveness","Loudness"]]
y = attributes_billboard['Popularity']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=9))

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 20)                200       
                                                                 
 dense_4 (Dense)             (None, 20)                420       
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                                 
Total params: 641 (2.50 KB)
Trainable params: 641 (2.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Compile the model and train
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [16]:
# Evaluate the performance of model using the loss and predictive accuracy of the model on the test dataset.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

38/38 - 0s - loss: 0.6347 - accuracy: 0.6901 - 158ms/epoch - 4ms/step
Loss: 0.6347009539604187, Accuracy: 0.6901172399520874


# SVC Model


## Attributes Billboard DF

In [17]:
attributes_billboard

Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",Sucker \nWe go together\nBetter than birds of ...,...,False,0.000000,0.1060,-5.065,0,1,0.0588,137.958,4,0.952
1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",You Need To Calm Down \nYou are somebody that ...,...,False,0.000000,0.0637,-5.617,1,1,0.0553,85.026,4,0.714
2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...","Hey Look Ma, I Made It \nAll my life, been hus...",...,False,0.000000,0.1210,-3.337,1,1,0.0695,107.936,4,0.580
3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",Rumor \nGirl you know I've known you forever\n...,...,False,0.000000,0.1150,-6.857,1,1,0.0486,140.975,4,0.599
4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...","High Hopes \nHigh, high hopes\nHad to have hig...",...,False,0.000000,0.0640,-2.729,1,1,0.0618,82.014,4,0.681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4769,mark wills,wish you were here,81,34.0,20.0,1999-07-12,,Country,"Debbie moore, Bill anderson, Skip ewing",Wish You Were Here \nThey kissed goodbye at th...,...,False,0.000000,0.3910,-14.857,1,0,0.0310,136.094,4,0.272
4770,mark wills,wish you were here,81,34.0,20.0,1999-07-12,,Country,"Debbie moore, Bill anderson, Skip ewing",Wish You Were Here \nThey kissed goodbye at th...,...,False,0.000000,0.1170,-10.234,1,0,0.0285,138.045,4,0.256
4771,clay walker,she's always right,85,74.0,6.0,1999-07-12,,Pop,,She's Always Right \nShe's Perfect for a cotto...,...,False,0.000000,0.1430,-10.020,1,0,0.0315,142.685,4,0.480
4772,collective soul,heavy,96,73.0,20.0,1999-07-12,,"Hockey,Gaming,Soundtrack,Rock",Collective soul,Heavy \nComplicate this world you wrapped for ...,...,False,0.000563,0.0839,-5.560,1,0,0.0371,106.923,4,0.548


In [18]:
scaled_data_arr = StandardScaler().fit_transform(
    attributes_billboard[[
        "Danceability",
        "Acousticness",
        "Energy",
        "Instrumentalness",
        "Liveness",
        "Duration",
        "Loudness",
        "Valence",
        "Speechiness"
    ]]
)

attributes_scaled_df= pd.DataFrame(scaled_data_arr, columns=[[
        "Danceability",
        "Acousticness",
        "Energy",
        "Instrumentalness",
        "Liveness",
        "Duration",
        "Loudness",
        "Valence",
        "Speechiness"
    ]])
attributes_scaled_df

Unnamed: 0,Danceability,Acousticness,Energy,Instrumentalness,Liveness,Duration,Loudness,Valence,Speechiness
0,1.640417,-0.598419,0.178807,-0.138798,-0.546187,-1.208851,0.356331,1.926640,-0.322868
1,1.143386,-0.756771,-0.186643,-0.138798,-0.814590,-1.422540,0.122208,0.857526,-0.359151
2,-0.214696,-0.735870,0.753087,-0.138798,-0.451009,-1.459964,1.089238,0.255589,-0.211946
3,0.331337,2.749215,-0.830533,-0.138798,-0.489080,-0.823936,-0.403720,0.340938,-0.428606
4,-0.200696,0.113955,1.164944,-0.138798,-0.812686,-0.989569,1.347112,0.709288,-0.291769
...,...,...,...,...,...,...,...,...,...
4769,-0.522715,0.839125,-2.170519,-0.138798,1.262197,0.470381,-3.796805,-1.127969,-0.611057
4770,-0.662724,0.099735,-1.770263,-0.138798,-0.476390,0.138076,-1.836026,-1.199842,-0.636973
4771,-1.040747,1.213559,-0.952350,-0.138798,-0.311415,-0.748182,-1.745261,-0.193618,-0.605874
4772,-0.424709,-0.796646,1.153342,-0.131373,-0.686416,-1.380386,0.146384,0.111843,-0.547821


In [19]:
# preparing the data for the model
popularity = attributes_billboard["Popularity"]
target_popularity=["Popular","Not popular"]
X = attributes_scaled_df

In [20]:
popularity.value_counts()

Popularity
0    3641
1    1133
Name: count, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, popularity, random_state=42, stratify=popularity)

In [22]:
model = SVC(kernel="linear", random_state=42)
model.fit(X_train, y_train)

In [23]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.7625698324022346
Testing Data Score: 0.7629815745393634


In [24]:
# Calculate the classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_popularity))

              precision    recall  f1-score   support

     Popular       0.76      1.00      0.87       911
 Not popular       0.00      0.00      0.00       283

    accuracy                           0.76      1194
   macro avg       0.38      0.50      0.43      1194
weighted avg       0.58      0.76      0.66      1194



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Neural Networks

## attributesBillboard

In [25]:
df_01 = pd.read_csv("../Resources/attributesBillboard.csv")
print(df_01.info())
df_01.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4774 entries, 0 to 4773
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        4774 non-null   int64  
 1   Artist            4774 non-null   object 
 2   Name              4774 non-null   object 
 3   Weekly.rank       4774 non-null   int64  
 4   Peak.position     4229 non-null   float64
 5   Weeks.on.chart    4229 non-null   float64
 6   Week              4774 non-null   object 
 7   Date              3938 non-null   object 
 8   Genre             4774 non-null   object 
 9   Writing.Credits   4279 non-null   object 
 10  Lyrics            4774 non-null   object 
 11  Features          717 non-null    object 
 12  Acousticness      4774 non-null   float64
 13  Album             4774 non-null   object 
 14  Danceability      4774 non-null   float64
 15  Duration          4774 non-null   int64  
 16  Energy            4774 non-null   float64


Unnamed: 0.1,Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",...,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",...,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...",...,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",...,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...",...,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [26]:
df_01 = df_01.drop(columns= ["Unnamed: 0", "Artist","Name","Genre","Writing.Credits","Lyrics","Album","Week","Date"])
df_01['Features'] = df_01['Features'].notna().astype(int)
df_01.head()

Unnamed: 0,Weekly.rank,Peak.position,Weeks.on.chart,Features,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,6,1.0,17.0,0,0.0427,0.842,181027,0.734,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,13,2.0,2.0,0,0.00929,0.771,171360,0.671,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,24,24.0,11.0,0,0.0137,0.577,169667,0.833,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,26,25.0,16.0,0,0.749,0.655,198440,0.56,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,32,4.0,47.0,0,0.193,0.579,190947,0.904,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [27]:
df_01_dummies = pd.get_dummies(df_01)
df_01_dummies.head()

Unnamed: 0,Weekly.rank,Peak.position,Weeks.on.chart,Features,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,6,1.0,17.0,0,0.0427,0.842,181027,0.734,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,13,2.0,2.0,0,0.00929,0.771,171360,0.671,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,24,24.0,11.0,0,0.0137,0.577,169667,0.833,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,26,25.0,16.0,0,0.749,0.655,198440,0.56,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,32,4.0,47.0,0,0.193,0.579,190947,0.904,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [28]:
df_01_dummies["Popularity"].describe()

count    4774.000000
mean       39.847926
std        21.823721
min         0.000000
25%        23.000000
50%        42.000000
75%        58.000000
max        89.000000
Name: Popularity, dtype: float64

In [29]:
df_01_dummies = df_01_dummies.dropna()
df_01_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4229 entries, 0 to 4773
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Weekly.rank       4229 non-null   int64  
 1   Peak.position     4229 non-null   float64
 2   Weeks.on.chart    4229 non-null   float64
 3   Features          4229 non-null   int32  
 4   Acousticness      4229 non-null   float64
 5   Danceability      4229 non-null   float64
 6   Duration          4229 non-null   int64  
 7   Energy            4229 non-null   float64
 8   Explicit          4229 non-null   bool   
 9   Instrumentalness  4229 non-null   float64
 10  Liveness          4229 non-null   float64
 11  Loudness          4229 non-null   float64
 12  Mode              4229 non-null   int64  
 13  Popularity        4229 non-null   int64  
 14  Speechiness       4229 non-null   float64
 15  Tempo             4229 non-null   float64
 16  TimeSignature     4229 non-null   int64  
 17  

In [30]:
df_01_dummies["is_popular"] = df_01_dummies["Popularity"] >= 58
df_01_dummies = df_01_dummies.drop(columns=["Popularity"])
df_01_dummies["is_popular"].value_counts()

is_popular
False    3125
True     1104
Name: count, dtype: int64

In [31]:
# Split our preprocessed data into our features and target arrays
X = df_01_dummies.drop(columns = ["is_popular"])
y = df_01_dummies["is_popular"]
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
num_input_features = len(X_train_scaled[0])
num_neurons_hl1 = 30
num_neurons_hl2 = 30
num_neurons_hl3 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl1, input_dim = num_input_features, activation = "relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl2, activation = "relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl3, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 30)                540       
                                                                 
 dense_7 (Dense)             (None, 30)                930       
                                                                 
 dense_8 (Dense)             (None, 30)                930       
                                                                 
 dense_9 (Dense)             (None, 1)                 31        
                                                                 
Total params: 2431 (9.50 KB)
Trainable params: 2431 (9.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])


In [35]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [36]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

34/34 - 0s - loss: 0.6929 - accuracy: 0.6758 - 159ms/epoch - 5ms/step
Loss: 0.6929194331169128, Accuracy: 0.6758034229278564


## Artist Week

In [37]:
df_02 = pd.read_csv("../Resources/ArtistWeek.csv")
print(df_02.info())
df_02.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595 entries, 0 to 594
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      595 non-null    int64  
 1   index           595 non-null    int64  
 2   Artist          595 non-null    object 
 3   Followers       130 non-null    float64
 4   Genres          129 non-null    object 
 5   NumAlbums       130 non-null    float64
 6   YearFirstAlbum  130 non-null    float64
 7   Gender          126 non-null    object 
 8   Group.Solo      130 non-null    object 
 9   Features        108 non-null    object 
 10  Week            595 non-null    object 
 11  Total streams   595 non-null    float64
dtypes: float64(4), int64(2), object(6)
memory usage: 55.9+ KB
None


Unnamed: 0.1,Unnamed: 0,index,Artist,Followers,Genres,NumAlbums,YearFirstAlbum,Gender,Group.Solo,Features,Week,Total streams
0,0,0,ed sheeran,52698756.0,"pop,uk pop",8.0,2011.0,M,Solo,,2018-07-20,936155300.0
1,1,1,justin bieber,30711450.0,"canadian pop,dance pop,pop,post-teen pop",10.0,2009.0,M,Solo,,2018-12-14,45434490.0
2,2,2,jonas brothers,3069527.0,"boy band,dance pop,pop,post-teen pop",10.0,2006.0,M,Group,,2019-04-12,182558400.0
3,3,3,drake,41420478.0,"canadian hip hop,canadian pop,hip hop,pop rap,...",11.0,2010.0,M,Solo,,2018-07-20,3441947000.0
4,4,4,chris brown,9676862.0,"dance pop,pop,pop rap,r&b,rap",6.0,2005.0,M,Solo,,2017-11-10,138334200.0


In [38]:
df_02 = df_02.drop(columns= ["Unnamed: 0", "index","Artist","Genres","Week"])
df_02['Features'] = df_02['Features'].notna().astype(int)
df_02.head()

Unnamed: 0,Followers,NumAlbums,YearFirstAlbum,Gender,Group.Solo,Features,Total streams
0,52698756.0,8.0,2011.0,M,Solo,0,936155300.0
1,30711450.0,10.0,2009.0,M,Solo,0,45434490.0
2,3069527.0,10.0,2006.0,M,Group,0,182558400.0
3,41420478.0,11.0,2010.0,M,Solo,0,3441947000.0
4,9676862.0,6.0,2005.0,M,Solo,0,138334200.0


In [39]:
df_02_dummies = pd.get_dummies(df_02)
df_02_dummies.head()

Unnamed: 0,Followers,NumAlbums,YearFirstAlbum,Features,Total streams,Gender_F,Gender_M,Group.Solo_Group,Group.Solo_Solo
0,52698756.0,8.0,2011.0,0,936155300.0,False,True,False,True
1,30711450.0,10.0,2009.0,0,45434490.0,False,True,False,True
2,3069527.0,10.0,2006.0,0,182558400.0,False,True,True,False
3,41420478.0,11.0,2010.0,0,3441947000.0,False,True,False,True
4,9676862.0,6.0,2005.0,0,138334200.0,False,True,False,True


In [40]:
df_02_dummies["Followers"].describe()

count    1.300000e+02
mean     6.954067e+06
std      8.476361e+06
min      1.047000e+04
25%      1.916465e+06
50%      3.766748e+06
75%      8.624010e+06
max      5.269876e+07
Name: Followers, dtype: float64

In [41]:
df_02_dummies = df_02_dummies.dropna()
df_02_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 130 entries, 0 to 129
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Followers         130 non-null    float64
 1   NumAlbums         130 non-null    float64
 2   YearFirstAlbum    130 non-null    float64
 3   Features          130 non-null    int32  
 4   Total streams     130 non-null    float64
 5   Gender_F          130 non-null    bool   
 6   Gender_M          130 non-null    bool   
 7   Group.Solo_Group  130 non-null    bool   
 8   Group.Solo_Solo   130 non-null    bool   
dtypes: bool(4), float64(4), int32(1)
memory usage: 6.1 KB


In [42]:
df_02_dummies["is_popular"] = df_02_dummies["Followers"] >= 8624010
df_02_dummies = df_02_dummies.drop(columns=["Followers"])
df_02_dummies["is_popular"].value_counts()

is_popular
False    97
True     33
Name: count, dtype: int64

In [43]:

# Split our preprocessed data into our features and target arrays
X = df_02_dummies.drop(columns = ["is_popular"])
y = df_02_dummies["is_popular"]
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [44]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
num_input_features = len(X_train_scaled[0])
num_neurons_hl1 = 30
num_neurons_hl2 = 30
num_neurons_hl3 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl1, input_dim = num_input_features, activation = "relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl2, activation = "relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl3, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 30)                270       
                                                                 


 dense_11 (Dense)            (None, 30)                930       
                                                                 
 dense_12 (Dense)            (None, 30)                930       
                                                                 
 dense_13 (Dense)            (None, 1)                 31        
                                                                 
Total params: 2161 (8.44 KB)
Trainable params: 2161 (8.44 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])


In [47]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [48]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 0.4644 - accuracy: 0.8485 - 127ms/epoch - 64ms/step
Loss: 0.4644012749195099, Accuracy: 0.8484848737716675


## Attributes

In [49]:
df_03 = pd.read_csv("../Resources/songAttributes_1999-2019.csv", encoding='latin-1')
print(df_03.info())
df_03.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154931 entries, 0 to 154930
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   X                 154931 non-null  int64  
 1   Acousticness      154931 non-null  float64
 2   Album             154931 non-null  object 
 3   Artist            154931 non-null  object 
 4   Danceability      154931 non-null  float64
 5   Duration          154931 non-null  int64  
 6   Energy            154931 non-null  float64
 7   Explicit          154931 non-null  bool   
 8   Instrumentalness  154931 non-null  float64
 9   Liveness          154931 non-null  float64
 10  Loudness          154931 non-null  float64
 11  Mode              154931 non-null  int64  
 12  Name              154931 non-null  object 
 13  Popularity        154931 non-null  int64  
 14  Speechiness       154931 non-null  float64
 15  Tempo             154931 non-null  float64
 16  TimeSignature     15

Unnamed: 0,X,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365
1,1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408
2,2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37
3,3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183
4,4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666


In [50]:
df_03 = df_03.drop(columns= ["X", "Album","Artist","Name"])
df_03.head()

Unnamed: 0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,35,0.0309,106.022,4,0.365
1,0.0182,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,31,0.0282,120.027,4,0.408
2,0.000473,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,30,0.0559,144.061,4,0.37
3,0.00097,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,35,0.0254,111.975,4,0.183
4,3.6e-05,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,21,0.0318,92.721,4,0.666


In [51]:
df_03_dummies = pd.get_dummies(df_03)
df_03_dummies.head()

Unnamed: 0,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,35,0.0309,106.022,4,0.365
1,0.0182,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,31,0.0282,120.027,4,0.408
2,0.000473,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,30,0.0559,144.061,4,0.37
3,0.00097,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,35,0.0254,111.975,4,0.183
4,3.6e-05,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,21,0.0318,92.721,4,0.666


In [52]:
df_03_dummies["Popularity"].describe()

count    154931.000000
mean         20.249111
std          16.506651
min           0.000000
25%           6.000000
50%          17.000000
75%          31.000000
max          91.000000
Name: Popularity, dtype: float64

In [53]:
df_03_dummies = df_03_dummies.dropna()
df_03_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154931 entries, 0 to 154930
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Acousticness      154931 non-null  float64
 1   Danceability      154931 non-null  float64
 2   Duration          154931 non-null  int64  
 3   Energy            154931 non-null  float64
 4   Explicit          154931 non-null  bool   
 5   Instrumentalness  154931 non-null  float64
 6   Liveness          154931 non-null  float64
 7   Loudness          154931 non-null  float64
 8   Mode              154931 non-null  int64  
 9   Popularity        154931 non-null  int64  
 10  Speechiness       154931 non-null  float64
 11  Tempo             154931 non-null  float64
 12  TimeSignature     154931 non-null  int64  
 13  Valence           154931 non-null  float64
dtypes: bool(1), float64(9), int64(4)
memory usage: 15.5 MB


In [54]:
df_03_dummies["is_popular"] = df_03_dummies["Popularity"] >= 31
df_03_dummies = df_03_dummies.drop(columns=["Popularity"])
df_03_dummies["is_popular"].value_counts()

is_popular
False    115742
True      39189
Name: count, dtype: int64

In [55]:
# Split our preprocessed data into our features and target arrays
X = df_03_dummies.drop(columns = ["is_popular"])
y = df_03_dummies["is_popular"]
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [56]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [57]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
num_input_features = len(X_train_scaled[0])
num_neurons_hl1 = 10
num_neurons_hl2 = 10
num_neurons_hl3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl1, input_dim = num_input_features, activation = "relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = num_neurons_hl2, activation = "relu"))

# Third hidden layer
#nn.add(tf.keras.layers.Dense(units = num_neurons_hl3, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 10)                140       
                                                                 
 dense_15 (Dense)            (None, 10)                110       
                                                                 
 dense_16 (Dense)            (None, 1)                 11        
                                                                 
Total params: 261 (1.02 KB)
Trainable params: 261 (1.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])


In [59]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1211/1211 - 1s - loss: 0.5367 - accuracy: 0.7463 - 1s/epoch - 1ms/step
Loss: 0.536716103553772, Accuracy: 0.7462887167930603
