<a href="https://colab.research.google.com/github/Ali7109/SpotifyStreamPrediction/blob/main/SpotifyStreamPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spotify Stream Prediction (Linear)
Explore the intriguing world of Spotify stream prediction in our project, "Spotify Stream Prediction (Linear)," where we investigate the connection between a song's 'speechiness' and its popularity.

# **PROJECT INITIALIZATION**

In [14]:
!pip install -q sklearn

In [15]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

# **Project Start**

Initialization complete.
This is assisting logic from this point on.

In [16]:
from sklearn.preprocessing import MinMaxScaler


In [34]:
df_train = pd.read_csv("spotify-2023.csv", encoding="ISO-8859-1")
df_eval = pd.read_csv("spotify-2023.csv", encoding="ISO-8859-1")
columns_to_check = [
    'artist_count', 'released_month', 'released_day', 'in_spotify_playlists',
    'in_spotify_charts', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists',
    'in_deezer_charts', 'in_shazam_charts', 'bpm', 'key', 'mode', 'danceability_%',
    'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'track_name', 'artist(s)_name',
    'released_year'
]

for column in columns_to_check:
  df_eval.pop(column)
  df_train.pop(column)

df_train['speechiness_%'] = pd.to_numeric(df_train['speechiness_%'], errors='coerce')
df_train['streams'] = pd.to_numeric(df_train['streams'], errors='coerce')

df_eval['speechiness_%'] = pd.to_numeric(df_eval['speechiness_%'], errors='coerce')
df_eval['streams'] = pd.to_numeric(df_eval['streams'], errors='coerce')

# Fill NaN values
df_train = df_train.fillna(0)
df_eval = df_eval.fillna(0)

scaler = MinMaxScaler()

df_train[['speechiness_%', 'streams']] = scaler.fit_transform(df_train[['speechiness_%', 'streams']])
df_eval[['speechiness_%', 'streams']] = scaler.transform(df_eval[['speechiness_%', 'streams']])

y_train = df_train.pop("streams")
y_eval = df_eval.pop("streams")


In [None]:
# Display the styled DataFrame

NUMERIC_COLUMNS = ['speechiness_%']

feature_columns = [tf.feature_column.numeric_column(feature_name, dtype=tf.float32) for feature_name in NUMERIC_COLUMNS]

print(feature_columns)

In [41]:


def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  def input_function():
    features = dict(data_df)
    ds = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
      ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).repeat(num_epochs)

    return ds
  return input_function


train_input_fn = make_input_fn(df_train, y_train)
eval_input_fn = make_input_fn(df_eval, y_eval, num_epochs=1, shuffle=False)

linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)

linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)




{'average_loss': 0.023199234, 'label/mean': 0.13866426, 'loss': 0.023077926, 'prediction/mean': 0.12885933, 'global_step': 300}


In [42]:
formatted_result = {
    'Average Loss': '{:.3f}'.format(result['average_loss']),
    'Label Mean': '{:.3f}'.format(result['label/mean']),
    'Total Loss': '{:.3f}'.format(result['loss']),
    'Prediction Mean': '{:.3f}'.format(result['prediction/mean']),
    'Global Step': result['global_step']
}

for key, value in formatted_result.items():
    print(f'{key}: {value}')

Average Loss: 0.023
Label Mean: 0.139
Total Loss: 0.023
Prediction Mean: 0.129
Global Step: 300
