<a href="https://colab.research.google.com/github/1021114Carlos/DS_deepDive/blob/diving_production/PopularityPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BUSINESS PROBLEM: Predict the popularity of songs on the Spotify Top 200 Weekly (Global) charts. The features of the songs will be used as input variables, and the song's popularity score will be the target variable for the model. This dataset is suitable for tree-based regression models such as Decision Trees, Random Forest, and/or XGBoost.

In [None]:
# Remote location of the Spotify CSV that is read into `music_df` further down.
url = "https://ddc-datascience.s3.amazonaws.com/Projects/Project.4-Spotify/Data/Spotify.csv"
# Header-only (-I), silent (-s) request: shows the response headers without downloading the body.
!curl -s -I {url}

In [None]:
# NOTE(review): `%magic` only prints IPython's magic-command reference and does nothing
# for the analysis. It looks like leftover exploration; consider removing this cell.
%magic

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the CSV at `url` into the raw working frame.
music_df = pd.read_csv(url)

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
music_df.shape

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
music_df.describe().T

In [None]:
# Display the raw frame. (`%%capture` is left disabled so the output is actually shown.)
#%%capture
music_df

In [None]:
# Boolean mask with the same shape as the frame: True where a cell is NaN.
music_df.isnull()

In [None]:
# Percentage of missing values per column.
# isnull().mean() is the fraction of NaN cells in each column. The earlier
# `.sum()*100` scaled the raw NaN count by 100, which is neither a count nor a percentage.
music_df.isnull().mean() * 100

In [None]:
# Count how many rows share each distinct True/False pattern of missing cells.
music_df.isnull().value_counts()

In [None]:
# dtype pandas inferred for each column.
music_df.dtypes

### **observations about data type**
>**From "Week of Highest Charting" to "Chord", the columns have object type instead of ints or floats. We need to take a closer look at these columns.**


In [None]:
# Column names, non-null counts, dtypes and memory usage in one report.
music_df.info()

In [None]:
# Print every cell whose text form contains whitespace or a comma, to spot
# blank-looking cells and comma-formatted numbers hiding in object columns.
import re

# One or more whitespace characters, or one or more commas.
pattern = re.compile(r'\s+|,+')

for col in music_df.columns:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for index, value in music_df[col].items():
        if pattern.search(str(value)):  # str() so numeric cells can be scanned too
            print(f"Column: {col}, Index: {index}, Value: {value}")

In [None]:
# Trim leading/trailing whitespace in every text column, then turn the cells that
# end up empty into NaN so pandas counts them as missing.
text_columns = [
    name for name in music_df.columns
    if pd.api.types.is_string_dtype(music_df[name])
]
for name in text_columns:
    music_df[name] = music_df[name].str.strip()
music_df = music_df.replace({"": np.nan})

In [None]:
# Number of NaN cells per column, now that empty strings have been replaced by NaN.
music_df.isnull().sum()

In [None]:
# Re-check non-null counts and dtypes after the whitespace clean-up.
music_df.info()

In [None]:
# Total number of NaN cells in the whole frame (per-column counts summed).
music_df.isnull().sum().sum()

### ***Let's make a copy of the original dataset and drop the null values from the copy.***

In [None]:
# Work on an independent copy so `music_df` keeps the cleaned-but-complete data.
music_df_copy = music_df.copy()

In [None]:
# Drop every row that has at least one NaN. Only the copy is affected; `music_df` is untouched.
music_df_copy = music_df_copy.dropna()

In [None]:
# Per-column NaN count of the copy; expected to be all zeros after dropna().
music_df_copy.isnull().sum()

In [None]:
# Total NaN count of the copy as a single number.
music_df_copy.isnull().sum().sum()

In [None]:
# Row count and dtypes that remain after dropping incomplete rows.
music_df_copy.info()

### **Exploring and modifying columns, dropping columns where necessary.**




In [None]:
# "Streams" holds integers written as text with comma separators.
# Remove the commas and cast the column to int.
music_df_copy['Streams'] = music_df_copy['Streams'].str.replace(',', '').astype(int)

In [None]:
# Split 'Week of Highest Charting' on '--' into two new columns:
# WHC_start_date (the text before '--') and WEC_end_date (the text after it).
music_df_copy[['WHC_start_date', 'WEC_end_date']] = music_df_copy['Week of Highest Charting'].str.split('--', expand=True)

In [None]:
# Parse both week-boundary columns from 'YYYY-MM-DD' text into datetime values.
for date_col in ('WHC_start_date', 'WEC_end_date'):
    music_df_copy[date_col] = pd.to_datetime(music_df_copy[date_col], format='%Y-%m-%d')

In [None]:
# "Weeks Charted" carries similar information to "Week of Highest Charting", which has
# already been split into the two date columns, so drop both original text columns.
music_df_copy = music_df_copy.drop(columns = ['Week of Highest Charting', "Weeks Charted"])

In [None]:
# Distinct values found in "Song ID".
music_df_copy["Song ID"].unique()

In [None]:
# Author's observation: 1516 of the 1535 Song IDs are unique. The share of repeated
# identifiers is too small to be useful, so the column is dropped in the next cell.
music_df_copy["Song ID"].nunique()

In [None]:
# Remove the identifier column; it is not used as a feature.
music_df_copy = music_df_copy.drop("Song ID", axis=1)

In [None]:
# Look at the distinct values of the "Index" column.
music_df_copy["Index"].unique()

In [None]:
# Number of distinct "Index" values.
music_df_copy["Index"].nunique()

In [None]:
# As suspected, the "Index" column only numbers the rows of the dataset (1, 2, ..., 1556),
# so it carries no information. Drop it.
music_df_copy = music_df_copy.drop("Index", axis=1)

In [None]:
# How many distinct artists appear in the dataset?
music_df_copy["Artist"].nunique()

In [None]:
# Number of rows (charted songs) per artist, most frequent first.
music_df_copy["Artist"].value_counts()

In [None]:
# Share of rows belonging to each artist, expressed as a percentage rounded to 3 decimals.
artist_counts = music_df_copy['Artist'].value_counts(normalize=True).mul(100).round(3)
artist_info_df = pd.DataFrame(
    {
        'Artist Name': artist_counts.index,
        'Percentage (%)': artist_counts.values,
    }
)
artist_info_df

In [None]:
# Check the dtypes of the artist-share table.
artist_info_df.dtypes

In [None]:
# Cast "Popularity" to a numeric dtype, then list the names of the songs scoring above 90.
music_df_copy["Popularity"] = pd.to_numeric(music_df_copy["Popularity"])
music_df_copy.loc[music_df_copy["Popularity"] > 90, ["Song Name"]]

## **Let's look at the data visually.**

In [None]:
# Bar chart of the 25 artists with the most rows (charted songs) in the dataset.
# The axes is created first and labelled through `ax`: calling plt.title() before an axes
# exists makes pyplot create an implicit axes, so the title ends up on the wrong one.
fig, ax = plt.subplots(figsize=(17, 12))
music_df_copy["Artist"].value_counts().head(25).plot(ax=ax, kind="bar")
# value_counts() counts rows per artist, i.e. songs, not genres; label accordingly.
ax.set_title('Top 25 artists by number of charted songs', y=1.05)
ax.set_xlabel("Artist")
ax.set_ylabel("Number of songs")
plt.show()

In [None]:
# The 20 artists that occur most often in the dataset, drawn as a bar chart.
top_artists = music_df_copy['Artist'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(14, 8))
top_artists.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title("Top 20 Most Popular Artists", y=1.25)
ax.set_xlabel("Artist")
ax.set_ylabel("Count")
# Slanted, right-anchored tick labels so long artist names stay legible.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# The 50 rows with the highest stream counts.
top_songs = music_df_copy.sort_values(by='Streams', ascending=False).head(50)

# Plain Python lists for the bar positions (song names) and heights (streams).
song_names = list(top_songs['Song Name'])
stream_counts = list(top_songs['Streams'])

fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(song_names, stream_counts, color='lightcoral', edgecolor='black')
ax.set_title("Popularity of top 50 Songs")
ax.set_xlabel("Song")
ax.set_ylabel("Streams")
# Vertical tick labels: 50 song titles do not fit horizontally.
plt.xticks(rotation=90, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# One point per song: stream count on x, popularity score on y.
sns.scatterplot(data=music_df_copy, x="Streams", y="Popularity", s=50)
plt.gca().set(xlabel='Streaming Frequency', ylabel='Popularity');

In [None]:
# Correlation of every numeric column with "Popularity", strongest first.
# Based on this observation, I might not need these columns.

# Restrict to numeric columns before calling corr(): from pandas 2.0 on, DataFrame.corr()
# no longer drops text columns silently and raises on them instead.
corr = (
    music_df_copy.select_dtypes(include="number")
    .corr()[['Popularity']]
    .sort_values(by='Popularity', ascending=False)
)
plt.figure(figsize=(8, 4))
heatmap = sns.heatmap(corr, annot=True, cmap='Greens');
heatmap.set_title('HCP, NTC, Streams columns correlated features to Popularity', fontdict={'fontsize':18}, pad=16);

In [None]:
# Grid of scatter plots: mean popularity for every distinct value of each candidate feature.
fig = plt.figure(figsize=(20, 15))
# Figure-level title. plt.title() before any subplot exists would be attached to an
# implicit full-figure axes rather than to the grid.
fig.suptitle('Popularity trend over different features')
Xfeatures = ["Highest Charting Position", "Number of Times Charted", "Song Name", "Streams", "Artist", "Artist Followers", "Genre"]
Yfeature = "Popularity"
cols = 4
rows = -(-len(Xfeatures) // cols)  # ceiling division: just enough rows for all panels
for idx, feat in enumerate(Xfeatures):
    plt.subplot(rows, cols, idx + 1)
    # Aggregate only the target column. Calling .mean() on the whole grouped frame
    # raises on text columns from pandas 2.0 on.
    # Mean popularity is good enough to estimate and visualize.
    mean_popularity = music_df_copy.groupby(feat)[Yfeature].mean()
    sns.scatterplot(x=mean_popularity.index, y=mean_popularity);

## **Dropping columns. These columns are not features that contribute to the popularity of a song:**
 * Highest Charting Position
 * Number of Times Charted
 * Song Name

In [None]:
# Remove the three columns that are not used as model features.
music_df_copy = music_df_copy.drop(columns=["Highest Charting Position", "Number of Times Charted", "Song Name"])

In [None]:
# Instead of one-hot encoding, obtain every individual genre label and count how often it occurs.
# The resulting table has one row per genre: 'Genre' and 'Genre Count'.

# `everyGenre` has to be built here: its only other definition in the notebook is commented out,
# so on a fresh kernel this cell used to fail with a NameError.
# Assumes each "Genre" cell is a bracketed list written as text, e.g. "['pop', 'dance pop']" — TODO confirm.
everyGenre = music_df_copy[["Genre"]].copy()
everyGenre["Genre"] = everyGenre["Genre"].apply(
    # Drop the surrounding brackets, split on commas, trim blanks and quote characters.
    lambda raw: [g.strip().strip("'\"") for g in raw[1:-1].split(",") if g.strip()]
)

# explode() yields one row per genre label; rows with an empty list become NaN,
# which value_counts() ignores.
genre_count = everyGenre["Genre"].explode().value_counts().reset_index(name='Genre Count')
genre_count.columns = ['Genre', 'Genre Count']  # Rearrange columns for clarity

genre_count

In [None]:
# NOTE(review): this whole cell is commented out, so nothing in it runs and it creates no
# variables (`everyGenre`, `item_counts`, `genre_total` are NOT defined by this cell).
# It is an earlier, manual implementation of the per-genre tally, kept for reference only.
# Consider deleting it.
#
# Let's evaluate popular genres
# everyGenre = music_df_copy[["Genre"]]

# def count_unique_items(list_of_items):
#   item_counts = {}

#   for item in list_of_items:
#     # If the item is already in the dictionary, increment its count
#     if item in item_counts:
#       item_counts[item] += 1
#     # Otherwise, add the item to the dictionary with a count of 1
#     else:
#       item_counts[item] = 1

#   return item_counts

# everyGenre["Genre"] = everyGenre["Genre"].apply(lambda x: [i.strip() for i in x[1:-1].split(", ")])
# # Flatten all lists in the "Genre" column into a single list
# all_items = sum(everyGenre["Genre"], [])

# # Count occurrences of each unique item
# item_counts = count_unique_items(all_items)

# # Convert the item counts into a DataFrame
# total_counts = pd.DataFrame.from_dict(item_counts, orient='index', columns=['Count'])

# # Sort the DataFrame by count in descending order
# genre_total = total_counts.sort_values(by='Count', ascending=False)
# genre_total

In [None]:
# Look at the top genres: those counted more than 80 times. Let's keep this column for now.
genre_count[genre_count["Genre Count"] > 80]

In [None]:
# Add a per-song "Genre Count" feature: how often the song's most common genre occurs in the dataset.
#
# The previous pd.concat(..., axis=1) glued the genre table on by ROW INDEX, so song i received
# the count of the i-th most frequent genre (unrelated to the song) and the frame ended up with
# two "Genre" columns. The count is now looked up from each song's own genres.
# Assumes each "Genre" cell is a bracketed list written as text, e.g. "['pop', 'dance pop']" — TODO confirm.
_genre_lists = music_df_copy["Genre"].apply(
    lambda raw: [g.strip().strip("'\"") for g in str(raw)[1:-1].split(",") if g.strip()]
)
# Occurrences of every individual genre label across all songs.
_genre_freq = _genre_lists.explode().value_counts()
# A song can carry several genres; use the count of its most frequent one (0 if it has none).
music_df_copy["Genre Count"] = _genre_lists.apply(
    lambda genres: max((int(_genre_freq[g]) for g in genres), default=0)
)

In [None]:
# Table with one row per artist: 'Artist' and the number of rows it appears in ('Artist Count').
artist_count = (
    music_df_copy["Artist"]
    .explode()  # only changes anything if a cell holds a list; kept from the original pipeline
    .value_counts()
    .rename_axis("Artist")
    .reset_index(name="Artist Count")
)

artist_count

In [None]:
# Use the artist count as a feature as well: for every song, the number of rows its artist has.
#
# The previous pd.concat(..., axis=1) aligned the artist table by ROW INDEX instead of by artist
# name, so each song received the count of an unrelated artist and the frame gained a second
# "Artist" column. Series.map() looks the count up by the song's own artist.
music_df_copy["Artist Count"] = music_df_copy["Artist"].map(music_df_copy["Artist"].value_counts())

In [None]:
# Inspect the frame after adding the count columns.
music_df_copy

### *Let's drop the NaN*

In [None]:
# Remove every row that contains at least one NaN.
music_df_copy = music_df_copy.dropna()

### Now let's investigate the audio-feature columns: Speechiness, Energy, Loudness, etc.

In [None]:
# Plotting these audio features raised an error because they are stored as object (text).
# Convert all of them to float in one go.
audio_cols = ['Danceability', 'Energy', 'Loudness', 'Speechiness',
              'Acousticness', 'Liveness', 'Tempo', 'Valence']
music_df_copy[audio_cols] = music_df_copy[audio_cols].astype(float);

In [None]:
# Inspect the frame after the dtype conversion.
music_df_copy

In [None]:
# Global plot styling for the charts that follow: seaborn dark grid, larger font,
# wide default figure and a fully transparent figure background.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (16, 8),
    'figure.facecolor': '#00000000',
})

In [None]:
# First 100 rows only, to keep the exploratory plots below readable.
# The name was misspelled `tem_music_df_copy`; every plotting cell reads
# `temp_music_df_copy`, so those cells failed with a NameError.
temp_music_df_copy = music_df_copy.head(100)

In [None]:
# Danceability (blue squares, solid) and Energy (red circles, dashed) of the sampled
# songs, both plotted against their stream count.
for column, line_style in (("Danceability", 's-b'), ("Energy", 'o--r')):
    plt.plot(temp_music_df_copy["Streams"], temp_music_df_copy[column], line_style, label=column)

plt.title("Danceability & Energy relation")
plt.xlabel('streaming frequency')
plt.ylabel('Danceability/Energy')
plt.legend();

In [None]:
# Danceability (blue squares, solid) and Energy (red circles, dashed) of the sampled
# songs, both plotted against Acousticness.
for column, line_style in (("Danceability", 's-b'), ("Energy", 'o--r')):
    plt.plot(temp_music_df_copy["Acousticness"], temp_music_df_copy[column], line_style, label=column)

plt.xlabel('Acousticness')
# Both curves share the y axis, so name both. The second curve is Energy: the legend
# used to call it 'acoustics', which is actually the x axis.
plt.ylabel('Danceability/Energy')
plt.title("Danceability & Energy vs. Acousticness")
plt.legend();

In [None]:
# One point per sampled song: stream count on x, valence on y.
sns.scatterplot(data=temp_music_df_copy, x="Streams", y="Valence", s=100)
plt.gca().set(
    xlabel='Num of times streamed',
    ylabel='Valence(Happy, cheerful)',
    title="Streams based on Valence",
);

In [None]:


# One line per audio feature: its mean value among the sampled songs at each popularity score.
fig, ax = plt.subplots(figsize=(20, 5))
sns.despine(fig, left=True, bottom=True)
sns.set_context(rc={"lines.linewidth": 3})

features = ["Acousticness","Danceability","Energy","Speechiness","Liveness","Valence"]
ax.set_ylabel('Measure')
ax.set_xlabel('Popularity')
# The x axis is the popularity score, not time; the title used to say "over years".
ax.set_title('Audio characteristics by popularity', y = 1.05)
for col in features:
    x = temp_music_df_copy.groupby("Popularity")[col].mean()
    # Draw on the axes created above explicitly instead of relying on the current axes.
    sns.lineplot(x=x.index, y=x, label=col, ax=ax)

### *Danceability, Energy, and Valence are strong features for popularity*

In [None]:
# Grid of scatter plots: mean popularity for every distinct value of each candidate feature.
fig = plt.figure(figsize=(20, 15))
# Figure-level title. plt.title() before any subplot exists would be attached to an
# implicit full-figure axes rather than to the grid.
fig.suptitle('Popularity trend over different features')
Xfeatures = ["Danceability", "Acousticness", "Loudness", "Energy", "Liveness", "Valence", "Genre Count", "Artist Count"]
Yfeature = "Popularity"
cols = 4
# Ceiling division: 8 features on 4 columns need 2 rows (the old `//cols + 1` reserved an empty third row).
rows = -(-len(Xfeatures) // cols)
for idx, feat in enumerate(Xfeatures):
    plt.subplot(rows, cols, idx + 1)
    # Aggregate only the target column. Calling .mean() on the whole grouped frame
    # raises on text columns from pandas 2.0 on.
    mean_popularity = music_df_copy.groupby(feat)[Yfeature].mean()
    sns.scatterplot(x=mean_popularity.index, y=mean_popularity)

## time to process the data

In [None]:
# Last look at the frame before building the models.
music_df_copy

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import plotly.express as px

In [None]:
# Selecting features based on the observations from the graphs above.
# .copy() makes the feature matrix an independent frame: a later cell assigns converted
# columns back into it, which on a plain column selection triggers SettingWithCopyWarning.
features_selected = music_df_copy[["Energy", "Valence", "Danceability", "Acousticness", "Streams", "Artist Followers", "Genre Count", "Artist Count"]].copy()
# Regression target.
target = music_df_copy["Popularity"]

### *Decision tree*

In [None]:
# Decision tree: for max_depth 1..5, repeat a random 80/20 split `numLoops` times and
# record the test MSE of each repeat.
X = features_selected
y = target

results = {}  # "Depth = d" -> array with the MSE of every repeat

numLoops = 100

for i in range(1, 6):
    mean_error = np.zeros(numLoops)
    for idx in range(numLoops):
        # random_state=idx: each repeat uses a different split, yet the run is reproducible
        # and every depth is scored on the same set of splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=idx
        )
        model = DecisionTreeRegressor(max_depth=i, random_state=idx)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mean_error[idx] = mean_squared_error(y_test, y_pred)
    # Store once per depth (this used to be re-assigned on every repeat).
    results[f"Depth = {i}"] = mean_error

# Report every depth. The old single print ran after the loops and therefore only
# showed the deepest tree. The value is the mean of the per-repeat RMSEs.
for depth, error in results.items():
    print(f'Decision Tree RMSE ({depth}): {np.sqrt(error).mean()}')

### *Random Forest*

In [None]:
# Random forest: for max_depth 1..5, repeat a random 80/20 split `numLoops` times and
# record the test MSE of each repeat.
results = {}  # "Depth = d" -> array with the MSE of every repeat

numLoops = 100

for i in range(1, 6):
    mean_error = np.zeros(numLoops)
    for idx in range(numLoops):
        # random_state=idx: different split per repeat, reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=idx
        )
        random_forest_model = RandomForestRegressor(max_depth=i, random_state=idx)
        # Fit and score the forest itself. The cell used to call `model.fit` / `model.predict`,
        # i.e. it re-evaluated the decision tree left over from the previous cell and
        # reported those numbers as "Random Forest".
        random_forest_model.fit(X_train, y_train)
        y_pred = random_forest_model.predict(X_test)
        mean_error[idx] = mean_squared_error(y_test, y_pred)
    # Store once per depth (this used to be re-assigned on every repeat).
    results[f"Depth = {i}"] = mean_error

# Report every depth; the old single print only showed the deepest setting.
for depth, error in results.items():
    print(f'Random Forest RMSE ({depth}): {np.sqrt(error).mean()}')

# The feature-importance cell further down reads `model` and presents the result as
# random-forest importances, so point `model` at the last fitted forest.
model = random_forest_model

### *XGBRegressor*

In [None]:
# XGBoost: for max_depth 1..5, repeat a random 80/20 split `numLoops` times and
# record the test MSE of each repeat.
results = {}  # "Depth = d" -> array with the MSE of every repeat

# Define numLoops
numLoops = 100

# Make every feature numeric (values that cannot be parsed become NaN).
# Building a new frame avoids assigning column by column into a frame that may be a
# slice of another one (SettingWithCopyWarning).
X = X.apply(pd.to_numeric, errors='coerce')

# Perform grid search over the tree depth
for i in range(1, 6):
    mean_error = np.zeros(numLoops)
    for idx in range(numLoops):
        # random_state=idx: different split per repeat, reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=idx
        )
        xgb_model = XGBRegressor(max_depth=i)
        xgb_model.fit(X_train, y_train)
        y_pred_xgb = xgb_model.predict(X_test)
        mean_error[idx] = mean_squared_error(y_test, y_pred_xgb)
    results[f"Depth = {i}"] = mean_error

# Print results. The key already reads "Depth = d"; the old label printed "Depth=Depth = d".
for depth, error in results.items():
    print(f'XGBRegressor RMSE ({depth}): {np.sqrt(error).mean()}')

In [None]:
# Feature importances of the estimator currently bound to `model`, shown as a table.
# NOTE(review): the result is named and plotted as random-forest importances, so `model`
# must be the fitted random forest at this point — confirm it is not the decision tree.
importances = model.feature_importances_

# One row per feature, most important first.
feature_importances_rf = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Display feature importances
print(feature_importances_rf)

In [None]:
# Sort by importance (descending) and then reverse the row order, so the rows are
# passed to the horizontal bar chart from least to most important.
feature_importances_rf = (
    feature_importances_rf
    .sort_values(by='Importance', ascending=False)
    .iloc[::-1]
)

# Plot the feature importance values using Plotly Express
fig = px.bar(
    feature_importances_rf,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Feature Importance Plot for Random Forest Model',
    labels={'Importance': 'Importance', 'Feature': 'Feature'},
)
fig.show()

In [None]:
# Feature importances of the last XGBoost model that was fitted.
importances = xgb_model.feature_importances_

# One row per feature, most important first.
feature_importances_xgb = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Sort by importance (descending) and then reverse the row order, so the rows are
# passed to the horizontal bar chart from least to most important.
feature_importances_xgb = (
    feature_importances_xgb
    .sort_values(by='Importance', ascending=False)
    .iloc[::-1]
)

# Plot the feature importance values using Plotly Express
fig = px.bar(
    feature_importances_xgb,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Feature Importance Plot for XG Boost Model',
    labels={'Importance': 'Importance', 'Feature': 'Feature'},
)
fig.show()

# Conclusion: Based on the RMSE values, XGBoost with a depth of 2 achieved the best performance in predicting song popularity (RMSE: 8.0541), followed by Random Forest (RMSE: 10.52) and Decision Tree (RMSE: 10.97). This indicates that XGBoost is the most suitable of the three models for this dataset.