Based on our prior analyses, we have identified the Random Forest algorithm as the better choice for predicting song popularity, outperforming other models in terms of accuracy. We will now construct a Random Forest algorithm to predict the popularity of songs across various countries. This predictive model will allow to populate our visual map, offering insightful and precise representations of song popularity on a global scale. 

In [2]:
#Import libraries:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import ast
import numpy as np

In [3]:
file_path = '../input/merged_file.csv'

data = pd.read_csv(file_path)
data.head()

Unnamed: 0,track_id,available_markets,popularity_x,genre,artist_name,track_name,popularity_y,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0BRjO6ga9RKCKjfDqeFgWV,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",4,Movie,Henri Salvador,C'est beau de faire un Show,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,0BjC1NfoEOOusryehmNudP,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,0CoSDzoNIKCRs124s9uTVy,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,0Gc6TVm52BwZD07Ki6tIvf,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,0IuslXpMROHdEPvSl1fTQK,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",5,Movie,Fabien Nataf,Ouverture,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


## Data Preprocessing:

The first step is to convert the raw data into a format that can be used by the Random Forest algorithm. To do so, we will be renaming the "popularity_x" column to popularity and dropping the "popularity_y" column corresponding to the predicting variable. 

In [4]:
data= data.rename(columns={"popularity_x": "popularity"})
data= data.drop(['popularity_y'], axis=1)
data.head()

Unnamed: 0,track_id,available_markets,popularity,genre,artist_name,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0BRjO6ga9RKCKjfDqeFgWV,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",4,Movie,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,0BjC1NfoEOOusryehmNudP,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,0CoSDzoNIKCRs124s9uTVy,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,0Gc6TVm52BwZD07Ki6tIvf,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,0IuslXpMROHdEPvSl1fTQK,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",5,Movie,Fabien Nataf,Ouverture,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


We checked for null values and noticed that there were 58004 tracks with no assigned markets and 184 with no names. Since we have over 25161429 rows and have no way of determining to which market a song belongs to or its name, we decided to drop all the NaN values. 

In [5]:
#Check if there exists NaN values:
pd.isnull(data).sum()

#Checks the shape of the data to see if we can drop these NaN values:
data.shape

#Drops the NaN values:
data = data.dropna()

#Check if there's at least one NaN in the whole dataset:
data.isna().any().any()

False

## Encode Data


In [6]:
def encode_data(encoder, categorical_columns, market_data):
    encoded_data = encoder.fit_transform(market_data[categorical_columns]) #Fit and Transform the Encoder
    encoded_columns = encoder.get_feature_names_out(categorical_columns) #Reconstruct the DataFrame
    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)
    data_prepared = market_data.drop(categorical_columns, axis=1).join(encoded_df)
    data_prepared = data_prepared.dropna()
    return data_prepared, encoded_columns

In [7]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_columns = ['genre', 'key', 'mode', 'time_signature']


In [8]:
encoded_data, encoded_columns= encode_data(encoder, categorical_columns, data)

In [9]:
encoded_data

Unnamed: 0,track_id,available_markets,popularity,artist_name,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,...,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_0/4,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
0,0BRjO6ga9RKCKjfDqeFgWV,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",4,Henri Salvador,C'est beau de faire un Show,0.6110,0.389,99373,0.910,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0BjC1NfoEOOusryehmNudP,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0.2460,0.590,137373,0.737,0.000000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0CoSDzoNIKCRs124s9uTVy,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2,Joseph Williams,Don't Let Me Be Lonely Tonight,0.9520,0.663,170267,0.131,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0Gc6TVm52BwZD07Ki6tIvf,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0.7030,0.240,152427,0.326,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0IuslXpMROHdEPvSl1fTQK,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",5,Fabien Nataf,Ouverture,0.9500,0.331,82625,0.225,0.123000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210241,5egnFVvJxjpEXWiap4Z7cH,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",33,Klyne,Ecstacy,0.3350,0.633,215474,0.462,0.000023,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
210242,3nypYvHtEnXd1PaYp9qAvu,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",34,Amy Winehouse,You Know I'm No Good - Ghostface UK Version,0.0197,0.646,202733,0.827,0.000004,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
210243,0eSCQfYaAn7IOyFlVeSs0q,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",32,Autograf,Slow Burn,0.6880,0.711,220800,0.697,0.588000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
210244,5lu9R9rCwIv080hpqJ6rny,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",61,BAYNK,Find You,0.1070,0.888,180006,0.527,0.011500,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


We notice that the available markets are listed as a string representations of lists. For our Random Forest algorithm, we need to be considering each marketplace as a separate instance for prediction. This requires transforming the dataset so that each row corresponds to a single marketplace-song combination. This can be done using a method like explode in pandas. To use this method however, we need to convert the string representations to actual lists.

In [11]:
# Convert string representation to list
denormalized_data= encoded_data.copy()
denormalized_data['available_markets'] = denormalized_data['available_markets'].apply(ast.literal_eval)

# Expand the 'available_markets' list into separate rows
denormalized_data = denormalized_data.explode('available_markets')

denormalized_data.head()

Unnamed: 0,track_id,available_markets,popularity,artist_name,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,...,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_0/4,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
0,0BRjO6ga9RKCKjfDqeFgWV,AR,4,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,0BRjO6ga9RKCKjfDqeFgWV,AU,4,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,0BRjO6ga9RKCKjfDqeFgWV,AT,4,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,0BRjO6ga9RKCKjfDqeFgWV,BE,4,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,0BRjO6ga9RKCKjfDqeFgWV,BO,4,Henri Salvador,C'est beau de faire un Show,0.611,0.389,99373,0.91,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Feature Selection:

For feature selection, we will select the following features which are only based on music theory and not artist/song information: acousticness, danceability, duration_ms, energy, instrumentalness, key, liveliness, loudness, mode, speeciness, tempo, time_signature, and valence. 

In [12]:
#Complete data
complete_data = denormalized_data.copy() 

# Define the features to keep
features = ["available_markets", "acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "liveness", 
            "speechiness", "tempo", "valence", "loudness", "popularity"]

#Also select the encoded columns
features.extend(encoded_columns)

# Select only the specified features from the dataset
data = denormalized_data[features]

data.head()

Unnamed: 0,available_markets,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,speechiness,tempo,valence,...,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_0/4,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
0,AR,0.611,0.389,99373,0.91,0.0,0.346,0.0525,166.969,0.814,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,AU,0.611,0.389,99373,0.91,0.0,0.346,0.0525,166.969,0.814,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,AT,0.611,0.389,99373,0.91,0.0,0.346,0.0525,166.969,0.814,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,BE,0.611,0.389,99373,0.91,0.0,0.346,0.0525,166.969,0.814,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,BO,0.611,0.389,99373,0.91,0.0,0.346,0.0525,166.969,0.814,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Model Training:

We will be using a Random Forest Regressor since our target data is quantitative. Our goal is to predict the most popular song for each available market. To do so, we will iterate over each available market, and fit a random forest model for each one.The performance evaluation of the model will be measured using the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R²). A lower MSE and MAE indicate a better fit of the model and an R² closer to 1 indicates a better fit of the model.

We see that every available markets has over 130 000 datapoints associated to them which allows us to split the dataset and fit a random forest model for each one without worrying about insufficient data.

In [13]:
markets = data["available_markets"].unique()
per_market_count = data.groupby('available_markets').size().reset_index(name='Count')

less_data = per_market_count[per_market_count['Count']<130000]
print(f"Markets with insufficient rows for data analysis- \n{less_data}")

print("All markets have enough rows for data analysis.")

Markets with insufficient rows for data analysis- 
    available_markets   Count
25                 BY  121553
76                 IQ  129532
101                LY  128764
178                VU  129858
All markets have enough rows for data analysis.


We will now do this same process for all market places:

* **<u> One Hot Encoding**:</u> From the dataset, the categorical variables are available_markets, genre, key, mode, and time signature. We will need to encode them into numerical format using one-hot encoding. This creates a binary column for each category.

* **<u>Splitting the Dataset**:</u> We will now split the data into features (X) and target (y). Here, our target variable is the song popularity. 

* **<u>Training of Random Forest**</u>: We will be using a Random Forest Regressor since our target data is quantitative. Our goal is to predict the most popular song for each available market. To do so, we will iterate over each available market, and fit a random forest model for each one.The performance evaluation of the model will be measured using the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R²). A lower MSE and MAE indicate a better fit of the model and an R² closer to 1 indicates a better fit of the model.

* **<u>Feature Importance**</u>: After training, we will now be checking which features are most important in predicting the song's popularity to provide the information in the visualisation map. We will do that by accessing the feature_importances_ attribute of the fitted model. The importance is computed as the (normalized) total reduction of the criterion brought by that feature. 


In [42]:
def decode_data(encoder, categorical_columns, encoded_row, encoded_columns):
    row_df = pd.DataFrame(encoded_row.values.reshape(1, -1), columns=encoded_row.index)
    encoded_row_df = row_df[encoded_columns]
    decoded_data = encoder.inverse_transform(encoded_row_df)
    decoded_df = pd.DataFrame(decoded_data, columns=categorical_columns)
    decoded_market_data = row_df.drop(encoded_columns, axis=1).join(decoded_df)
    return decoded_market_data


In [69]:
def get_most_popular_song(market, predictions, X_test, complete_data, feature_importances=None, encoder= None, categorical_columns= [], encoded_columns= []):
    most_popular_idx = np.argmax(predictions)
    print(most_popular_idx)
    most_popular_song = X_test.iloc[most_popular_idx].copy()
    matching_rows = complete_data[(complete_data[list(most_popular_song.index)].eq(most_popular_song)).all(axis=1)]
    if len(matching_rows)==0:
        return []
    most_popular_song = matching_rows.iloc[0].copy()
    decoded_data= decode_data(encoder, categorical_columns, most_popular_song, encoded_columns)
    decoded_data = decoded_data.iloc[0]
    decoded_data['available_markets'] = market
    print(decoded_data)
    
    decoded_data['feature_importance'] = feature_importances
    return decoded_data

In [50]:
def get_train_test_data(market):
    market_data = data[data["available_markets"] == market]
    market_data.reset_index(drop=True, inplace=True)
    market_data= market_data.drop(['available_markets'], axis=1)
    
    #Split the data into train and test sets:
    y = market_data["popularity"]
    X = market_data.drop(["popularity"], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 42)
    return X_train, X_test, y_train, y_test

In [51]:
def get_final_prediction(market, final_predictions=[]):
    X_train, X_test, y_train, y_test = get_train_test_data(market)

    #Train the model:
    model = RandomForestRegressor(random_state=42) 
    model.fit(X_train, y_train)

    #Make predictions:
    predictions = model.predict(X_test)
    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"For market: {market}")

    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r2}")

    #Feature importances:
    importances = model.feature_importances_
    feature_names = X_train.columns

    feature_importances = dict(zip(feature_names, importances))
    sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)
    
    final_predictions.append(get_most_popular_song(market, predictions, X_test, complete_data, sorted_importances, encoder, categorical_columns, encoded_columns))

    return final_predictions
    

In [70]:
final_predictions=[]

In [80]:
for market in markets[5:10]:
    final_predictions= get_final_prediction(market, final_predictions)

print(final_predictions)
pd.DataFrame(final_predictions).to_csv('../outputs/final_predictions.csv', index=False) 

For market: AU
Mean Squared Error: 146.1895112309048
Mean Absolute Error: 9.399691239661642
R-squared: 0.6347769383527415
8565
track_id             7IsEXPk6qqt30FfQv4SZMa
available_markets                        AU
popularity                               66
artist_name                        Fastball
track_name                          The Way
acousticness                          0.154
danceability                          0.653
duration_ms                          257093
energy                                0.874
instrumentalness                   0.000014
liveness                              0.307
loudness                              -5.66
speechiness                          0.0341
tempo                                120.84
valence                               0.902
genre                                   Pop
key                                      F#
mode                                  Minor
time_signature                          4/4
Name: 0, dtype: object


In [79]:
markets

array(['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR',
       'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR',
       'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY',
       'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL',
       'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB',
       'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA',
       'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO',
       'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK',
       'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG',
       'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM',
       'FJ', 'GM', 'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR',
       'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'WS',
       'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL',
       'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'K

### Convert feature importance encoded features to decoded ones and normalize int or float cols

In [2]:
from collections import defaultdict

def decode_cols(feature_str):
    feature_importance_str = feature_str
    feature_importance_list = ast.literal_eval(feature_importance_str)

    decoded_columns_sum = defaultdict(float)
    for feature, importance in feature_importance_list:
        decoded_column = feature.split('_')[0]  # Extract the decoded column name
        decoded_columns_sum[decoded_column] += importance
    sorted_decoded_columns_sum = dict(sorted(decoded_columns_sum.items(), key=lambda item: item[1], reverse=True))
    return sorted_decoded_columns_sum

In [3]:
def normalize_cols(df):
    int_cols = df.select_dtypes(include=['int','float']).columns
    df[int_cols] = (df[int_cols] - df[int_cols].min()) / (df[int_cols].max() - df[int_cols].min())
    return df

In [4]:
import pandas as pd
import ast
file_path = '../outputs/final_predictions.csv'

final_predictions_df = pd.read_csv(file_path)
final_predictions_df.head()
final_predictions_df['feature_importance'] = final_predictions_df['feature_importance'].apply(lambda x: decode_cols(x))
df= normalize_cols(final_predictions_df)
df.to_csv('../outputs/normalised_final_predictions.csv', index= False)