4-ON-4 : The 2024 Power Ranking by [dyl_m](https://www.youtube.com/playlist?list=PLOMUdQFdS-XNqUpFzE89aHgwn0wrBidyG)
----------------------------------------

## Dependencies

In [None]:
import json
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import MinMaxScaler

## Functions

In [None]:
def load_data(file_path: str, category: str = None,
              categories_path: str = '../data/stats_category.json') -> pd.DataFrame:
    """Load tracks data as a pandas DataFrame, optionally with the statistics of one category.

    :param file_path: path to the JSON file to parse (a list of track records)
    :param category: name of the stats category to select; when empty, only the identifiers and the general
        information are returned
    :param categories_path: path to the JSON file mapping each category name to its stat names (only read
        when a category is requested)
    :return: pandas DataFrame with the Songstats identifiers, the general information and, if requested, the
        selected statistics (missing values replaced by 0).
    :raises KeyError: if ``category`` is not defined in the categories file.
    """
    with open(file_path, 'r', encoding='utf-8') as j_file:
        json_data = json.load(j_file)

    general_data_keys = ('artist_list', 'genre', 'label', 'title')
    renaming = {'s_id': 'songstats_id', 's_title': 'songstats_title'}

    ids = pd.DataFrame([dict(field['songstats_identifiers']) for field in json_data]).rename(columns=renaming)

    general_data = pd.DataFrame(
        [{key: value for key, value in field.items() if key in general_data_keys} for field in json_data])

    if not category:
        return pd.concat((ids, general_data), axis=1)

    # The categories file is only needed once a category has actually been requested
    with open(categories_path, 'r', encoding='utf-8') as c_file:
        stats_categories = json.load(c_file)

    if category not in stats_categories:
        raise KeyError(f"Unknown stats category {category!r}; available: {sorted(stats_categories)}")

    wanted_stats = stats_categories[category]

    # A stat missing for a track becomes NaN in the DataFrame, then 0
    stats_selected = pd.DataFrame(
        [{key: value for key, value in field['data'].items() if key in wanted_stats}
         for field in json_data]).fillna(0)

    return pd.concat((ids, general_data, stats_selected), axis=1)

In [None]:
def rescale_stats(df: pd.DataFrame, other_fields: list) -> pd.DataFrame:
    """Rescale every statistic column to the [0, 100] range for better comparison.

    The input DataFrame is left untouched: the rescaling is applied to a copy, which is returned.

    :param df: input pandas DataFrame
    :param other_fields: columns to leave as they are (not rescaled)
    :return: copy of the data where every column not listed in ``other_fields`` is min-max rescaled.
    """
    rescaled = df.copy()
    stat_columns = rescaled.columns.difference(other_fields)

    # Nothing to rescale (no stat column or no row): the scaler would raise on an empty array
    if stat_columns.empty or rescaled.empty:
        return rescaled

    rescaled[stat_columns] = MinMaxScaler(feature_range=(0, 100)).fit_transform(rescaled[stat_columns])

    return rescaled


In [None]:
def compute_power(df: pd.DataFrame, other_fields: list) -> "tuple[pd.DataFrame, float]":
    """Compute the power ranking of a stats category as a weighted mean of its (rescaled) stats.

    Each stat is weighted by the proportion of rows for which it has a non-zero value, so sparsely
    recorded stats count less. When no stat has any non-zero value, the power is 0 for every row.

    :param df: input pandas DataFrame
    :param other_fields: columns that are not stats (ignored by the computation)
    :return: data with the power ranking added as a "PWR" column, and the category weight (mean of the
        stat weights).
    """
    stats = df[df.columns.difference(other_fields)]

    # "prw" stand for "Power Ranking Weight" : proportion of data available for each stat
    prw = stats.astype(bool).sum(axis=0) / df.shape[0]
    total_weight = prw.sum()

    if total_weight > 0:
        pr = stats.dot(prw) / total_weight
    else:
        # No data at all in this category: avoid a 0 / 0 division that would fill the column with NaN
        pr = pd.Series(0.0, index=df.index)

    category_weight = float(total_weight / prw.shape[0]) if prw.shape[0] else 0.0

    # Name the series explicitly instead of relying on the implicit column label 0
    pr_results = pd.concat((df, pr.rename("PWR")), axis=1)

    return pr_results, category_weight

In [None]:
def plot_bar(dataframe: pd.DataFrame, x: str, y: str, title: str = None, x_label: str = None,
             y_label: str = None, col_range: tuple = None):
    """Compute a Bar Chart using Plotly express
    :param dataframe: dataframe as input
    :param x: x value for bar chart (labels)
    :param y: y value for bar chart (feature), also used to color the bars
    :param title: title of the chart
    :param x_label: x-axis label, defaults to the name given as ``x``
    :param y_label: y-axis label, defaults to the name given as ``y``
    :param col_range: color range for the chart, passed to Plotly as ``range_color``
    :return: bar chart as plotly "Figure" object.
    """
    # Fall back on the column names when no explicit axis label is given
    if x_label is None:
        x_label = x

    if y_label is None:
        y_label = y

    fig = px.bar(data_frame=dataframe, y=y, x=x,
                 labels={x: x_label, y: y_label},
                 title=title, color=y,
                 width=960, height=540, color_continuous_scale='orrd', range_color=col_range)

    # Bars are ordered by descending total; the color bar is hidden
    fig.update_layout(xaxis={'categoryorder': 'total descending'})
    fig.update_coloraxes(showscale=False)
    return fig

## Data Loading & Pre-treatments

In [None]:
# Identifiers and general information only (no stats category requested)
data = load_data(file_path='../data/data_2024.json')
# These column names are passed later as the fields to leave out of rescaling and power computation
general_field = data.columns.tolist()
data

In [None]:
# Stats by artist: each element of a list-like 'artist_list' value gets its own row
data_art = (data.explode('artist_list', ignore_index=True)
            .rename(columns={'artist_list': 'artist'}))

In [None]:
# Stats by labels: one row per label, rows carrying the '[no labels]' placeholder are dropped
data_lab = (data.explode('label', ignore_index=True)
            .loc[lambda exploded: exploded['label'] != '[no labels]'])

In [None]:
# Stats by genre: one row per genre
data_gen = data.explode('genre', ignore_index=True)

## Basic statistics
### Artists of the year by number of releases


In [None]:
# Number of titles per artist, highest count first
data_art_count = (
    data_art[['artist', 'title']]
    .groupby('artist', as_index=False)
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False, ignore_index=True)
)

data_art_count.describe()

In [None]:
# Ten artists with the most releases; the color range spans the counts of all artists, not only the ten shown
bar_artists_release = plot_bar(
    dataframe=data_art_count.head(10),
    x='artist', x_label='Artists',
    y='n_release', y_label='Number of releases',
    title='Number of release by Artist (Top 10)<br><sup>Remixes included</sup>',
    col_range=(data_art_count['n_release'].min(), data_art_count['n_release'].max()),
)

bar_artists_release.show()


### Labels of the year by number of releases

In [None]:
# Number of titles per label, highest count first
data_lab_count = (
    data_lab[['label', 'title']]
    .groupby('label', as_index=False)
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False, ignore_index=True)
)

data_lab_count.describe()

In [None]:
# Ten labels with the most releases; the color range spans the counts of all labels, not only the ten shown
bar_label_release = plot_bar(
    dataframe=data_lab_count.head(10),
    x='label', x_label='Labels',
    y='n_release', y_label='Number of releases',
    title='Number of release by Label (Top 10)',
    col_range=(data_lab_count['n_release'].min(), data_lab_count['n_release'].max()),
)
bar_label_release.show()

### Genres of the year by number of releases


In [None]:
# Number of titles per genre, highest count first
data_gen_count = (
    data_gen[['genre', 'title']]
    .groupby('genre', as_index=False)
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False, ignore_index=True)
)

data_gen_count.describe()

In [None]:
# Every genre is plotted (no top-N cut)
bar_genre_release = plot_bar(
    dataframe=data_gen_count,
    x='genre', x_label='Music Genres',
    y='n_release', y_label='Number of releases',
    title='Number of release by Genre',
    col_range=(data_gen_count['n_release'].min(), data_gen_count['n_release'].max()),
)
bar_genre_release.show()

## Power Rankings by stats categories
### Loading data

In [None]:
# Same tracks file loaded once per stats category, in the order of the unpacked names
(data_cha, data_eng, data_pla, data_pop,
 data_pro, data_rea, data_sho, data_str) = (
    load_data(file_path='../data/data_2024.json', category=stats_category)
    for stats_category in ('charts', 'engagement', 'playlists', 'popularity',
                           'professional_support', 'reach', 'shorts', 'streams')
)

### Charts
#### Rescale statistics

In [None]:
# Min-max rescale every charts stat to 0-100; the general_field columns are left as they are
data_cha = rescale_stats(data_cha, general_field)
data_cha

#### Weighting by recorded stats

In [None]:
# Weight each charts stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_cha, w_cha = compute_power(data_cha, general_field)  # 'w_cha' is calculated for the final step of the Power Ranking
pr_cha

#### Graphical Representation

In [None]:
# Twenty highest charts powers; the color range spans all tracks, not only the twenty shown
bar_pr_cha = plot_bar(
    dataframe=pr_cha.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Charts Power Ranking (Top 20)',
    col_range=(pr_cha['PWR'].min(), pr_cha['PWR'].max()),
)
bar_pr_cha.show()

### Engagement
#### Rescale statistics

In [None]:
# Min-max rescale every engagement stat to 0-100; the general_field columns are left as they are
data_eng = rescale_stats(data_eng, general_field)
data_eng

#### Weighting by recorded stats

In [None]:
# Weight each engagement stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_eng, w_eng = compute_power(data_eng, general_field)  # 'w_eng' is calculated for the final step of the Power Ranking
pr_eng

#### Graphical Representation

In [None]:
# Twenty highest engagement powers; the color range spans all tracks, not only the twenty shown
bar_pr_eng = plot_bar(
    dataframe=pr_eng.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Engagement Power Ranking (Top 20)',
    col_range=(pr_eng['PWR'].min(), pr_eng['PWR'].max()),
)
bar_pr_eng.show()

### Playlists
#### Rescale statistics

In [None]:
# Min-max rescale every playlists stat to 0-100; the general_field columns are left as they are
data_pla = rescale_stats(data_pla, general_field)
data_pla

#### Weighting by recorded stats

In [None]:
# Weight each playlists stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_pla, w_pla = compute_power(data_pla, general_field)  # 'w_pla' is calculated for the final step of the Power Ranking
pr_pla

#### Graphical Representation

In [None]:
# Twenty highest playlists powers; the color range spans all tracks, not only the twenty shown
bar_pr_pla = plot_bar(
    dataframe=pr_pla.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Playlist Power Ranking (Top 20)',
    col_range=(pr_pla['PWR'].min(), pr_pla['PWR'].max()),
)
bar_pr_pla.show()

### Popularity
#### Rescale statistics

In [None]:
# Min-max rescale every popularity stat to 0-100; the general_field columns are left as they are
data_pop = rescale_stats(data_pop, general_field)
data_pop

#### Weighting by recorded stats

In [None]:
# Weight each popularity stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_pop, w_pop = compute_power(data_pop, general_field)  # 'w_pop' is calculated for the final step of the Power Ranking
pr_pop

#### Graphical Representation

In [None]:
# Twenty highest popularity powers; the color range spans all tracks, not only the twenty shown
bar_pr_pop = plot_bar(
    dataframe=pr_pop.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Popularity Power Ranking (Top 20)',
    col_range=(pr_pop['PWR'].min(), pr_pop['PWR'].max()),
)
bar_pr_pop.show()

### Professional Support
#### Rescale statistics

In [None]:
# Min-max rescale every professional support stat to 0-100; the general_field columns are left as they are
data_pro = rescale_stats(data_pro, general_field)
data_pro

#### Weighting by recorded stats

In [None]:
# Weight each professional support stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_pro, w_pro = compute_power(data_pro, general_field)  # 'w_pro' is calculated for the final step of the Power Ranking
pr_pro

#### Graphical Representation

In [None]:
# Twenty highest professional support powers; the color range spans all tracks, not only the twenty shown
bar_pr_pro = plot_bar(
    dataframe=pr_pro.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Pro. Support Power Ranking (Top 20)',
    col_range=(pr_pro['PWR'].min(), pr_pro['PWR'].max()),
)
bar_pr_pro.show()

### Reach
#### Rescale statistics

In [None]:
# Min-max rescale every reach stat to 0-100; the general_field columns are left as they are
data_rea = rescale_stats(data_rea, general_field)
data_rea

#### Weighting by recorded stats

In [None]:
# Weight each reach stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_rea, w_rea = compute_power(data_rea, general_field)  # 'w_rea' is calculated for the final step of the Power Ranking
pr_rea

#### Graphical Representation

In [None]:
# Twenty highest reach powers; the color range spans all tracks, not only the twenty shown
bar_pr_rea = plot_bar(
    dataframe=pr_rea.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Reach Power Ranking (Top 20)',
    col_range=(pr_rea['PWR'].min(), pr_rea['PWR'].max()),
)
bar_pr_rea.show()

### Shorts
#### Rescale statistics

In [None]:
# Min-max rescale every shorts stat to 0-100; the general_field columns are left as they are
data_sho = rescale_stats(data_sho, general_field)
data_sho

#### Weighting by recorded stats

In [None]:
# Weight each shorts stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_sho, w_sho = compute_power(data_sho, general_field)  # 'w_sho' is calculated for the final step of the Power Ranking
pr_sho

#### Graphical Representation

In [None]:
# Twenty highest shorts powers; the color range spans all tracks, not only the twenty shown
bar_pr_sho = plot_bar(
    dataframe=pr_sho.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Shorts Power Ranking (Top 20)',
    col_range=(pr_sho['PWR'].min(), pr_sho['PWR'].max()),
)
bar_pr_sho.show()

### Streams
#### Rescale statistics

In [None]:
# Min-max rescale every streams stat to 0-100; the general_field columns are left as they are
data_str = rescale_stats(data_str, general_field)
data_str

#### Weighting by recorded stats

In [None]:
# Weight each streams stat by the share of tracks with a non-zero value, then combine them into a 'PWR' column
pr_str, w_str = compute_power(data_str, general_field)  # 'w_str' is calculated for the final step of the Power Ranking
pr_str

#### Graphical Representation

In [None]:
# Twenty highest streams powers; the color range spans all tracks, not only the twenty shown
bar_pr_str = plot_bar(
    dataframe=pr_str.sort_values(by='PWR', ascending=False).head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='Streams Power Ranking (Top 20)',
    col_range=(pr_str['PWR'].min(), pr_str['PWR'].max()),
)
bar_pr_str.show()

## Overall Power Ranking
### Weights by category

I arbitrarily redefine the weights applied to each category. This may be open to criticism, but the goal is to
differentiate the importance of each metric relative to the others. In my opinion, it's important not to
put them on an equal footing. The levels are:

1. **Negligible**: no coefficient applied
2. **Low Importance**: weight multiplied by 2
3. **Average Importance**: weight multiplied by 3
4. **High Importance**: weight multiplied by 4

In [None]:
# Importance multipliers applied to the computed category weights
new_w_cha, new_w_eng = w_cha, w_eng          # negligible: no coefficient
new_w_rea, new_w_sho = w_rea * 2, w_sho * 2  # low importance
new_w_pla, new_w_pro = w_pla * 3, w_pro * 3  # average importance
new_w_pop, new_w_str = w_pop * 4, w_str * 4  # high importance

# Keys are the names given to the per-category power columns; stored as a one-column DataFrame
new_weights_dict = dict(
    pwr_cha=new_w_cha,
    pwr_eng=new_w_eng,
    pwr_pla=new_w_pla,
    pwr_pop=new_w_pop,
    pwr_pro=new_w_pro,
    pwr_rea=new_w_rea,
    pwr_sho=new_w_sho,
    pwr_str=new_w_str,
)

new_weights = pd.DataFrame.from_dict(new_weights_dict, orient='index')
new_weights

### Merging all data

In [None]:
# Give each category's power column its own name (renamed in place)
pr_cha.rename(columns={'PWR': 'pwr_cha'}, inplace=True)
pr_eng.rename(columns={'PWR': 'pwr_eng'}, inplace=True)
pr_pla.rename(columns={'PWR': 'pwr_pla'}, inplace=True)
pr_pop.rename(columns={'PWR': 'pwr_pop'}, inplace=True)
pr_pro.rename(columns={'PWR': 'pwr_pro'}, inplace=True)
pr_rea.rename(columns={'PWR': 'pwr_rea'}, inplace=True)
pr_sho.rename(columns={'PWR': 'pwr_sho'}, inplace=True)
pr_str.rename(columns={'PWR': 'pwr_str'}, inplace=True)

# General fields once (taken from the charts frame), then the power column of every category side by side.
# compute_power appends the power as the last column, so it is picked here by its new name.
all_data = pd.concat(
    [
        pr_cha[general_field + ['pwr_cha']],
        pr_eng[['pwr_eng']],
        pr_pla[['pwr_pla']],
        pr_pop[['pwr_pop']],
        pr_pro[['pwr_pro']],
        pr_rea[['pwr_rea']],
        pr_sho[['pwr_sho']],
        pr_str[['pwr_str']],
    ],
    axis=1,
)

all_data

In [None]:
# Weighted mean of the per-category powers. 'new_weights' is a one-column DataFrame (column 0) indexed by the
# power column names, so the dot product is a one-column DataFrame too, and the division by
# new_weights.sum() aligns on that column 0.
global_pr = all_data[all_data.columns.difference(general_field)].dot(new_weights) / new_weights.sum()

# Column 0 holds the global power: name it 'PWR' and order tracks from the highest power to the lowest
power_ranking = (pd.concat((all_data, global_pr), axis=1)
                 .rename({0: 'PWR'}, axis=1)
                 .sort_values('PWR', ascending=False)
                 .reset_index(drop=True))

# Rounding happens after sorting, so it does not affect the order
power_ranking.PWR = power_ranking.PWR.round(4)
power_ranking

In [None]:
# First twenty rows of the sorted ranking; the color range spans all tracks, not only the twenty shown
bar_power_ranking = plot_bar(
    dataframe=power_ranking.head(20),
    x='songstats_title', x_label='Track',
    y='PWR', y_label='Power',
    title='GLOBAL POWER RANKING 2024 (Top 20)',
    col_range=(power_ranking['PWR'].min(), power_ranking['PWR'].max()),
)

bar_power_ranking.show()

In [None]:
# Export the final ranking (Songstats id, global power and title) as CSV, without the index column
power_ranking[['songstats_id', 'PWR', 'title']].to_csv('../data/power_ranking_2024.csv', index=False, encoding='utf-8')