#4on4 : 2024 Power Ranking
---------------------------

## Dependencies

In [None]:
import json
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import MinMaxScaler

## Functions

In [None]:
def load_data(file_path: str, category: str = None,
              categories_path: str = '../data/stats_category.json') -> pd.DataFrame:
    """Load track data from a JSON file as a pandas Dataframe.

    The result always contains the Songstats identifiers (renamed to
    ``songstats_id`` / ``songstats_title``) and the general fields
    (``artist_list``, ``genre``, ``label``, ``title``). When a category is
    given, the statistics listed for that category are appended as columns.

    :param file_path: path to data as JSON file to parse (a list of track records)
    :param category: category of stats to select; when None or empty, no statistics are added
    :param categories_path: path to the JSON file mapping each category to its statistics;
        it is only read when a category is requested
    :return: pandas Dataframe with all information requested (missing statistics are filled with 0).
    :raises KeyError: if ``category`` is not a key of the categories file.
    """
    with open(file_path, 'r', encoding='utf-8') as j_file:
        json_data = json.load(j_file)

    general_data_keys = ('artist_list', 'genre', 'label', 'title')
    renaming = {'s_id': 'songstats_id', 's_title': 'songstats_title'}

    ids = pd.DataFrame([field['songstats_identifiers'] for field in json_data]).rename(columns=renaming)

    # Iterate over the record's own items so the column order follows the JSON file
    general_data = pd.DataFrame(
        [{key: value for key, value in field.items() if key in general_data_keys} for field in json_data])

    frames = [ids, general_data]

    if category:
        # The categories file is only needed (and only has to exist) when statistics are requested
        with open(categories_path, 'r', encoding='utf-8') as c_file:
            wanted_stats = json.load(c_file)[category]

        stats_selected = pd.DataFrame(
            [{key: value for key, value in field['data'].items() if key in wanted_stats}
             for field in json_data]).fillna(0)
        frames.append(stats_selected)

    return pd.concat(frames, axis=1)

In [None]:
def plot_bar(dataframe: pd.DataFrame, x: str, y: str, title: str, x_label: str = None, y_label: str = None):
    """Compute a Bar Chart using Plotly express

    Bars are colored by their ``y`` value and the color scale legend is hidden.
    The x-axis uses the 'total descending' category order, so the layout
    decides the bar order rather than the row order of the dataframe.

    :param dataframe: dataframe as input
    :param x: x value for bar chart (labels)
    :param y: y value for bar chart (feature)
    :param title: title of the chart
    :param x_label: x-axis label; defaults to the ``x`` column name when None
    :param y_label: y-axis label; defaults to the ``y`` column name when None
    :return: bar chart as plotly "Figure" object.
    """
    # Fall back to the column names when no explicit axis label is provided
    axis_labels = {
        x: x if x_label is None else x_label,
        y: y if y_label is None else y_label,
    }

    fig = px.bar(data_frame=dataframe, y=y, x=x,
                 labels=axis_labels,
                 title=title, color=y,
                 width=960, height=540, color_continuous_scale='orrd')

    fig.update_layout(xaxis={'categoryorder': 'total descending'})
    fig.update_coloraxes(showscale=False)
    return fig

## Data Loading & Pre-treatments

In [None]:
# Identifiers and general fields only: no statistics category is requested here
data = load_data(file_path='../data/data_2024.json')
# Keep the non-statistic column names to tell them apart from stats columns later on
general_field = list(data.columns)
data

In [None]:
# Stats by artist: explode list-like 'artist_list' entries into one row per element,
# then expose the exploded column under the name 'artist'
data_art = (data.explode('artist_list')
            .reset_index(drop=True)
            .rename(columns={'artist_list': 'artist'}))

In [None]:
# Stats by labels: one row per exploded 'label' element, dropping the '[no labels]' placeholder rows
data_lab = (data.explode('label')
            .reset_index(drop=True)
            .loc[lambda frame: frame['label'] != '[no labels]'])

In [None]:
# Stats by genre: explode list-like 'genre' entries into one row per element
data_gen = data.explode(column='genre').reset_index(drop=True)

## Basic statistics

### Artists of the year by number of releases


In [None]:
# Count the titles attached to each artist and rank artists from most to fewest releases
data_art_count = (
    data_art.groupby('artist', as_index=False)['title']
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False)
    .reset_index(drop=True)
)

data_art_count.describe()

In [None]:
# Bar chart of the ten first rows of the artist ranking
bar_artists_release = plot_bar(
    dataframe=data_art_count.head(10),
    x='artist',
    y='n_release',
    title='Number of release by Artist (Top 10)<br><sup>Remixes included</sup>',
    x_label='Artists',
    y_label='Number of releases',
)

bar_artists_release.show()


### Labels of the year by number of releases

In [None]:
# Count the titles attached to each label and rank labels from most to fewest releases
data_lab_count = (
    data_lab.groupby('label', as_index=False)['title']
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False)
    .reset_index(drop=True)
)

data_lab_count.describe()

In [None]:
# Bar chart of the ten first rows of the label ranking
bar_label_release = plot_bar(
    dataframe=data_lab_count.head(10),
    x='label',
    y='n_release',
    title='Number of release by Label (Top 10)',
    x_label='Labels',
    y_label='Number of releases',
)
bar_label_release.show()

### Genre of the year by number of releases


In [None]:
# Count the titles attached to each genre and rank genres from most to fewest releases
data_gen_count = (
    data_gen.groupby('genre', as_index=False)['title']
    .count()
    .rename(columns={'title': 'n_release'})
    .sort_values(by='n_release', ascending=False)
    .reset_index(drop=True)
)

data_gen_count.describe()

In [None]:
# The title announces a "Top 10": restrict the (already sorted) ranking to its ten first rows,
# as done for the artist and label charts
bar_genre_release = plot_bar(data_gen_count.head(10), 'genre', 'n_release', 'Number of release by Genre (Top 10)',
                             'Music Genres', 'Number of releases')
bar_genre_release.show()

## Power Rankings by stats categories

### Loading data

In [None]:
data_pop = load_data(file_path='../data/data_2024.json', category='popularity')

# Rescale every statistic column (i.e. every column that is not a general field) to the 0-100 range
stat_columns = data_pop.columns.difference(general_field)
scaler = MinMaxScaler(feature_range=(0, 100))
data_pop[stat_columns] = scaler.fit_transform(data_pop[stat_columns])

data_pop

### Share of values recorded for each stat

In [None]:
# "prw" stands for "Power Ranking Weight": proportion of non-zero values for each stat
stats_pop = data_pop[data_pop.columns.difference(general_field)]
prw_pop = stats_pop.astype(bool).sum(axis=0) / data_pop.shape[0]
# Weighted average of the stats of each track, using the weights above
pr = stats_pop.dot(prw_pop) / prw_pop.sum()

pr_pop = pd.concat((data_pop, pr.rename("PWR_POP")), axis=1)
pr_pop

In [None]:
# Bar chart of the twenty tracks with the highest popularity power ranking
bar_pr_pop = plot_bar(
    dataframe=pr_pop.sort_values('PWR_POP', ascending=False).head(20),
    x='songstats_title',
    y='PWR_POP',
    title='Popularity Power Ranking (Top 20)',
    x_label='Track',
    y_label='Power',
)
bar_pr_pop.show()