# Dashboard

In [None]:
from dash import Dash, html, dash_table, dcc, callback, Output, Input
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Directory holding the input CSV, relative to the working directory.
# Kept as a plain string (with trailing slash) because the loading code
# builds the file path by string concatenation.
data_path = "./../data/"

## Prétraitement

In [None]:
# Load the Goodreads export. The "genres" converter drops the first and last
# character of the raw field (presumably the surrounding brackets) and splits
# the remainder on commas, so every cell becomes a list of strings.
df = pd.read_csv(data_path + "goodreads_data.csv", converters={"genres": lambda x: x[1:-1].split(",")})

# Only the first three whitespace-separated tokens of "published" are parsed
# as a date; values that cannot be parsed become NaT (errors="coerce").
df["publication_date"] = pd.to_datetime(df["published"].apply(lambda x: " ".join(str(x).split()[:3])), errors="coerce")
df["publication_year"] = df["publication_date"].dt.year.astype("float")

# Formats kept as their own category; every other trailing token is "other".
known_formats = {"hardcover", "paperback", "ebook", "audiobook", "audio", "cd", "kindle"}


def _parse_page_count(pages_format):
    """Return the leading integer of a "pages_format" cell, or NaN.

    Non-string cells (e.g. NaN from an empty CSV field) and empty strings
    give NaN instead of raising.
    """
    tokens = pages_format.split() if isinstance(pages_format, str) else []
    if tokens and tokens[0].isdigit():
        return int(tokens[0])
    return np.nan


def _parse_format(pages_format):
    """Return the lower-cased last token of a "pages_format" cell.

    "Kindle Edition" is first shortened to "Kindle" so that its last token is
    the format name. Anything not in ``known_formats`` (including non-string
    or empty cells) is reported as "other".
    """
    if not isinstance(pages_format, str):
        return "other"
    tokens = pages_format.replace("Kindle Edition", "Kindle").split()
    last_token = tokens[-1].lower() if tokens else ""
    return last_token if last_token in known_formats else "other"


df["n_pages"] = df["pages_format"].apply(_parse_page_count).astype("float")
df["format"] = df["pages_format"].apply(_parse_format).astype("category")

dic_rating = {1: "one_star", 2: "two_stars", 3: "three_stars", 4: "four_stars", 5: "five_stars"}

# Parse the bracketed, comma-separated "ratings" field once per row; the
# first five entries are the counts for 1 to 5 stars.
star_counts = df["ratings"].apply(lambda x: [float(count) for count in x[1:-1].split(",")[:5]])

# Share of all ratings that went to each star level.
for key, value in dic_rating.items():
    df[value] = star_counts.apply(lambda counts: counts[key - 1]) / df["n_ratings"]

# Weighted mean of the star levels, rounded to one decimal.
weighted_star_sum = star_counts.apply(lambda counts: sum([(i + 1) * counts[i] for i in range(5)]))
df["average_rating"] = (weighted_star_sum / df["n_ratings"]).round(1)

In [None]:
# Display (French) labels for the DataFrame columns, keyed by column name.
columns_pretty_name = {
    "title": "Titre",
    "author": "Auteur",
    "price": "Prix",
    "average_rating": "Note moyenne",
    "n_ratings": "Nombre de notes",
    "n_reviews": "Nombre de commentaires",
    "n_pages": "Nombre de pages",
    "publication_year": "Année de publication",
    "format": "Format",
    "language": "Langue",
    "isbn": "ISBN",
}

# Display (French) labels for the per-star columns, keyed by column name.
ratings_pretty_name = {
    "one_star": "1 étoile",
    "two_stars": "2 étoiles",
    "three_stars": "3 étoiles",
    "four_stars": "4 étoiles",
    "five_stars": "5 étoiles",
}

In [None]:
# Full table: the selected columns, renamed to their display labels.
detailed_columns = ["title", "author", "publication_year", "price", "average_rating", "n_ratings", "n_reviews", "format", "language", "n_pages", "isbn"]
df_datatable = df[detailed_columns].rename(columns=columns_pretty_name)

# Preview table: a subset of the (already renamed) columns of the full table.
preview_columns = ["Titre", "Auteur", "Année de publication", "Prix", "Note moyenne", "Nombre de notes", "Format", "ISBN"]
df_datatable_preview = df_datatable[preview_columns]

# Statistical summary: describe() of the quantitative columns, rounded to two
# decimals, with the statistic names moved into a regular column.
quantitative_columns = ["Année de publication", "Prix", "Note moyenne", "Nombre de notes", "Nombre de commentaires", "Nombre de pages"]
numeric_summary = df_datatable[quantitative_columns].describe().round(2).reset_index()
# Replace the eight describe() row names with their French equivalents,
# then give that column an empty header.
numeric_summary["index"] = ["Compte", "Moyenne", "Écart-type", "Minimum", "25%", "50%", "75%", "Maximum"]
df_datatable_numeric = numeric_summary.rename(columns={"index": ""})

# Number of books per format category.
format_counts = df["format"].value_counts()

# Number of books written in English versus everything else ("Other").
language_is_english = df["language"] == "English"
language_counts = df["language"].where(language_is_english, "Other").value_counts()

# Yearly means: price as a Series, average rating as a one-column DataFrame.
books_by_year = df.groupby("publication_year")
prices_time = books_by_year["price"].mean()
ratings_over_time = books_by_year[["average_rating"]].mean()

# One row per (book, genre) pair.
df_genres = df.explode("genres")
# Trim surrounding whitespace, then drop the first and last character of each
# entry (presumably the quote characters around the genre name).
df_genres["genres"] = df_genres["genres"].apply(lambda x: x.strip()[1:-1])
# NOTE: this cell used to call astype("category") on the column without
# storing the result, so the column has always been plain strings. The call is
# removed rather than assigned: a categorical key can make groupby report
# categories that no longer occur (depending on pandas' `observed` default),
# which would change the aggregates below.
#
# Drop the "...more" placeholder and the empty genre. The empty genre arises
# when a book's raw genres field holds nothing between its first and last
# character: the loader's split() then yields a single empty string.
df_genres = df_genres[~df_genres["genres"].isin(["...more", ""])]

genres_groups = df_genres.groupby("genres")
genres_average_prices = genres_groups["price"].mean()
genres_counts = df_genres["genres"].value_counts()
most_common_genres_counts = genres_counts.head(20)

# For every publication year keep the five genres with the most titles
# (years and counts sorted in descending order before taking the head).
df_genres_year = (
    df_genres.groupby(["publication_year", "genres"])
    .count()
    .sort_values(by=["publication_year", "title"], ascending=False)
    .groupby(level=0)
    .head(5)
)
# Count in how many yearly top-5 lists each genre appears.
genres_year, genres_year_counts = np.unique(df_genres_year.index.get_level_values("genres"), return_counts=True)
df_genres_year_counts = (
    pd.DataFrame({"genre": genres_year, "count": genres_year_counts})
    .sort_values(by="count", ascending=False)
    .set_index("genre")
)

# Per-author statistics on the number of ratings received.
authors_ratings_groups = df[["author", "n_ratings"]].groupby("author")
# Mean number of ratings per author, most-rated authors first.
authors_ratings = authors_ratings_groups.mean().sort_values(by="n_ratings", ascending=False)
# Number of rows with a non-null n_ratings per author, aligned on the author index.
authors_ratings["n_books"] = authors_ratings_groups["n_ratings"].count()

# Per-author mean of the average rating and of the number of ratings,
# best-rated authors first, plus the natural log of the mean number of ratings.
authors_average_rating = (
    df.groupby("author")[["average_rating", "n_ratings"]]
    .mean()
    .sort_values(by="average_rating", ascending=False)
)
authors_average_rating["log_n_ratings"] = np.log(authors_average_rating["n_ratings"])

# Despite its name, this holds the mean *number of ratings* (n_ratings) of
# the authors sharing the same n_books value.
average_ratings_per_n_books = authors_ratings.groupby("n_books").mean()

# Per-star share columns, renamed to their display labels. The keys of
# ratings_pretty_name are exactly those column names, in one-to-five order.
star_share_columns = list(ratings_pretty_name)
df_violinplot = df[star_share_columns].rename(columns=ratings_pretty_name)

## Application

In [None]:
# Background colour of the section panels in the page layout.
background_color = "#f5f6f8"
# Paper background colour applied to the Plotly figures via update_layout.
figures_paper_background_color = "#ffffff"

In [None]:
# Create the Dash application and declare its static layout. Components whose
# content depends on user input only get an id here; their data/figure is
# supplied by the callbacks registered on those ids.
app = Dash(__name__)

app.layout = html.Div([
    # Page title and coloured separator.
    html.H1(children="ÉCRITURE D'UN LIVRE POPULAIRE", style={'text-align': 'center'}),
    html.Hr(style={"height": "5px", "color": "#026b9c", "border": "none", "background-color": "#026b9c"}),
    # Dataset panel: explanatory text, a radio selector ("radio-data") and the
    # table ("data-table"), which initially shows the preview frame.
    html.Div([
        html.H2(children="Jeu de Données", style={"text-align": "center", "margin": "0 0 15px 0", "color": "#2a303b"}),
        html.Span("Les livres qui composent le jeu de données ont été extraits à partir du site goodreads. Il est possible de visualiser ci-dessous les principales variables qualitatives et quantitatives pour chaque livre. Les tableaux \"Aperçu\" et \"Détaillé\" \
                  représentent tous les enregistrements de la table, le tableau \"Détaillé\" donnant davantage de caractéristiques sur les livres de la table. Le tableau correspondant au \"Résumé Statistique\" décrit les quantités statistiques des variables quantitatives principales.", 
                  style={"margin-bottom": "15px", "font-style": "italic"}),
        dcc.RadioItems(options=["Aperçu", "Détaillé", "Résumé Statistique"], value="Aperçu", id="radio-data", style={"display": "flex", "justify-content": "space-around", "margin": "20px 0 10px 0"}),
        dash_table.DataTable(data=df_datatable_preview.to_dict("records"), page_size=10, id="data-table", style_table={"overflowX": "auto"}, style_cell={"textAlign": "left", "font-family": "Helvetica", "padding": "0 5px 0 5px"}, style_header={"background-color": "#026b9c", "color": "white", "font-weight": "bold"})
    ], style={"background-color": background_color, "padding": "20px", "border-radius": "10px", "margin-bottom": "30px"}),

    # Two-column grid: accessibility panel on the left, correlations and
    # prices stacked on the right.
    html.Div([
        # Accessibility panel: one dropdown ("dropdown-accessibility") drives
        # both the bar chart and the pie chart.
        html.Div([
            html.H2(children="Accessibilité", style={"text-align": "center", "margin": "0", "color": "#2a303b"}),
                dcc.Dropdown([{"label": "Format", "value": "format"}, {"label": "Langue", "value": "language"}], value="format", id="dropdown-accessibility", style={"margin": "20px 0 10px 0"}),
                dcc.Graph(id="accessibility-histplot", style={"margin-bottom": "30px"}),
                dcc.Graph(id="accessibility-pieplot"),
            ], style={"background-color": background_color, "padding": "20px 20px 0 20px", "border-radius": "10px",}),
        
        html.Div([
            # Correlation panel: the heatmap is built once, here, from
            # DataFrame.corr() on the listed quantitative columns.
            html.Div([
                html.H2(children="Corrélations", style={"text-align": "center", "margin": "0 0 20px 0", "color": "#2a303b"}),
                dcc.Graph(figure=px.imshow(df[["average_rating", "n_ratings", "n_reviews", "price", "publication_year", "n_pages"]].corr(), title="Matrice de corrélation entre les principales variables quantitatives", height=525).update_layout(paper_bgcolor=figures_paper_background_color), id="correlation_matrix"),
            ], style={"background-color": background_color, "padding": "20px", "border-radius": "10px"}),

            # Prices panel: "dropdown-prices" selects which figure is shown
            # in "prices-figure".
            html.Div([
                html.H2(children="Prix", style={"text-align": "center", "margin": "0", "color": "#2a303b"}),
                dcc.Dropdown([{"label": "Distribution", "value": "distribution"}, {"label": "Évolution des prix par année", "value": "time"}, {"label": "Prix par genre", "value": "genres"}], value="distribution", id="dropdown-prices"),
                dcc.Graph(id="prices-figure"),
            ], style={"display": "grid", "grid-template-rows": "1fr 1fr", "background-color": background_color, "padding": "20px", "border-radius": "10px"}),
        ], style={"display": "grid", "grid-template": "1fr 1fr / 1fr", "gap": "10px"})
    ], style={"display": "grid", "grid-template-columns": "1fr 1fr", "gap": "10px", "margin-bottom": "30px"}),

    # Two-column grid: literary genres on the left, ratings on the right.
    html.Div([
        # Genres panel: "dropdown-genres" selects the figure in "genres-figure".
        html.Div([
            html.H2(children="Genres Littéraires", style={"text-align": "center", "margin": "0", "color": "#2a303b"}),
            dcc.Dropdown([{"label": "Genres les plus communs", "value": "mostcommon"}, {"label": "Genres les plus populaires", "value": "popular"}], value="mostcommon", id="dropdown-genres", style={"margin": "20px 0 10px 0"}),
            dcc.Graph(id="genres-figure"),
        ], style={"background-color": background_color, "padding": "20px", "border-radius": "10px"}),

        # Ratings panel: "dropdown-ratings" selects the figure in "ratings-figure".
        html.Div([
            html.H2(children="Notes Attribuées", style={"text-align": "center", "margin": "0", "color": "#2a303b"}),
            dcc.Dropdown([{"label": "Corrélation avec le nombre de livres écrit", "value": "correlation-nbooks"}, {"label": "Corrélation note moyenne et nombre de notes", "value": "correlation-nratings"}, {"label": "Distribution des notes moyennes", "value": "distribution"}, {"label": "Distribution par nombre d'étoiles", "value": "distribution_stars"}, {"label": "Évolution temporelle", "value": "time"}], value="distribution", id="dropdown-ratings", style={"margin": "20px 0 10px 0"}),
            dcc.Graph(id="ratings-figure"),
        ], style={"background-color": background_color, "padding": "20px", "border-radius": "10px"}),
    ], style={"display": "grid", "grid-template-columns": "1fr 1fr", "gap": "10px"}),
], style={"font-family": "Helvetica", "background-color": "white", "padding": "10px", "border-radius": "5px", "margin-bottom": "10px"})


In [None]:
@callback(
    Output("data-table", "data"),
    Input("radio-data", "value"),
)
def update_datatable(mode):
    """Return the records shown in the dataset table for the selected mode.

    ``mode`` is the value of the "radio-data" selector. "Aperçu" maps to the
    preview frame, "Détaillé" to the full frame and "Résumé Statistique" to
    the describe() summary; each is returned as a list of row dictionaries.
    Any other value yields ``None``.
    """
    frames_by_mode = {
        "Aperçu": df_datatable_preview,
        "Détaillé": df_datatable,
        "Résumé Statistique": df_datatable_numeric,
    }
    selected_frame = frames_by_mode.get(mode)
    if selected_frame is None:
        return None
    return selected_frame.to_dict("records")
    
@callback(
    Output("accessibility-histplot", "figure"),
    Input("dropdown-accessibility", "value"),
)
def update_accessibility_histplot(accessibility_column):
    """Build the bar chart of the accessibility panel.

    ``accessibility_column`` is the value of "dropdown-accessibility":
    "format" plots ``format_counts`` and "language" plots ``language_counts``.
    Any other value yields ``None``.
    """
    if accessibility_column == "format":
        counts, x_label, title = format_counts, "Format", "Nombre de livres par format"
    elif accessibility_column == "language":
        counts, x_label, title = language_counts, "Langue", "Nombre de livres par langue d'écriture"
    else:
        return None

    figure = px.bar(
        x=counts.index,
        y=counts.values,
        labels={"x": x_label, "y": "Nombre de livres"},
        title=title,
        height=535,
    )
    return figure.update_layout(paper_bgcolor=figures_paper_background_color)

@callback(
    Output("accessibility-pieplot", "figure"),
    Input("dropdown-accessibility", "value"),
)
def update_accessibility_pieplot(accessibility_column):
    """Build the pie chart of the accessibility panel.

    ``accessibility_column`` is the value of "dropdown-accessibility":
    "format" plots ``format_counts`` and "language" plots ``language_counts``.
    Any other value yields ``None``.
    """
    if accessibility_column == "format":
        counts, title = format_counts, "Répartition des livres par format"
    elif accessibility_column == "language":
        counts, title = language_counts, "Répartition des livres par langue"
    else:
        return None

    figure = px.pie(names=counts.index, values=counts.values, title=title, height=535)
    return figure.update_layout(paper_bgcolor=figures_paper_background_color)
    
@callback(
    Output("prices-figure", "figure"),
    Input("dropdown-prices", "value"),
)
def update_prices_figure(mode):
    """Build the figure of the prices panel.

    ``mode`` is the value of "dropdown-prices": "distribution" gives a box
    plot of ``df["price"]``, "genres" a scatter of the mean price per genre
    and "time" a line of the mean price per publication year. Any other value
    yields ``None``.
    """
    layout_options = {"paper_bgcolor": figures_paper_background_color}

    if mode == "distribution":
        figure = px.box(df, y="price", labels={"y": "Prix (en dollars)"}, title="Distribution des prix des livres en dollars")
        # Only this figure sets its y-axis title through the layout.
        layout_options["yaxis_title"] = "Prix (en dollars)"
    elif mode == "genres":
        figure = px.scatter(
            x=genres_average_prices.index,
            y=genres_average_prices.values,
            labels={"x": "Genre littéraire", "y": "Prix (en dollars)"},
            title="Prix des livres par genre littéraire",
        )
    elif mode == "time":
        figure = px.line(
            x=prices_time.index,
            y=prices_time.values,
            labels={"x": "Année de publication", "y": "Prix (en dollars)"},
            title="Évolution des prix des livres par année",
        )
    else:
        return None

    return figure.update_layout(**layout_options)
    
@callback(
    Output(component_id="genres-figure", component_property="figure"),
    Input(component_id="dropdown-genres", component_property="value")
)
def update_genres_figure(mode):
    """Build the figure of the literary-genres panel.

    ``mode`` is the value of "dropdown-genres":

    * "mostcommon": bar chart of ``most_common_genres_counts``.
    * "popular": bar chart of ``df_genres_year_counts``, i.e. for each genre
      the number of publication years in which it is among the five most
      frequent genres.

    Any other value yields ``None``.
    """
    if mode == "mostcommon":
        return px.bar(x=most_common_genres_counts.index, y=most_common_genres_counts.values, labels={"x": "Genre littéraire", "y": "Nombre de livres"}, title="Les 20 genres littéraires les plus communs").update_layout(paper_bgcolor=figures_paper_background_color)
    elif mode == "popular":
        # The frame is passed in wide form, so Plotly Express names the x
        # values after the index ("genre") and the y values "value"; labels
        # keyed "x"/"y" are not matched. The y values are numbers of years,
        # not numbers of books, so both the labels and the axis titles say so.
        figure = px.bar(
            df_genres_year_counts,
            labels={"genre": "Genres", "value": "Nombre d'années"},
            title="Nombre d'années dans le top 5 des genres les plus communs",
            barmode="group",
        )
        return figure.update_layout(
            paper_bgcolor=figures_paper_background_color,
            xaxis_title="Genres",
            yaxis_title="Nombre d'années",
            showlegend=False,
        )
    
@callback(
    Output(component_id="ratings-figure", component_property="figure"),
    Input(component_id="dropdown-ratings", component_property="value")
)
def update_ratings_figure(mode):
    """Build the figure of the ratings panel.

    ``mode`` is the value of "dropdown-ratings":

    * "correlation-nbooks": scatter of ``average_ratings_per_n_books``, the
      mean number of ratings received by authors, grouped by their number of
      books.
    * "correlation-nratings": scatter of the per-author mean rating against
      the log of the per-author mean number of ratings.
    * "distribution": box plot of ``df["average_rating"]``.
    * "distribution_stars": violin plot of ``df_violinplot``, the share of a
      book's ratings given at each star level.
    * "time": line of the mean average rating per publication year.

    Any other value yields ``None``.

    The titles and y-axis labels of "correlation-nbooks" and
    "distribution_stars" describe the plotted quantity (a number of ratings
    and a share of ratings respectively) rather than an average rating.
    """
    if mode == "correlation-nbooks":
        # Wide-form input: x is the "n_books" index, y is reported as "value".
        figure = px.scatter(
            average_ratings_per_n_books,
            labels={"n_books": "Nombre de livres écrits", "value": "Nombre moyen de notes"},
            title="Nombre moyen de notes reçues par rapport au nombre de livres écrits",
        )
        return figure.update_layout(paper_bgcolor=figures_paper_background_color, xaxis_title="Nombre de livres écrits", yaxis_title="Nombre moyen de notes", showlegend=False)
    elif mode == "correlation-nratings":
        return px.scatter(authors_average_rating, x="log_n_ratings", y="average_rating", title="Note moyenne par rapport au nombre de notes reçues").update_layout(paper_bgcolor=figures_paper_background_color, xaxis_title="Nombre de notes (log)", yaxis_title="Note moyenne")
    elif mode == "distribution":
        return px.box(df, y="average_rating", title="Distribution des notes moyennes").update_layout(paper_bgcolor=figures_paper_background_color, yaxis_title="Note moyenne")
    elif mode == "distribution_stars":
        # Each column holds (ratings at that star level) / (total ratings).
        figure = px.violin(df_violinplot, box=True, title="Distribution de la part des notes par nombre d'étoiles")
        return figure.update_layout(paper_bgcolor=figures_paper_background_color, xaxis_title="Nombre d'étoiles", yaxis_title="Part des notes")
    elif mode == "time":
        # Wide-form input: x is the "publication_year" index, y is "value".
        figure = px.line(
            ratings_over_time,
            labels={"publication_year": "Année de publication", "value": "Note moyenne"},
            title="Évolution de la note moyenne des livres par année",
        )
        return figure.update_layout(paper_bgcolor=figures_paper_background_color, xaxis_title="Année de publication", yaxis_title="Note moyenne", showlegend=False)

In [None]:
# Start the Dash server only when this code is executed as the main program
# (which includes running it from a notebook), not when it is imported.
if __name__ == "__main__":
    app.run()