In [None]:
import pandas as pd
import plotly_express as px
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

athlete_events = pd.read_csv("../Data/athlete_events.csv")

ski_jumping = athlete_events[(athlete_events["Sport"] == "Ski Jumping")]

skiing_olympics = (
    ski_jumping.groupby(["Team"])["Medal"]
    .count()
    .reset_index(name="Count") # new name for medal column
    .sort_values(["Count"], ascending=False)
)

skiing_olympics.head(5)

# top_countries_medals = ski_jumping[(ski_jumping["Team"] == "Austria") | (ski_jumping["Team"] == "Norway") | (ski_jumping["Team"] == "Finland")]

bronze_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Bronze")]
silver_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Silver")]
gold_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Gold")]

medals_per_country = ski_jumping.groupby("Team")["Medal"].value_counts(dropna=True)
bronze_ski_jumping = bronze_ski_jumping.groupby("Team")["Medal"].value_counts(
    dropna=False
)
silver_ski_jumping = silver_ski_jumping.groupby("Team")["Medal"].value_counts(
    dropna=False
)
gold_ski_jumping = gold_ski_jumping.groupby("Team")["Medal"].value_counts(dropna=False)


def plotly_bar_plot_with_labels_sublabels(
    x, y, title, labels, sublabels,
):

    fig = px.bar(
        x=x,
        y=y,
        barmode="group",  # groups the bars next to eachother instead of stacking on eachother
        labels=labels,
        title=title,
    )
    newnames = sublabels
    # To be able to change the sub titles for 'Antal doser' without changing the data source,
    # you can switch the legendgroups name with a dict and map it onto existing subtitle names.
    # I had to do this since I couldn't change it through 'labels=' like the other titles
    # source: https://stackoverflow.com/questions/64371174/plotly-how-to-change-variable-label-names-for-the-legend-in-a-plotly-express-li
    fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))

    fig.show()


labels_skiing = {
    "value": "Medals won",
    "variable": "Medals",
    "x": "Teams",
}

sublabels_skiing = {
    "wide_variable_0": "Bronze",
    "wide_variable_1": "Silver",
    "wide_variable_2": "Gold",
}

plotly_bar_plot_with_labels_sublabels(
    top_countries_medals.Team.unique(),
    [bronze_ski_jumping, silver_ski_jumping, gold_ski_jumping],
    'Top 3 Countries With Most Medals won in Ski Jumping',
    labels_skiing,
    sublabels_skiing
)

In [None]:
athlete_events['Sport'].unique()

array(['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Speed Skating',
       'Cross Country Skiing', 'Athletics', 'Ice Hockey', 'Swimming',
       'Badminton', 'Sailing', 'Biathlon', 'Gymnastics',
       'Art Competitions', 'Alpine Skiing', 'Handball', 'Weightlifting',
       'Wrestling', 'Luge', 'Water Polo', 'Hockey', 'Rowing', 'Bobsleigh',
       'Fencing', 'Equestrianism', 'Shooting', 'Boxing', 'Taekwondo',
       'Cycling', 'Diving', 'Canoeing', 'Tennis', 'Modern Pentathlon',
       'Figure Skating', 'Golf', 'Softball', 'Archery', 'Volleyball',
       'Synchronized Swimming', 'Table Tennis', 'Nordic Combined',
       'Baseball', 'Rhythmic Gymnastics', 'Freestyle Skiing',
       'Rugby Sevens', 'Trampolining', 'Beach Volleyball', 'Triathlon',
       'Ski Jumping', 'Curling', 'Snowboarding', 'Rugby',
       'Short Track Speed Skating', 'Skeleton', 'Lacrosse', 'Polo',
       'Cricket', 'Racquets', 'Motorboating', 'Military Ski Patrol',
       'Croquet', 'Jeu De Paume', 'Roque', 'Al

In [None]:
    
def medal_distribution_per_sport(sport, df):

    ski_jumping = df[(df["Sport"] == sport)]

    bronze_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Bronze")]
    silver_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Silver")]
    gold_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Gold")]

    bronze_ski_jumping = (
        bronze_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Bronze") # new name for medal column
        .sort_values(["Bronze"], ascending=False)
    )

    silver_ski_jumping = (
        silver_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Silver") # new name for medal column
        .sort_values(["Silver"], ascending=False)
    )

    gold_ski_jumping = (
        gold_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Gold") # new name for medal column
        .sort_values(["Gold"], ascending=False)
    )

    medal_total = [bronze_ski_jumping, silver_ski_jumping, gold_ski_jumping]
    # Added a lambda function in order to merge 3 dataframes with only needed columns
    df_final = ft.reduce(lambda left, right: pd.merge(left, right), medal_total)
    # Creates a sum column with total medal sum, only for sorting purpose
    df_final['Sum'] = df_final.sum(axis=1)
    # sort by 'sum' column - highest to lowest
    df_final.sort_values(by='Sum', ascending=False, inplace=True)

    labels_skiing = {
        "value": "Medals won",
        "variable": "Medals",
        "x": "Teams",
    }

    sublabels_skiing = {
        "wide_variable_0": "Bronze",
        "wide_variable_1": "Silver",
        "wide_variable_2": "Gold",
    }

    fig = px.bar(
        x=df_final["Team"],
        y=[df_final["Bronze"],df_final["Silver"], df_final["Gold"]],
        barmode="group",  # groups the bars next to eachother instead of stacking on eachother
        labels=labels_skiing,
        title=f'Top 3 Countries With Most Medals won in {sport}',
    )
    newnames = sublabels_skiing
    # To be able to change the sub titles for 'Antal doser' without changing the data source,
    # you can switch the legendgroups name with a dict and map it onto existing subtitle names.
    # I had to do this since I couldn't change it through 'labels=' like the other titles
    # source: https://stackoverflow.com/questions/64371174/plotly-how-to-change-variable-label-names-for-the-legend-in-a-plotly-express-li
    fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))

    return fig

medal_distribution_per_sport("Ski Jumping", athlete_events)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



In [12]:

    # import tool for lambda func
    import functools as ft

    athlete_events = pd.read_csv("../Projekt-Databehandling/Data/athlete_events.csv")

    ski_jumping = athlete_events[(athlete_events["Sport"] == sport)]

    bronze_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Bronze")]
    silver_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Silver")]
    gold_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Gold")]

    bronze_ski_jumping = (
        bronze_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Bronze") # new name for medal column
        .sort_values(["Bronze"], ascending=False)
    )

    silver_ski_jumping = (
        silver_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Silver") # new name for medal column
        .sort_values(["Silver"], ascending=False)
    )

    gold_ski_jumping = (
        gold_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Gold") # new name for medal column
        .sort_values(["Gold"], ascending=False)
    )

    medal_total = [bronze_ski_jumping, silver_ski_jumping, gold_ski_jumping]
    df_final = ft.reduce(lambda left, right: pd.merge(left, right), medal_total)
    df_final['Sum'] = df_final.sum(axis=1)

    def plotly_bar_plot_with_labels_sublabels(
        x, y, title, labels, sublabels,
    ):

        fig = px.bar(
            x=x,
            y=y,
            barmode="group",  # groups the bars next to eachother instead of stacking on eachother
            labels=labels,
            title=title,
        )
        newnames = sublabels
        # *** RE-USED SOURCE FROM LABB1 ***
        # To be able to change the sub titles for 'Antal doser' without changing the data source,
        # you can switch the legendgroups name with a dict and map it onto existing subtitle names.
        # I had to do this since I couldn't change it through 'labels=' like the other titles
        # source: https://stackoverflow.com/questions/64371174/plotly-how-to-change-variable-label-names-for-the-legend-in-a-plotly-express-li
        fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))

        fig.show()


    labels_skiing = {
        "value": "Medals won",
        "variable": "Medals",
        "x": "Teams",
    }

    sublabels_skiing = {
        "wide_variable_0": "Bronze",
        "wide_variable_1": "Silver",
        "wide_variable_2": "Gold",
    }

    plotly_bar_plot_with_labels_sublabels(df_final.head(5)
        [df_final.Team.unique()],
        ["Bronze", "Silver", "Gold"],
        f'Top 3 Countries With Most Medals won in {sport}',
        labels_skiing,
        sublabels_skiing
    )

medal_distribution_per_sport("Ski Jumping")

NameError: name 'sport' is not defined

In [None]:
    # import tool for lambda func
    import functools as ft

    athlete_events = pd.read_csv("../Projekt-Databehandling/Data/athlete_events.csv")

    ski_jumping = athlete_events[(athlete_events["Sport"] == sport)]

    bronze_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Bronze")]
    silver_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Silver")]
    gold_ski_jumping = ski_jumping[(ski_jumping["Medal"] == "Gold")]

    bronze_ski_jumping = (
        bronze_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Bronze") # new name for medal column
        .sort_values(["Bronze"], ascending=False)
    )

    silver_ski_jumping = (
        silver_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Silver") # new name for medal column
        .sort_values(["Silver"], ascending=False)
    )

    gold_ski_jumping = (
        gold_ski_jumping.groupby(["Team"])["Medal"]
        .count()
        .reset_index(name="Gold") # new name for medal column
        .sort_values(["Gold"], ascending=False)
    )

    medal_total = [bronze_ski_jumping, silver_ski_jumping, gold_ski_jumping]
    df_final = ft.reduce(lambda left, right: pd.merge(left, right), medal_total)
    df_final['Sum'] = df_final.sum(axis=1)

NameError: name 'pd' is not defined

In [13]:
def treemap_most_x_medals_won(medal, df):
  gold_athlete = df[(df['Medal'] == medal)].dropna()
  #gold_athlete = gold_athlete[['Team', 'Medal']].sort_values(by=['Team'])
  gold_athlete = gold_athlete.groupby(['Team'])['Medal'].count().reset_index(
    name='Count').sort_values(['Count'], ascending=False)

  fig = px.treemap(gold_athlete.head(30), path=['Team'], values='Count',
                    color='Count', hover_data=['Team'], title=f'Top 30 Countries With Most {medal} Medals Won')
  return fig

treemap_most_x_medals_won("Gold", athlete_events)