## Visualizing Multivariate Categorical Data using Plotly
### Using the example of the famous mushroom dataset

In [1]:
## libraries

# graphing
from dash import Dash, dcc, html, Input, Output, no_update
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# frameworks
import pandas as pd
import numpy as np

# handling images
import cv2
import glob

# displaying all columns of dataset
pd.set_option('display.max_columns', None)

In [2]:
## data

# Location of the mushroom CSV on disk - replace this path with your own file path
file_path = r"C:\Users\Nutzer\Desktop\Projects\Mushroom_Project\Data\mushrooms.csv"

# Load the CSV into a pandas dataframe ("?" entries are read as missing values)
# and give the target column "class" the more descriptive name "is-edible"
df = pd.read_csv(file_path, na_values="?").rename(columns={"class": "is-edible"})

# keep the column labels as they are right after loading
df_columns = df.columns

In [3]:
## value replacement - increase readability

# Value dictionaries: each one translates the single-letter codes of one column
# into readable labels.
dict_is_edible = {"p" : "poisonous", "e" : "edible"}
dict_cap_shape = {"b" : "bell", "c" : "conical", "x":"convex", "f" : "flat", "k" : "knobbed", "s" : "sunken"}
dict_cap_surface = {"f" : "fibrous", "g" : "grooves", "y" : "scaly", "s" : "smooth"}
dict_cap_color = {"n" : "brown", "b" : "buff", "c" : "cinnamon", "g" : "gray", "r" : "green", "p" : "pink", "u" : "purple", "e" : "red", "w" : "white", "y" : "yellow" }
dict_bruises = {"t" : "bruises", "f" : "no" }
dict_odor = {"a" : "almond", "l" : "anise", "c" : "creosote", "y" : "fishy", "f" : "foul", "m" : "musty", "n" : "none", "p" : "pungent", "s" : "spicy" }
dict_gill_attachment = {"a" : "attached", "d" : "descending", "f" : "free", "n" : "notched"}
dict_gill_spacing = {"c" : "close", "w" : "crowded", "d" : "distant"}
dict_gill_size = {"b" : "broad", "n" : "narrow"}
dict_gill_color = {"k" : "black", "n" : "brown", "b" : "buff", "h" : "chocolate", "g" : "gray", "r" : "green", "o" : "orange", "p" : "pink", "u" : "purple", "e" : "red", "w" : "white", "y" : "yellow"}
dict_stalk_shape = {"e" : "enlarging", "t" : "tapering"}
dict_stalk_root = {"b" : "bulbous", "c" : "club", "u" : "cup", "e" : "equal", "z" : "rhizomorphs", "r" : "rooted"}
dict_stalk_surface_abov_ring = {"f" : "fibrous", "y" : "scaly", "k" : "silky", "s" : "smooth"}
dict_stalk_surface_below_ring = {"f" : "fibrous", "y" : "scaly", "k" : "silky", "s" : "smooth"}
dict_stalk_color_above_ring = {"n" : "brown", "b" : "buff", "c" : "cinnamon", "g" : "gray", "o" : "orange", "p" : "pink", "e" : "red", "w" : "white", "y" : "yellow"}
dict_stalk_color_below_ring = {"n" : "brown", "b": "buff", "c" : "cinnamon", "g": "gray", "o" : "orange", "p" : "pink", "e" : "red", "w" : "white", "y" : "yellow"}
dict_veil_type = {"p" : "partial", "u" : "universal"}
dict_veil_color = {"n" : "brown" , "o" : "orange", "w" : "white", "y" : "yellow"}
dict_ring_number = {"n" : "none", "o" : "one", "t" : "two"}
dict_ring_type = {"c" : "cobwebby", "e" : "evanescent", "f" : "flaring", "l" : "large", "n" : "none", "p" : "pendant", "s" : "sheathing", "z" : "zone"}
dict_spore_print_color = {"k" :"black", "n" : "brown", "b" : "buff", "h" : "chocolate", "r" : "green", "o" : "orange", "u" : "purple", "w" : "white", "y" : "yellow"}
dict_population = {"a" : "abundant", "c" : "clustered", "n" : "numerous", "s" : "scattered", "v" : "several", "y" : "solitary" }
dict_habitat = {"g" : "grasses","l" : "leaves", "m" : "meadows", "p" : "paths", "u" : "urban", "w" : "waste", "d" : "woods" }

# List of the dictionaries above. The order matters: entry i is paired with
# the i-th label in df_columns by the loop below.
column_dict_names = [
    dict_is_edible, dict_cap_shape, dict_cap_surface,
    dict_cap_color, dict_bruises, dict_odor,
    dict_gill_attachment, dict_gill_spacing, dict_gill_size,
    dict_gill_color, dict_stalk_shape, dict_stalk_root,
    dict_stalk_surface_abov_ring, dict_stalk_surface_below_ring, dict_stalk_color_above_ring,
    dict_stalk_color_below_ring, dict_veil_type, dict_veil_color,
    dict_ring_number, dict_ring_type, dict_spore_print_color,
    dict_population, dict_habitat,
]

# Walk through the column labels and the dictionaries simultaneously and decode each column.
for column_name, dict_name in zip(df_columns, column_dict_names):
    # Assign the decoded column back to the dataframe. Calling
    # replace(..., inplace=True) on df[column_name] is a chained in-place
    # operation on a column selection: with pandas Copy-on-Write it does not
    # update df, and pandas 2.2 already warns about it.
    df[column_name] = df[column_name].replace(dict_name)

In [32]:
# show the first five rows of the decoded dataframe (five is the default of head)
df.head()

Unnamed: 0,is-edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,is-edible-bool
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban,0
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses,1
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows,1
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban,0
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,tapering,equal,smooth,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses,1


### 1 Dimension
#### todo: make two plots next to each other: one for edible, one for not edible

In [5]:
## dataframe creation - for plotting

# Count the rows of each population category and order the result by category name
population_df = (
    df.groupby(['population'])
    .size()
    .reset_index(name='Counts')
    .sort_values(by='population')
)

# show the resulting dataframe
population_df

Unnamed: 0,population,Counts
0,abundant,384
1,clustered,340
2,numerous,400
3,scattered,1248
4,several,4040
5,solitary,1712


In [21]:
## Creating a pie chart

# Sector labels: the distinct values of the "population" column of population_df
labels = population_df["population"].unique()

# Sector sizes: the row count stored for each population category
values = population_df["Counts"]

# Custom list of colors for the pie chart.
# As many colors are listed as there are unique populations. This is not mandatory,
# but it avoids a single color being used for several sectors.
earth_colors = [
    'rgb(210,180,140)',
    'rgb(218,165,32)',
    'rgb(139,69,19)',
    'rgb(175, 51, 21)',
    'rgb(35, 36, 21)',
    'rgb(188,143,143)',
]

# Define the figure for the dimension: population
fig = go.Figure(
    data=[
        go.Pie(
            labels=labels,
            values=values,
            # pull explodes sectors out of the center; given as a fraction of the pie radius
            pull=[0, 0, 0.07, 0.08, 0.02, 0.2],
            # one color per sector, taken from earth_colors in order
            marker_colors=earth_colors,
        )
    ]
)

# Update the layout to show a title (spelling of "Population" corrected)
fig.update_layout(title_text="Mushroom Population")

# display the figure
fig.show()

In [7]:
## dataframe creation - for plotting

def _count_by_population(edibility):
    """Return the per-population row counts of ``df`` for one 'is-edible' value.

    The result has the columns 'population' and 'Counts' and is sorted by
    'population'.
    """
    subset = df.loc[df['is-edible'] == edibility]
    counts = subset.groupby(['population']).size()
    return counts.reset_index(name='Counts').sort_values(by=['population'])


# counts for the rows where 'is-edible' == "edible"
edible_population_df = _count_by_population("edible")

# counts for the rows where 'is-edible' == "poisonous"
poisonous_population_df = _count_by_population("poisonous")

# unique population values of each dataframe, stored as arrays
labels_edible_population = edible_population_df['population'].unique()
labels_poisonous_population = poisonous_population_df['population'].unique()

# the matching counts of each dataframe
values_edible_population = edible_population_df['Counts']
values_poisonous_population = poisonous_population_df['Counts']

In [8]:
## Creating two pie charts

# Create subplots: use 'domain' type for Pie subplots
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
)

# custom colors for the plot
earth_colors = [
    'rgb(210,180,140)',
    'rgb(218,165,32)',
    'rgb(139,69,19)',
    'rgb(175, 51, 21)',
    'rgb(35, 36, 21)',
    'rgb(188,143,143)',
]

# Fix one color per population label. Marker colors are applied to the sectors
# by position, and the two subsets need not contain the same population
# categories, so passing the raw list to both pies could give the same
# category different colors in the two charts.
all_population_labels = sorted(
    set(labels_edible_population) | set(labels_poisonous_population)
)
population_color_map = {
    label: earth_colors[position % len(earth_colors)]
    for position, label in enumerate(all_population_labels)
}

# create the trace of the first pie chart subplot (edible mushrooms)
# (the original referenced an undefined name `cafe_colors` here)
fig.add_trace(
    go.Pie(
        labels=labels_edible_population,
        values=values_edible_population,
        name="Edible Mushroom",
        marker_colors=[population_color_map[label] for label in labels_edible_population],
    ),
    1, 1,
)

# create the trace of the second pie chart subplot (poisonous mushrooms)
fig.add_trace(
    go.Pie(
        labels=labels_poisonous_population,
        values=values_poisonous_population,
        name="Poisonous Mushroom",
        marker_colors=[population_color_map[label] for label in labels_poisonous_population],
    ),
    1, 2,
)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

# adapt the layout of the chart for readability
fig.update_layout(
    title_text="Mushroom Population by Edibility",
    # Add annotations in the center of the donut pies.
    annotations=[
        dict(text='Edible', x=0.18, y=0.5, font_size=17, showarrow=False),
        dict(text='Poisonous', x=0.82, y=0.5, font_size=17, showarrow=False),
    ],
)

# display the figure
fig.show()

## 2 Dimensions

#### todo: show hover picture

In [10]:
## define colors used through the remaining part of the notebook
medimumvioletred = '#C71585'
seagreen = '#2E8B57'

## Creating bar chart

# Build the histogram and chain the layout / axis updates onto it
fig = (
    px.histogram(
        df,
        x="cap-shape",
        # feature by which the bars are colored
        color="is-edible",
        # show the bar values as text on the bars
        text_auto=True,
        # normalise the counts to probabilities instead of raw counts
        histnorm="probability",
        # colors for the values of "is-edible"
        # NOTE(review): assigned in the order the values occur in the data - confirm
        color_discrete_sequence=[medimumvioletred, seagreen],
        labels={"cap-shape": "Mushroom Cap-Shape"},
    )
    .update_layout(
        # The chart is normalised (histnorm="probability"), so title and axis
        # say "relative" rather than "absolute" frequencies.
        title={
            "text": "Mushroom Relative Frequencies: Cap-Shape - Edibility",
            "x": 0.5,
        },
        yaxis_title="Relative Frequency",
    )
    .update_xaxes(
        # order the x axis categories alphabetically
        categoryorder='category ascending'
    )
)

# display the figure
fig.show()

## 3 Dimensions

In [11]:
## Creating bar chart

# Stacked histogram of absolute counts: cap shape on the x axis, edibility as
# color and gill size as bar pattern.
fig = (
    px.histogram(
        df,
        x="cap-shape",
        color="is-edible",
        pattern_shape="gill-size",
        color_discrete_sequence=[medimumvioletred, seagreen],
        barmode="relative",
        # show the bar values as text on the bars
        text_auto=True,
        labels={
            "cap-shape": "Mushroom Cap-Shape & Gill Size",
        },
    )
    .update_traces(hoverinfo="all")
    .update_layout(
        # No normalisation is applied in this cell, so the title names absolute
        # counts (it previously said "Percent", matching the y axis of a different chart).
        title={
            "text": "Absolute Counts - Dataset: Cap Shape & Gill Size - Edibility",
            "x": 0.5,
        },
        yaxis_title="Absolute Counts",
    )
    # order the x axis categories alphabetically
    .update_xaxes(categoryorder='category ascending')
)

# display the figure
fig.show()

In [12]:
## Creating bar chart

# title shown above the chart, horizontally centered
percent_title = {
    "text": "Percent - Dataset: Cap Shape & Gill Size - Edibility",
    "x": 0.5,
}

# histogram of cap shape, colored by edibility and patterned by gill size;
# barnorm="percent" scales the bars at each x position to percentages
fig = px.histogram(
    df,
    x="cap-shape",
    color="is-edible",
    pattern_shape="gill-size",
    color_discrete_sequence=[medimumvioletred, seagreen],
    barmode="relative",
    barnorm="percent",
    text_auto=True,
    labels={"cap-shape": "Mushroom Cap-Shape & Gill Size"},
)

# titles for the chart and the y axis
fig.update_layout(title=percent_title, yaxis_title="Percent")

# order the x axis categories alphabetically
fig.update_xaxes(categoryorder='category ascending')

# display the figure
fig.show()

## 4 Dimensions
#### Make visible which column name corresponds to each level of the pie?

In [33]:
## dataframe creation - for plotting

# columns whose value combinations are counted
combination_columns = ['is-edible', 'population', 'habitat', 'bruises']

# one row per occurring combination, with the number of rows in column 'count'
df_combinations = (
    df.groupby(combination_columns)
    .size()
    .reset_index(name='count')
)

# display pandas dataframe
df_combinations

Unnamed: 0,is-edible,population,habitat,bruises,count
0,edible,abundant,grasses,no,384
1,edible,clustered,leaves,no,96
2,edible,clustered,waste,bruises,192
3,edible,numerous,grasses,bruises,128
4,edible,numerous,grasses,no,144
5,edible,numerous,meadows,bruises,128
6,edible,scattered,grasses,bruises,176
7,edible,scattered,grasses,no,528
8,edible,scattered,meadows,bruises,128
9,edible,scattered,paths,bruises,48


In [14]:
## Creating sunburst chart

# hierarchy of the sunburst, from the innermost ring to the outermost ring
sunburst_path = ['is-edible', 'bruises', 'population', 'habitat']

# build the sunburst directly from df, colored by edibility
fig = px.sunburst(
    df,
    path=sunburst_path,
    title="Edibility Mushrooms - bruises, population & habitat",
    color='is-edible',
    color_discrete_sequence=[medimumvioletred, seagreen],
    height=800,
)

# display the figure
fig.show()

## 5 Dimensions
#### todo: Same here, which dimension is which?

In [28]:
## dataframe creation - for plotting

# size of every occurring combination of edibility and the four gill features
gill_group_sizes = df.groupby(
    ['is-edible', 'gill-spacing', 'gill-size', 'gill-color', 'gill-attachment']
).size()

# turn the grouped sizes into a flat dataframe with a 'count' column
df_combinations = gill_group_sizes.reset_index(name='count')

# display the first rows of the pandas dataframe
df_combinations.head(5)

Unnamed: 0,is-edible,gill-spacing,gill-size,gill-color,gill-attachment,count
0,edible,close,broad,black,free,128
1,edible,close,broad,brown,attached,64
2,edible,close,broad,brown,free,624
3,edible,close,broad,gray,free,128
4,edible,close,broad,orange,attached,64


In [30]:
## Creating treemap chart

# Create the treemap figure from the gill-feature combination counts.
# The frame is named df_combinations; the previously referenced
# df_combinations_2 is never defined in the notebook.
fig = px.treemap(
    df_combinations,
    # hierarchy of the treemap, starting with a constant root node "all"
    path=[
        px.Constant("all"),
        'is-edible',
        'gill-spacing',
        'gill-size',
        'gill-color',
        'gill-attachment',
    ],
    # tile sizes are taken from the 'count' column
    values='count',
    color='is-edible',
    color_discrete_map={
        '(?)': 'lightgrey',
        'edible': seagreen,
        'poisonous': medimumvioletred,
    },
)

# adjust the margin of the plot to increase readability
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))

# display the figure
fig.show()

#### Parallel Categories Plot

In [31]:
## Creating parallel categories chart

# numeric edibility flag (edible -> 1, poisonous -> 0), used to color the ribbons
df['is-edible-bool'] = df['is-edible'].map({'edible': 1, 'poisonous': 0})

# Create dimensions
# stalk-shape
stalk_shape_dim = go.parcats.Dimension(
    values=df['stalk-shape'],
    categoryorder='category ascending',
    label="Stalk-Shape",
)
# stalk-root
stalk_root_dim = go.parcats.Dimension(
    values=df['stalk-root'],
    label="Stalk-Root",
)
# stalk-surface-above-ring
stalk_surface_above_ring_dim = go.parcats.Dimension(
    values=df['stalk-surface-above-ring'],
    label="Stalk-Surface-above-Ring",
)
# stalk-surface-below-ring (variable name kept as-is; displayed label spelling corrected)
stalk_surface_bellow_ring_dim = go.parcats.Dimension(
    values=df['stalk-surface-below-ring'],
    label="Stalk-Surface-below-Ring",
)

# is-edible (displayed label spelling corrected)
edible_dim = go.parcats.Dimension(
    values=df['is-edible'],
    label="Is Edible",
    categoryarray=['edible', 'poisonous'],
    ticktext=['edible', 'poisonous'],
)

# Ribbon coloring: the 0/1 edibility flag mapped onto a two-color scale
# (0 -> medimumvioletred, 1 -> seagreen)
color = df['is-edible-bool']
colorscale = [[0, medimumvioletred], [1, seagreen]]

# create figure object
fig = go.Figure(
    data=[
        go.Parcats(
            dimensions=[
                stalk_shape_dim,
                stalk_surface_above_ring_dim,
                stalk_root_dim,
                stalk_surface_bellow_ring_dim,
                edible_dim,
            ],
            line={'color': color, 'colorscale': colorscale},
            hoveron='color',
            hoverinfo='count + probability',
            labelfont={'size': 18, 'family': 'Times'},
            tickfont={'size': 16, 'family': 'Times'},
            arrangement='freeform',
        )
    ]
)

# display the figure
fig.show()