In [None]:
%load_ext autoreload
%autoreload 2
%aimport utils

import altair as alt
import pandas as pd

from constants import COLUMNS
from utils import apply_upset_theme, get_visualization_subtitle
from web import for_website

### Descriptions
- The input format is identical to "File" in UpSetR-shiny (https://github.com/hms-dbmi/UpSetR-shiny)
    - e.g., {attribute 1, attribute 2, ..., attribute N, set 1, set 2, ..., set M}, where each "set" column contains either 1 or 0, with 1 indicating membership (the '⬤' mark in UpSet)
- Options
    1. Specify sets of interest
    2. Show empty intersections or not
    3. Sorting type: Frequency or Degree
    4. Sorting order: "descending" or "ascending"

In [None]:
# Sets to plot; UpSetAltair draws them in the order given here.
# Alternative list kept from an earlier run: ["Action", "Adventure", "Children", "Comedy"]
SETS_OF_INTEREST = [
    "Shortness of Breath",
    "Diarrhea",
    "Fever",
    "Cough",
    "Anosmia",
    "Fatigue",
]
# Short labels, matched to SETS_OF_INTEREST by position.
ABBR_SETS_OF_INTEREST = [
    "B",
    "D",
    "Fe",
    "C",
    "A",
    "Fa",
]
# NOTE(review): not referenced by any other visible cell -- confirm whether it is still needed.
SHOW_EMPTY_INTERSECTIONS = False
SORT_BY = "Frequency"  # or "Degree"
SORT_ORDER = "ascending"  # or "descending"

# Data

In [None]:
# Read the movies table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/movies.csv")

df.head(n=5)

In [None]:
# Disabled pipeline that would derive the per-intersection table from `df`:
# df["count"] = 0

# f_df = df[SETS_OF_INTEREST + ["count"]]

# f_df = f_df.groupby(SETS_OF_INTEREST).count().reset_index()

# To Test with COVID (TODO: Remove this line and remove comments above before release)
f_df = pd.read_csv(filepath_or_buffer="../data/covid_symptoms.csv")

# Preview the first five rows of the table that is handed to UpSetAltair below.
f_df.head(n=5)

In [None]:
def _prepare_upset_data(data, sets, abbre, sort_order):
    """Reshape the wide intersection table into the long form the charts consume.

    Works on a copy of ``data``. Returns one row per (intersection, set) pair
    with the columns ``intersection_id``, ``count``, ``degree``, ``set``,
    ``is_intersect``, ``set_order`` and ``set_abbre``.
    """
    wide = data.copy()  # never add helper columns to the caller's frame
    wide["intersection_id"] = wide.index
    wide["degree"] = wide[sets].sum(axis=1)
    wide = wide.sort_values(by=["count"], ascending=(sort_order == "ascending"))

    # Wide to long. Restricting value_vars to `sets` keeps any additional
    # column of the input from being treated as a set.
    tidy = pd.melt(
        wide,
        id_vars=["intersection_id", "count", "degree"],
        value_vars=sets,
        var_name="set",
        value_name="is_intersect",
    )

    # The first entry of `sets` gets the highest rank, the last entry gets 1.
    set_order = {name: len(sets) - position for position, name in enumerate(sets)}
    if abbre is None:
        # Fallback label: the first two characters of the set name, upper-cased.
        set_abbre = {name: name[:2].upper() for name in sets}
    else:
        set_abbre = {name: abbre[position] for position, name in enumerate(sets)}

    tidy["set_order"] = tidy["set"].map(set_order)
    tidy["set_abbre"] = tidy["set"].map(set_abbre)
    return tidy


def UpSetAltair(
    data=None,
    sets=None,  # The order of this list is the order in which the sets are drawn.
    abbre=None,
    sort_by="Frequency",
    sort_order="ascending",
    title="Symptoms reported by users of the COVID Symptom Tracker app",
):
    """Build an UpSet-style plot with Altair.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per intersection: a 0/1 column for every name in ``sets`` plus
        a ``count`` column holding the intersection size. The frame is not
        modified.
    sets : list of str
        Names of the set columns, in display order.
    abbre : list of str, optional
        Short labels matched to ``sets`` by position. When omitted, the first
        two characters of each set name (upper-cased) are used.
    sort_by : str
        ``"Frequency"`` orders intersections by ``count``; any other value
        orders them by degree (number of member sets).
    sort_order : str
        ``"ascending"`` or ``"descending"``.
    title : str
        Chart title; defaults to the title that was previously hard-coded.

    Returns
    -------
    The concatenated chart after ``apply_upset_theme`` and the title have been
    applied, or ``None`` (after printing a message) when ``data`` or ``sets``
    is missing.
    """
    if (data is None) or (sets is None):
        print("No data and/or a list of sets are provided")
        return None

    sets = list(sets)

    # Data preprocessing
    data = _prepare_upset_data(data, sets, abbre, sort_order)

    # Style definitions
    color_range = ["#55A8DB", "#3070B5", "#30363F", "#F1AD60", "#DF6234", "#BDC6CA"]
    width = 1000
    main_color = "#3A3A3A"
    n_intersections = data["intersection_id"].nunique()
    # Clamp so that many intersections cannot yield a non-positive bar size and
    # an empty table cannot divide by zero.
    bar_size = max(1, min(30, width / max(n_intersections, 1) - 16))
    circle_size = 200
    sort_field = "count" if sort_by == "Frequency" else "degree"
    x_sort = alt.Sort(field=sort_field, order=sort_order)

    # Plots
    base = alt.Chart(data)

    # Cardinality by intersections
    bar = base.mark_bar(color=main_color, size=bar_size).encode(
        x=alt.X(
            "intersection_id:N",
            axis=alt.Axis(grid=False, labels=False, ticks=False, domain=True),
            sort=x_sort,
            title=None
        ),
        y=alt.Y(
            "max(count):Q",
            axis=alt.Axis(grid=False, tickCount=3, orient='right'),
            title="Intersection Size"
        )
    ).properties(
        width=width
    )
    text = bar.mark_text(
        color=main_color,
        dy=-10,
        size=16
    ).encode(
        text=alt.Text("count:Q", format=".0f")
    )
    cardinality_by_intersections = (bar + text)

    # UpSet intersections
    circle_bg = bar.mark_circle(size=circle_size, opacity=1).encode(
        x=alt.X(
            "intersection_id:N",
            axis=alt.Axis(grid=False, labels=False, ticks=False, domain=False),
            sort=x_sort,
            title=None
        ),
        y=alt.Y(
            "set_order:Q",
            axis=alt.Axis(grid=False, labels=False, ticks=False, domain=False),
            title=None
        ),
        color=alt.value("#E6E6E6")
    ).properties(
        height=250
    )

    # Light background stripe behind every other set row.
    rect_bg = circle_bg.mark_rect(height=30, width=50).transform_filter(
        alt.datum["set_order"] % 2 == 1
    ).encode(
        color=alt.value("#F7F7F7")
    )

    # Dark circles only where the set takes part in the intersection.
    circle = circle_bg.transform_filter(
        alt.datum["is_intersect"] == 1
    ).encode(
        color=alt.value(main_color)
    )

    # Thin vertical bar spanning the lowest to the highest member set.
    line_connection = bar.mark_bar(size=2, color=main_color).transform_filter(
        alt.datum["is_intersect"] == 1
    ).encode(
        y=alt.Y("min(set_order):Q"),
        y2=alt.Y2("max(set_order):Q")
    )

    upset_intersections = (rect_bg + circle_bg + line_connection + circle)

    # Cardinality by sets
    label_bg = base.mark_circle(size=1000).encode(
        y=alt.Y(
            "set_order:Q",
            axis=alt.Axis(grid=False, labels=False, ticks=False, domain=False),
            title=None
        ),
        color=alt.Color(
            "set:N",
            scale=alt.Scale(domain=sets, range=color_range),
            title=None
        ),
        opacity=alt.value(1)
    )
    label = label_bg.mark_text().encode(
        text=alt.Text("set_abbre:N"),
        color=alt.value("white")
    )

    bar_by_sets = label_bg.mark_bar(size=20).transform_filter(
        alt.datum["is_intersect"] == 1
    ).encode(
        x=alt.X(
            "sum(count):Q",
            axis=alt.Axis(grid=False, tickCount=3),
            title="Set Size"
        )
    ).properties(
        width=300
    )

    # Properties
    upsetaltair = alt.vconcat(
        cardinality_by_intersections,
        alt.hconcat(upset_intersections, (label_bg + label), bar_by_sets, spacing=5).resolve_scale(y="shared"),
        spacing=20
    )

    return apply_upset_theme(
            upsetaltair,
            legend_orient="top-left",
            legend_stroke_color=None,
            legend_symbol_size=circle_size*2.5
        ).properties(
            title={
                "text": title,
                # "color": main_color,
                # "subtitle": get_visualization_subtitle(),
                "subtitleColor": "gray"
            }
        )

In [None]:
# Draw the plot using the options chosen in the configuration cell.
UpSetAltair(
    sets=SETS_OF_INTEREST,
    abbre=ABBR_SETS_OF_INTEREST,
    sort_by=SORT_BY,
    sort_order=SORT_ORDER,
    data=f_df.copy(),  # hand over a copy so f_df itself is never modified
)

In [None]:
# Same table, this time ordered by degree from highest to lowest.
UpSetAltair(
    sort_by="Degree",
    sort_order="descending",
    sets=[
        "Shortness of Breath",
        "Diarrhea",
        "Fever",
        "Cough",
        "Anosmia",
        "Fatigue",
    ],
    abbre=[
        "B",
        "D",
        "Fe",
        "C",
        "A",
        "Fa",
    ],
    data=f_df.copy(),  # hand over a copy so f_df itself is never modified
)