In [None]:
import pandas as pd
import json
from urllib.request import urlopen
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_pydot import graphviz_layout

In [None]:
# Location of the Pango alias key (a JSON document) in the pango-designation repository.
ALIAS_KEY_URL = "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json"

# Download and parse the alias key. The context manager guarantees that the
# HTTP response is closed once the body has been read, and json.load parses
# the response body directly instead of re-assembling it line by line.
with urlopen(ALIAS_KEY_URL) as alias_response:
    alias_dict = json.load(alias_response)

def unalias_lineage(input_lineage, alias_dict=alias_dict):
    """Expand the alias stem of a lineage name using ``alias_dict``.

    The stem is the part of the name before the first ".", with any "*"
    characters removed. When ``alias_dict`` maps the stem to a non-empty
    string, that string replaces the stem and the remainder of the name is
    re-attached after a ".".

    Parameters
    ----------
    input_lineage : str
        Lineage name, optionally prefixed with "*" (the marker this notebook
        treats as "withdrawn").
    alias_dict : dict, optional
        Mapping from alias stem to its expansion. Defaults to the alias key
        loaded at module level.

    Returns
    -------
    str
        The expanded name, keeping a leading "*" if the input had one. The
        input is returned unchanged when the stem has no entry, or when the
        entry is empty or not a string.
    """
    # startswith() is safe on an empty string, unlike indexing with [0]
    is_withdrawn = input_lineage.startswith("*")

    stem, _, suffix = input_lineage.partition(".")
    stem = stem.replace("*", "")

    expansion = alias_dict.get(stem)
    if not expansion or not isinstance(expansion, str):
        return input_lineage

    # Only re-attach the suffix when there is one, so that a bare alias stem
    # does not come back with a dangling trailing "."
    unaliased = expansion + "." + suffix if suffix else expansion

    return "*" + unaliased if is_withdrawn else unaliased

In [None]:
# Read the lineage summary JSON (outer keys become the row index) and discard
# the columns that are not used further down.
in_df = pd.read_json(
    "https://github.com/cov-lineages/lineages-website/raw/master/_data/lineage_data.full.json",
    orient="index",
)
in_df = in_df.drop(columns=["Country counts", "Date", "Travel history"])

# Alias-expanded form of every lineage name
in_df["Unaliased"] = in_df["Lineage"].apply(unalias_lineage)

# Swap underscores for U+00A0 (no-break space) and hyphens for U+2011
# (non-breaking hyphen) -- presumably so these cells do not wrap when the
# table is rendered; confirm against wherever the table is displayed.
in_df["Countries"] = in_df["Countries"].apply(
    lambda countries: countries.replace("_", "\u00A0")
)
for date_column in ("Earliest date", "Latest date"):
    in_df[date_column] = in_df[date_column].apply(
        lambda date_text: date_text.replace("-", "\u2011")
    )

# Fix the column selection and order
column_order = [
    "Lineage",
    "Unaliased",
    "Countries",
    "Earliest date",
    "Latest date",
    "Number designated",
    "Number assigned",
    "Description",
]
in_df = in_df[column_order]

in_df

In [None]:
# Withdrawn lineages are the ones whose name starts with "*"
starts_with_marker = in_df["Lineage"].str.startswith("*")
list_of_withdrawn_lineages = list(in_df.loc[starts_with_marker, "Lineage"])

# Slim table holding only the withdrawn lineages
withdrawn_df = in_df.loc[
    in_df["Lineage"].isin(list_of_withdrawn_lineages),
    ["Lineage", "Unaliased", "Description"],
]

# Remove the withdrawn lineages from the working table, together with any row
# whose name equals a withdrawn name with the "*" stripped off
names_without_marker = [
    name.replace("*", "") for name in list_of_withdrawn_lineages
]
is_excluded = (
    in_df["Lineage"].isin(list_of_withdrawn_lineages)
    | in_df["Lineage"].isin(names_without_marker)
)
in_df = in_df.loc[~is_excluded]

In [None]:
# Keep only names nested below B.1.1.529; the trailing "." in the prefix
# means a row named exactly "B.1.1.529" is not kept.
below_b11529 = in_df["Unaliased"].str.startswith("B.1.1.529.")
in_df = in_df[below_b11529]

In [None]:
# Name used for any lineage whose direct parent is not present in the table
ROOT_LINEAGE = "B.1.1.529"

# Work on an independent copy. in_df is the result of boolean filtering, so
# adding columns through a plain alias triggers SettingWithCopyWarning and
# also changes in_df itself.
unalias_df = in_df.copy()

# Nesting depth of each lineage, measured as the number of "." separators
unalias_df["period_count"] = unalias_df["Unaliased"].apply(
    lambda name: name.count(".")
)

# Unaliased name of the direct parent: everything before the last "."
unalias_df["parent"] = unalias_df["Unaliased"].apply(
    lambda name: name.rpartition(".")[0] if "." in name else ROOT_LINEAGE
)

# Look up the aliased name of each parent by joining the table against its
# own (Lineage, Unaliased) pairs. The left join keeps every lineage, including
# those whose parent has no row of its own.
parent_lookup = unalias_df[["Lineage", "Unaliased"]].rename(
    {"Lineage": "realiased_parent", "Unaliased": "ua_lookup"}, axis=1
)
unalias_df = pd.merge(
    unalias_df,
    parent_lookup,
    left_on="parent",
    right_on="ua_lookup",
    how="left",
).drop("ua_lookup", axis=1)

# Lineages whose parent was not found are attached directly to the root
unalias_df["realiased_parent"] = unalias_df["realiased_parent"].fillna(ROOT_LINEAGE)

unalias_df


In [None]:
# Directed graph with one edge per table row, pointing from the (aliased)
# parent name to the lineage name
test = nx.DiGraph()
parent_child_pairs = unalias_df[["realiased_parent", "Lineage"]].itertuples(
    index=False, name=None
)
test.add_edges_from(parent_child_pairs)

# Very large canvas: 100 x 100 inches at 80 dpi
fig = plt.figure(num=1, figsize=(100, 100), dpi=80)

# Node positions from Graphviz's "twopi" program, with B.1.1.529 as the root
pos = graphviz_layout(test, prog="twopi", root="B.1.1.529")

nx.draw_networkx(test, pos=pos)
