In [None]:
from Bio import SeqIO
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import time
import datetime
import pickle
from copy import deepcopy
import re

In [None]:
# Directory holding the input data.  The trailing slash matters: the file
# paths below are formed by plain string concatenation.
datadir = "/storage/bjarke/GenBank/"
# Alignment (FASTA) and per-sequence metadata (TSV) inside that directory.
fastafile = f"{datadir}aligned.fasta"
tsvfile = f"{datadir}metadata.tsv"

In [None]:
# Read the tab-separated metadata table into a pandas DataFrame; the first
# line of the file supplies the column names.
df = pd.read_csv(tsvfile, header=0, sep="\t")

In [None]:
# Display the full metadata table as this cell's output.
df

In [None]:
# Select the sequences to analyse.
# NOTE: the variable is called `df_USA`, but the active selection below is
# the United Kingdom.  Other selections that have been used:
#   df['country'] == "USA"
#   df['region'] == "Europe"
#   df['region'] == "Asia"
in_selected_area = df['country'] == "United Kingdom"
# Keep only sequences sampled from human hosts ...
from_human_host = df['host'] == "Homo sapiens"
# ... and only those whose overall quality-control status is "good".
passes_qc = df['QC_overall_status'] == "good"

df_USA = df.loc[in_selected_area & from_human_host & passes_qc]

print(f"Matched {len(df_USA)} sequences")

In [None]:
# Display the filtered table (the rows that passed the selection above).
df_USA

In [None]:
# A simple way to measure Hamming distance only from the tsv data:

def dist_from_subs(subs0, subs1):
    """Count substitutions that occur in exactly one of two sequences.

    Parameters
    ----------
    subs0, subs1 : str or float
        Comma-separated substitution labels as stored in the metadata
        table.  A missing cell (NaN, which stringifies to ``"nan"``) means
        "no substitutions" and is ignored, consistent with
        ``dist_from_subs_and_dels``.

    Returns
    -------
    int
        Number of labels present in one list but absent from the other.
    """
    def _labels(raw):
        # A missing cell becomes the literal string "nan"; it is not a
        # substitution and must not contribute to the distance.
        return [label for label in str(raw).split(",") if label != "nan"]

    labels0 = _labels(subs0)
    labels1 = _labels(subs1)
    # Sets give O(1) membership tests; iterating the lists keeps the
    # original per-entry counting.
    known0 = set(labels0)
    known1 = set(labels1)

    only_in_0 = sum(1 for label in labels0 if label not in known1)
    only_in_1 = sum(1 for label in labels1 if label not in known0)
    return only_in_0 + only_in_1

def _expand_deletions(dels):
    """Expand a deletions field such as ``"5-7,10"`` into ``[5, 6, 7, 10]``."""
    positions = []
    for token in str(dels).split(","):
        if "-" in token:  # inclusive range "first-last"
            bounds = token.split("-")
            positions.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        elif token != 'nan':  # single position; 'nan' marks a missing cell
            positions.append(int(token))
    return positions


def dist_from_subs_and_dels(subs0, subs1, dels0, dels1):
    """Count substitutions and deleted positions found in only one sequence.

    Parameters
    ----------
    subs0, subs1 : str or float
        Comma-separated substitution labels; a missing cell (NaN) counts
        as "no substitutions".
    dels0, dels1 : str or float
        Comma-separated deleted positions, each either a single integer or
        an inclusive range ``"first-last"``; a missing cell (NaN) counts as
        "no deletions".

    Returns
    -------
    int
        Unshared substitution labels plus unshared deleted positions (each
        deleted position counts once, so a range of k positions adds k).

    Raises
    ------
    ValueError
        If a deletion token is neither ``"nan"`` nor integer-like.
    """
    subs0 = str(subs0).split(",")
    subs1 = str(subs1).split(",")
    d0 = _expand_deletions(dels0)
    d1 = _expand_deletions(dels1)

    # Sets are used for membership only; counting still walks the lists.
    subs0_known, subs1_known = set(subs0), set(subs1)
    d0_known, d1_known = set(d0), set(d1)

    dist = 0
    dist += sum(1 for sub in subs0 if sub != 'nan' and sub not in subs1_known)
    dist += sum(1 for sub in subs1 if sub != 'nan' and sub not in subs0_known)
    dist += sum(1 for d in d0 if d not in d1_known)
    dist += sum(1 for d in d1 if d not in d0_known)
    return dist

def dist_from_nuc(seq1, seq2, verbose=False):
    """Hamming distance between two aligned sequences, counting base changes only.

    Columns at either end where BOTH sequences contain the padding
    character ``"-"`` are removed first.  A position then contributes to
    the distance only if both sequences hold one of G, C, T, A there and
    the two bases differ.

    Parameters
    ----------
    seq1, seq2 : str or str-convertible
        The aligned sequences; they are expected to have equal length.
    verbose : bool, optional
        Print the untrimmed length, every compared base pair and every
        difference (index relative to the trimmed sequences).  Off by
        default, like ``dist_from_nuc_simple``.

    Returns
    -------
    int
        Number of positions at which the two sequences carry different bases.
    """
    bases = ("G", "C", "T", "A")
    seq1 = str(seq1)
    seq2 = str(seq2)
    if verbose:
        print("Length before padding removal:", len(seq1))

    # Find the window left after removing shared padding at both ends.
    # Each sequence is trimmed from ITSELF; the bounds checks keep an
    # all-padding pair from indexing into an empty string.
    start = 0
    stop = len(seq1)
    while start < stop and seq1[start] == "-" and seq2[start] == "-":
        start += 1
    while stop > start and seq1[stop - 1] == "-" and seq2[stop - 1] == "-":
        stop -= 1
    seq1 = seq1[start:stop]
    seq2 = seq2[start:stop]

    ndiff = 0
    for i in range(len(seq1)):
        if (seq1[i] in bases) and (seq2[i] in bases):
            if verbose:
                print(seq1[i], "vs", seq2[i])
            if seq1[i] != seq2[i]:
                if verbose:
                    print("Difference at", i)
                ndiff += 1
    return ndiff

def dist_from_nuc_simple(seq1, seq2, verbose=False):
    """Hamming distance between two aligned sequences, including internal gaps.

    A position where the two characters differ is counted when

    * both characters are bases (G, C, T, A): a substitution; or
    * exactly one is a base, neither is ``"N"``, and the position lies
      strictly after the leading run of ``"-"`` and not inside the trailing
      run of ``"-"`` of BOTH sequences: reported as a deletion.

    Parameters
    ----------
    seq1, seq2 : str or str-convertible
        The aligned sequences; expected to have equal length.
    verbose : bool, optional
        Print every counted difference with its 1-based position.

    Returns
    -------
    int
        Number of counted differences.
    """
    bases = ("G", "C", "T", "A")
    ndiff = 0
    seq1 = str(seq1)
    seq2 = str(seq2)

    def _non_gap_extent(seq):
        # (index of first non-"-" character, index of last non-"-" character);
        # an all-gap sequence yields (len(seq), -1).
        first = len(seq) - len(seq.lstrip("-"))
        last = len(seq.rstrip("-")) - 1
        return first, last

    # Computed once instead of re-slicing both sequences at every gap
    # position.  Position i is usable when i > first and i <= last for both.
    first1, last1 = _non_gap_extent(seq1)
    first2, last2 = _non_gap_extent(seq2)
    lowest_usable = max(first1, first2) + 1
    highest_usable = min(last1, last2)

    for i in range(len(seq1)):
        a = seq1[i]
        b = seq2[i]
        if a == b:
            continue
        a_is_base = a in bases
        b_is_base = b in bases
        if a_is_base and b_is_base:
            if verbose:
                print("Difference (sub) at ", i+1, a, "vs", b)
            ndiff += 1
        elif a_is_base or b_is_base:
            # Unknown bases ("N") never count as a difference.
            if a == "N" or b == "N":
                continue
            if lowest_usable <= i <= highest_usable:
                if verbose:
                    print("Difference (del) at", i+1, a, "vs", b)
                ndiff += 1
    return ndiff

# Site-resolved comparison using only the tsv data: lists WHERE two
# sequences differ instead of counting the differences.
def diffs_site(subs0, subs1, dels0, dels1):
    """Return the positions at which two sequences differ.

    Parameters
    ----------
    subs0, subs1 : str or float
        Comma-separated substitution labels; the position is obtained by
        stripping every non-digit character from a label.  A missing cell
        (NaN) counts as "no substitutions".
    dels0, dels1 : str or float
        Comma-separated deleted positions, each a single integer or an
        inclusive range ``"first-last"``; a missing cell (NaN) counts as
        "no deletions".

    Returns
    -------
    list of int
        Positions of substitutions found only in sequence 0, then only in
        sequence 1, then deleted positions found only in sequence 0, then
        only in sequence 1.  A substitution label without any digit is
        reported on stdout and skipped, so the result holds integers only.
    """
    def _expand(dels):
        # Turn "5-7,10" into [5, 6, 7, 10]; 'nan' marks a missing cell.
        positions = []
        for token in str(dels).split(","):
            if "-" in token:  # inclusive range
                bounds = token.split("-")
                positions.extend(range(int(bounds[0]), int(bounds[1]) + 1))
            elif token != 'nan':
                positions.append(int(token))
        return positions

    def _unshared_sites(own, other):
        # Positions of the labels in `own` that do not appear in `other`.
        other_known = set(other)
        sites = []
        for sub in own:
            if sub == 'nan' or sub in other_known:
                continue
            # Strip bases and convert to integer
            digits = re.sub("[^0-9]", "", sub)
            try:
                sites.append(int(digits))
            except ValueError:
                # No position could be extracted: report and skip rather
                # than putting a string into the list of sites.
                print("Error, sub was:", sub)
        return sites

    subs0 = str(subs0).split(",")
    subs1 = str(subs1).split(",")
    d0 = _expand(dels0)
    d1 = _expand(dels1)
    d0_known, d1_known = set(d0), set(d1)

    dist = _unshared_sites(subs0, subs1)
    dist.extend(_unshared_sites(subs1, subs0))
    dist.extend(d for d in d0 if d not in d1_known)
    dist.extend(d for d in d1 if d not in d0_known)
    return dist

In [None]:
def get_avg_dists(df_match, downsample=False):
    """Sample distances between randomly chosen pairs of distinct rows.

    Despite the name, this returns the individual sampled distances, not
    their average.

    Parameters
    ----------
    df_match : pandas.DataFrame
        Must provide the columns ``substitutions`` and ``deletions``.
    downsample : bool, optional
        Draw 50 pairs instead of 5000.

    Returns
    -------
    list of int
        One ``dist_from_subs_and_dels`` value per sampled pair.

    Raises
    ------
    ValueError
        If the frame has fewer than two rows (no distinct pair exists).
    """
    n_rows = len(df_match)
    if n_rows < 2:
        # Without this guard a one-row frame would loop forever looking
        # for a second, different row.
        raise ValueError("get_avg_dists needs at least two rows to form a pair")

    comparisons = 50 if downsample else 5000

    # Pull the two columns out once; building a row Series with .iloc[i]
    # for every comparison is far more expensive than list indexing.
    subs = df_match["substitutions"].tolist()
    dels = df_match["deletions"].tolist()

    dists = []
    for _ in range(comparisons):
        i = random.randint(0, n_rows - 1)
        j = random.randint(0, n_rows - 1)
        while i == j:
            j = random.randint(0, n_rows - 1)
        dists.append(dist_from_subs_and_dels(subs[i], subs[j], dels[i], dels[j]))
    return dists

def get_avg_dists_two_sets(df1, df2, downsample=False):
    """Sample distances between random rows of one frame and random rows of another.

    Despite the name, this returns the individual sampled distances, not
    their average.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each must provide the columns ``substitutions`` and ``deletions``.
    downsample : bool, optional
        Draw 50 pairs instead of 5000.

    Returns
    -------
    list of int
        One ``dist_from_subs_and_dels`` value per sampled (df1 row, df2 row) pair.

    Raises
    ------
    ValueError
        If either frame is empty.
    """
    n1 = len(df1)
    n2 = len(df2)
    if n1 == 0 or n2 == 0:
        raise ValueError("get_avg_dists_two_sets needs at least one row in each frame")

    comparisons = 50 if downsample else 5000

    # Extract the columns once instead of materialising a row Series with
    # .iloc[...] four times per comparison.
    subs1, dels1 = df1["substitutions"].tolist(), df1["deletions"].tolist()
    subs2, dels2 = df2["substitutions"].tolist(), df2["deletions"].tolist()

    dists = []
    for _ in range(comparisons):
        i = random.randint(0, n1 - 1)
        j = random.randint(0, n2 - 1)
        dists.append(dist_from_subs_and_dels(subs1[i], subs2[j], dels1[i], dels2[j]))
    return dists

def get_avg_dists_site(df_match, downsample=False):
    """Collect the differing positions of randomly chosen pairs of distinct rows.

    Parameters
    ----------
    df_match : pandas.DataFrame
        Must provide the columns ``substitutions`` and ``deletions``.
    downsample : bool, optional
        Draw 50 pairs instead of 1000.

    Returns
    -------
    list
        The concatenation of the ``diffs_site`` results of all sampled pairs.

    Raises
    ------
    ValueError
        If the frame has fewer than two rows (no distinct pair exists).
    """
    n_rows = len(df_match)
    if n_rows < 2:
        # Without this guard a one-row frame would loop forever looking
        # for a second, different row.
        raise ValueError("get_avg_dists_site needs at least two rows to form a pair")

    comparisons = 50 if downsample else 1000

    # Extract the columns once; .iloc[i] would build a row Series per access.
    subs = df_match["substitutions"].tolist()
    dels = df_match["deletions"].tolist()

    dists = []
    for _ in range(comparisons):
        i = random.randint(0, n_rows - 1)
        j = random.randint(0, n_rows - 1)
        while i == j:
            j = random.randint(0, n_rows - 1)
        # extend() appends in place; `dists = dists + ...` copied the whole
        # accumulated list on every iteration.
        dists.extend(diffs_site(subs[i], subs[j], dels[i], dels[j]))
    return dists

In [None]:
# Output directory for this run (placeholder base path plus a per-run
# sub-directory; paths are built by concatenation, so keep the slashes).
datadir = "/insert/storage/directory/here/" + "UK/"
timespan = 875 # number of one-day steps (days)

# Results are accumulated window by window.
distributions, dateranges = [], []

# Run switches.
savefigs, savedata, downsample = True, True, False

# First day of the first window.
date_beg = datetime.datetime(2020, 3, 1)
#date_beg = datetime.datetime(2022, 1, 1)

# Reference sequences: one week of data, selected by comparing the 'date'
# column against date strings (start inclusive, end exclusive).
in_reference_week = (df_USA['date'] >= "2021-03-15") & (df_USA['date'] < "2021-03-22")
df_ref = df_USA.loc[in_reference_week]

# Slide a 7-day window forward one day at a time and, for each window,
# sample distances between its sequences and the reference sequences.
for i in range(timespan):
    date_end = date_beg + datetime.timedelta(days=7)
    date_beg_str = date_beg.strftime('%Y-%m-%d')
    date_end_str = date_end.strftime('%Y-%m-%d')

    # Window is [date_beg, date_end): start inclusive, end exclusive.
    in_window = (df_USA['date'] >= date_beg_str) & (df_USA['date'] < date_end_str)
    df_match = df_USA.loc[in_window]
    print(f"{date_beg_str}. Rows in filtered dataframe:", round(len(df_match)/1e3,2), "x 10^3" )

    if len(df_match) >= 10:
        #dists = get_avg_dists(df_match, downsample=downsample)
        dists = get_avg_dists_two_sets(df_match, df_ref, downsample=downsample)
        if savefigs:
            # One histogram image per window, named after the window index.
            plt.hist(dists, range=[0, 150], bins=151, density=True)
            plt.xlim([0,150])
            plt.ylim([0,0.1])
            plt.xlabel("Hamming distance")
            plt.ylabel("Frequency")
            plt.title(date_beg_str)
            plt.savefig(datadir + str(i) + ".png", dpi=175, facecolor='white', transparent=False, bbox_inches='tight', pad_inches=0)
            plt.clf()
        distributions.append(dists)
    else:
        # Too few sequences: store an all-zero placeholder so that the list
        # keeps one entry per day.
        print("Skipping due to missing data! Fiducially adding [0] distribution.")
        distributions.append([0,0,0,0,0,0,0,0,0])

    dateranges.append((date_beg_str,date_end_str))
    date_beg += datetime.timedelta(days=1)
# Persist the collected distributions and their date ranges as a pickle
# whose file name carries a random 9-digit identifier.
if savedata:
    datadict = dict()
    datadict["hammingdistributions"] = distributions
    datadict["dateranges"] = dateranges
    rand_ID = str(random.randint(100000000,999999999))
    filename = "data_" + rand_ID
    pklname = filename + ".pkl"
    # The context manager closes the file even if pickling fails.
    with open(datadir + pklname, "wb") as f:
        pickle.dump(datadict, f)

In [None]:
# Parallelized version! 
from multiprocessing import Pool
from matplotlib.ticker import FuncFormatter

def generate_heatmaps_parready(paramdict):
    """Worker for ``Pool.map``: distance distribution of one date window.

    ``paramdict`` must contain ``"date_beg_str"`` and ``"date_end_str"``
    (window start inclusive, end exclusive), ``"df_ref"`` (reference
    sequences), ``"i"`` (window index, used for logging and figure names)
    and ``"df"`` (the frame to take the window from).

    Returns the list produced by ``get_avg_dists_two_sets`` for the window
    versus the reference, or nine zeros when the window holds fewer than
    ten rows.
    """
    window_start = paramdict["date_beg_str"]
    window_end = paramdict["date_end_str"]
    reference = paramdict["df_ref"]
    index = paramdict["i"]
    frame = paramdict["df"]
    print("i:", index)

    # Hard-wired switches for the parallel run.
    savefigs = False
    downsample = False

    in_window = (frame['date'] >= window_start) & (frame['date'] < window_end)
    df_match = frame.loc[in_window]

    # Placeholder for windows with too little data.
    if len(df_match) < 10:
        return [0,0,0,0,0,0,0,0,0]

    #dists = get_avg_dists(df_match, downsample=downsample)
    dists = get_avg_dists_two_sets(df_match, reference, downsample=downsample)
    if savefigs:
        plt.ioff()
        fontsize=10
        font = {'family' : 'sans',
                'weight' : 'normal',
                'size'   : fontsize}
        mystyle = 'seaborn'
        plt.style.use(mystyle)
        plt.rc('font', **font)

        plt.hist(dists, range=[0, 150], bins=151, density=True)
        plt.xlim([0,150])
        plt.ylim([0,0.1])
        plt.xlabel("Hamming distance")
        plt.ylabel("Frequency")
        plt.title(window_start)
        # `datadir` is read from the notebook's global namespace.
        plt.savefig(datadir + str(index) + ".png", dpi=175, facecolor='white', transparent=False, bbox_inches='tight', pad_inches=0)
        plt.clf()
    return dists

# Output directory for this comparison (placeholder base path; paths are
# built by concatenation, so keep the trailing slashes).
datadir = "/insert/storage/directory/here/"
datadir = datadir + "UK_vsBA2/"

#datadir = datadir + "UK_test/"
#timespan = 2*365 # days
#timespan = 10 # days
timespan = 875 # days

# Set reference sequences (one week of data; enable exactly one pair):
# Alpha:
#df_ref = df_USA.loc[df_USA['date'] >= "2021-03-15"]
#df_ref = df_ref.loc[df_ref['date'] < "2021-03-22"]
# Delta:
#df_ref = df_USA.loc[df_USA['date'] >= "2021-09-20"]
#df_ref = df_ref.loc[df_ref['date'] < "2021-09-27"]
# Omicron (BA1):
#df_ref = df_USA.loc[df_USA['date'] >= "2022-01-01"]
#df_ref = df_ref.loc[df_ref['date'] < "2022-01-08"]
# Omicron BA2:
df_ref = df_USA.loc[df_USA['date'] >= "2022-04-09"]
df_ref = df_ref.loc[df_ref['date'] < "2022-04-16"]

# First day of the first window.
date_beg = datetime.datetime(2020, 3, 1)
#date_beg = datetime.datetime(2022, 1, 1)


distributions = []
dateranges = []


# First create list of parameter dictionaries:
# Generate all the window start/end dates (7-day windows, 1-day step):
date_beg_strs = []
date_end_strs = []
for i in range(timespan):
    date_beg_str = date_beg.strftime('%Y-%m-%d')
    date_end = date_beg
    date_end += datetime.timedelta(days=7)
    date_end_str = date_end.strftime('%Y-%m-%d')
    date_beg_strs.append(date_beg_str)
    date_end_strs.append(date_end_str)
    date_beg += datetime.timedelta(days=1)

# One parameter dictionary per window.
# NOTE(review): every dictionary carries the full data frame, which is
# presumably serialised once per task when handed to a worker; confirm this
# is acceptable for the data size.
paramdicts = []

for i in range(len(date_beg_strs)):
    paramdicts.append({"date_beg_str":date_beg_strs[i], "date_end_str":date_end_strs[i], "df_ref": df_ref, "i": i, "df": df_USA}  )

if __name__ == '__main__':
    # The context manager shuts the worker processes down once map() has
    # returned, so re-running the cell does not accumulate idle workers.
    with Pool(processes=30) as pool: # How many concurrent processes?
        outarr = pool.map(generate_heatmaps_parready, paramdicts)


In [None]:
## Populate datadict on the basis of the parallel runs:
# First, generate the list of (window start, window end) 2-tuples and
# collect the matching worker results in the same order.

savedata = True

dateranges = []
distributions = []

for i in range(len(date_beg_strs)):
    dateranges.append((date_beg_strs[i],date_end_strs[i]))
    distributions.append(outarr[i])


datadict = dict()
datadict["hammingdistributions"] = distributions
datadict["dateranges"] = dateranges

if savedata:
    # The file name carries a random 9-digit identifier.
    rand_ID = str(random.randint(100000000,999999999))
    filename = "data_" + rand_ID
    pklname = filename + ".pkl"
    # The context manager closes the file even if pickling fails.
    with open(datadir + pklname, "wb") as f:
        pickle.dump(datadict, f)
    print("Saved to:", datadir + pklname)

In [None]:
# Hamming distance at each site!
# (parallel)
from multiprocessing import Pool
from matplotlib.ticker import FuncFormatter

def generate_site_heatmaps_parready(paramdict):
    """Worker for ``Pool.map``: differing sites within one date window.

    ``paramdict`` must contain ``"date_beg_str"`` and ``"date_end_str"``
    (window start inclusive, end exclusive), ``"df_ref"``, ``"i"`` (window
    index, used for logging) and ``"df"`` (the frame to take the window
    from).  The reference frame is read but not used here.

    Returns the list produced by ``get_avg_dists_site`` for the window, or
    nine zeros when the window holds fewer than ten rows.
    """
    window_start = paramdict["date_beg_str"]
    window_end = paramdict["date_end_str"]
    df_ref = paramdict["df_ref"]
    index = paramdict["i"]
    frame = paramdict["df"]
    print("i:", index)

    # Hard-wired switches for the parallel run.
    savefigs = False
    downsample = False

    in_window = (frame['date'] >= window_start) & (frame['date'] < window_end)
    df_match = frame.loc[in_window]

    if len(df_match) < 10:
        # NOTE(review): these placeholder zeros look like nine differences
        # at site 0 to any consumer that histograms the result - confirm
        # that this is intended.
        return [0,0,0,0,0,0,0,0,0]
    return get_avg_dists_site(df_match, downsample=downsample)

# Output directory for the site-wise run (placeholder base path; paths are
# built by concatenation, so keep the trailing slashes).
datadir = "/insert/storage/directory/here/"
#datadir = datadir + "UK_downsample/"
#datadir = datadir + "UK_vsAlpha/"
#datadir = datadir + "UK_vsDelta/"
#datadir = datadir + "UK_vsBA2/"
datadir = datadir + "UK_site/"

#datadir = datadir + "UK_test/"
#timespan = 2*365 # days
#timespan = 10 # days
timespan = 875 # days

# Set reference sequences (one week of data; enable exactly one pair):
# Alpha:
df_ref = df_USA.loc[df_USA['date'] >= "2021-03-15"]
df_ref = df_ref.loc[df_ref['date'] < "2021-03-22"]
# Delta:
#df_ref = df_USA.loc[df_USA['date'] >= "2021-09-20"]
#df_ref = df_ref.loc[df_ref['date'] < "2021-09-27"]
# Omicron (BA1):
#df_ref = df_USA.loc[df_USA['date'] >= "2022-01-01"]
#df_ref = df_ref.loc[df_ref['date'] < "2022-01-08"]
# Omicron BA2:
#df_ref = df_USA.loc[df_USA['date'] >= "2022-04-09"]
#df_ref = df_ref.loc[df_ref['date'] < "2022-04-16"]

# First day of the first window.
date_beg = datetime.datetime(2020, 3, 1)
#date_beg = datetime.datetime(2022, 1, 1)


distributions = []
dateranges = []


# First create list of parameter dictionaries:
# Generate all the window start/end dates (7-day windows, 1-day step):
date_beg_strs = []
date_end_strs = []
for i in range(timespan):
    date_beg_str = date_beg.strftime('%Y-%m-%d')
    date_end = date_beg
    date_end += datetime.timedelta(days=7)
    date_end_str = date_end.strftime('%Y-%m-%d')
    date_beg_strs.append(date_beg_str)
    date_end_strs.append(date_end_str)
    date_beg += datetime.timedelta(days=1)

# One parameter dictionary per window.
paramdicts = []

for i in range(len(date_beg_strs)):
    paramdicts.append({"date_beg_str":date_beg_strs[i], "date_end_str":date_end_strs[i], "df_ref": df_ref, "i": i, "df": df_USA}  )

if __name__ == '__main__':
    # The context manager shuts the worker processes down once map() has
    # returned, so re-running the cell does not accumulate idle workers.
    with Pool(processes=30) as pool: # How many concurrent processes?
        outarr_site = pool.map(generate_site_heatmaps_parready, paramdicts)

In [None]:
## Populate datadict_site on the basis of the parallel runs:
# First, generate the list of (window start, window end) 2-tuples and
# collect the matching worker results in the same order.

savedata = True

dateranges = []
distributions = []

for i in range(len(date_beg_strs)):
    dateranges.append((date_beg_strs[i],date_end_strs[i]))
    distributions.append(outarr_site[i])


datadict_site = dict()
datadict_site["hammingdistributions"] = distributions
datadict_site["dateranges"] = dateranges

if savedata:
    # The file name carries a random 9-digit identifier.
    rand_ID = str(random.randint(100000000,999999999))
    filename = "data_site_" + rand_ID
    pklname = filename + ".pkl"
    # The context manager closes the file even if pickling fails.
    with open(datadir + pklname, "wb") as f:
        pickle.dump(datadict_site, f)
    print("Saved to:", datadir + pklname)

In [None]:
# Build the site-wise heatmap matrix: one row per date window, one column
# per bin of sites (300 bins over positions 0..30000), holding raw counts.
x_max = 30000
bins = 300
ny, nx = len(distributions), bins  # ny: time, nx: site bins
plotmat_sites = np.zeros((ny, nx))
for i in range(ny):
    hist_n, hist_bins = np.histogram(distributions[i], bins=bins, range=[0, x_max], density=False)
    # Assigning into the row copies the counts into the matrix.
    plotmat_sites[i, :] = hist_n
    print(f"t={i+1} out of {ny}")

In [None]:
# Plot site-wise map: rows are date windows, columns are site bins.

fig, ((ax)) = plt.subplots(1, 1, dpi=175, figsize=(6,8))

# Normalise by the global maximum and show on a log scale; the 0.05 offset
# keeps empty bins finite.
showmat = np.log(0.05+plotmat_sites[:,:]/np.max(plotmat_sites)) 
#showmat = plotmat_sites[:,:]/np.max(plotmat_sites)

cax = ax.imshow(showmat, interpolation='nearest', aspect='auto', cmap=plt.get_cmap('inferno'))

# Only the extremes of the colour scale get ticks; they are labelled '0'
# and '1' rather than with the log-transformed values.
cbar = fig.colorbar(cax, ticks=[np.min(showmat), np.max(showmat)])
cbar.ax.set_yticklabels(['0', '1'])  # vertically oriented colorbar
cbar.set_label('Frequency (logarithmic) within generation', rotation=270)

# Re-fetch the current axes; the labels below go through the pyplot state.
ax = plt.gca()

ax.grid(False)

# Not used in this cell.
country_name = "UK"
#country_name = "USA"
#country_name = "Europe"

# Only used for the y-axis label text.
startdate = "2020-03-01"

plt.xlabel("Site/100")
plt.ylabel(f"Day (since {startdate})")

In [None]:
# Load existing data:
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook or another trusted source.
load_from = "/insert/storage/directory/here/UK_origin/data_708266984.pkl"
# The context manager closes the file handle once loading is done.
with open(load_from, "rb") as f:
    datadict = pickle.load(f)

ddh = datadict["hammingdistributions"]
ddd = datadict["dateranges"]

# Per-window summary statistics; windows whose mean is not positive (the
# all-zero placeholders) are left out.
means = []
variances = []
begdates = []
# deciles[k] collects the (k+1)*10-th percentile of every kept window.
deciles = [[] for _ in range(1, 10)]
# Indices (into ddh) of the windows that were kept.
i_s = []
for i in range(len(ddh)):
    window_mean = np.mean(ddh[i])
    if window_mean>0:
        means.append(window_mean)
        variances.append(np.var(ddh[i]))
        for j in range(1,10):
            pctile = np.percentile(ddh[i], j*10)
            deciles[j-1].append(pctile)
        i_s.append(i)
        # Each date range is a (start, end) pair; keep the start.
        begdates.append(ddd[i][0])

In [None]:
# Window start dates parsed from "YYYY-MM-DD" strings into datetime.date objects.
x_values = [
    datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
    for date_str in begdates
]

In [None]:
# Not used in this cell.
start_tidx = 200

fig, ((ax1)) = plt.subplots(1, 1, dpi=175, figsize=(5,3.5))

# Format the x axis of the current axes as ISO dates.
ax = plt.gca()
formatter = mdates.DateFormatter("%Y-%m-%d")
ax.xaxis.set_major_formatter(formatter)

#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))

# Major ticks placed by a DayLocator with default settings.
locator = mdates.DayLocator()
ax.xaxis.set_major_locator(locator)



# Index into `deciles` used by the plotting cell below: entry k holds the
# (k+1)*10-th percentile, so 4 selects the 50th.
dec_idx=4


In [None]:
%matplotlib inline

from matplotlib.ticker import FuncFormatter

# Global font settings for the figures below.
fontsize=10

font = {'family' : 'sans',
        'weight' : 'normal',
        'size'   : fontsize}


#mystyle = 'seaborn-notebook'
mystyle = 'seaborn'

# bmh is quite good somehow, but maybe too heavy
plt.style.use(mystyle)
plt.rc('font', **font)
#plt.style.use("ggplot")

#plt.figure(figsize=(5, 4), dpi=175)
fig, ((ax1)) = plt.subplots(1, 1, dpi=275, figsize=(6,3.5))

#ax = plt.gca()

# If percentage y axis is desired:
#ax1.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# Colormap from which the line colours are sampled.
cm = plt.cm.get_cmap('inferno')
#cm = plt.cm.get_cmap('Set1')
#cm = plt.cm.get_cmap('YlGnBu_r')

# Colours are taken at col0 + multiplier*dcol along the colormap.
dcol = 0.25
#col0=-dcol+0.05
col0 = 0

# Set colours:
# Normal order
col_multipliers = [1, 2, 2.8]
# Reversed order:
col_reversed = True
if col_reversed:
    col_multipliers = list(reversed(col_multipliers))
col1 = cm(col0 + col_multipliers[0]*dcol)
col2 = cm(col0 + col_multipliers[1]*dcol)
col3 = cm(col0 + col_multipliers[2]*dcol)

ax = plt.gca()




# Mean and one selected percentile (chosen by dec_idx) against window start date.
plt.plot_date(begdates, means, fmt='', label="Mean", color=col1)
plt.plot_date(begdates, deciles[dec_idx], fmt='--', label=f"{(dec_idx+1)*10}% percentile", color=col2)

#plt.xlabel("Time (generations)")
plt.ylabel("Hamming distance")

# Put an x tick on every window that starts on the first day of a month
# (date strings ending in "01").
ticklist = []
for i in range(len(begdates)):
    if begdates[i][-2:]=="01":
        ticklist.append(begdates[i])
        print("Adding tick", ticklist[-1])
plt.xticks(ticklist, rotation=60)

print("First day:", begdates[0])
print("Last day:", begdates[-1])

plt.legend()

# NOTE(review): the x limits are given as plain numbers while the data are
# plotted from date strings - confirm the intended units.
plt.xlim([-1,850])
# NOTE(review): raises IndexError when fewer than 851 windows were kept in
# `begdates`; verify against the loaded data.
print(begdates[850])

In [None]:
# MEAN VS VARIANCE
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

%matplotlib inline


# New figure comparing the per-window mean with the (rescaled) variance.
fig, ((ax1)) = plt.subplots(1, 1, dpi=275, figsize=(6,3.5))

ax = plt.gca()

# Factor applied to the variance so that both curves share one y axis.
varmultiply = 0.08

# col1 / col2 are the colours defined in the previous plotting cell.
plt.plot_date(begdates, means, fmt='', label="Mean", color=col1)
plt.plot_date(begdates, varmultiply*np.array(variances), fmt='--', label=f"Variance scaled by {varmultiply}", color=col2)

#plt.xlabel("Time (generations)")
plt.ylabel("Hamming distance")

#plt.ylim([0,100])

# Put an x tick on every window that starts on the first day of a month
# (date strings ending in "01").
ticklist = []
for i in range(len(begdates)):
    if begdates[i][-2:]=="01":
        ticklist.append(begdates[i])
plt.xticks(ticklist, rotation=60)
plt.legend()

In [None]:
# Convert to matrix:

# One row per date window, one column per unit-width distance bin
# (0..x_max), each row normalised to a density.
x_max = 200
ny, nx = len(ddh), x_max  # ny: time, nx: distance bins
plotmat = np.zeros((ny, nx))
for i in range(ny):
    hist_n, hist_bins = np.histogram(ddh[i], bins=x_max, range=[0, x_max], density=True)
    # Assigning into the row copies the values into the matrix.
    plotmat[i, :] = hist_n
    print(f"t={i+1} out of {ny}")

In [None]:
# Heatmap of the distance distributions: rows are date windows, columns
# are Hamming-distance bins.
fig, ((ax)) = plt.subplots(1, 1, dpi=175, figsize=(6,8))

# Show the first 800 windows and the first 175 distance bins on a log
# scale; the 0.02 offset keeps empty bins finite.
showmat = np.log(0.02+plotmat[0:800,:175]) 

cax = ax.imshow(showmat, interpolation='nearest', aspect='auto', cmap=plt.get_cmap('inferno'))

# Only the extremes of the colour scale get ticks; they are labelled '0'
# and '1' rather than with the log-transformed values.
cbar = fig.colorbar(cax, ticks=[np.min(showmat), np.max(showmat)])
cbar.ax.set_yticklabels(['0', '1'])  # vertically oriented colorbar
cbar.set_label('Frequency (logarithmic) within generation', rotation=270)

# Re-fetch the current axes; the labels below go through the pyplot state.
ax = plt.gca()

ax.grid(False)

# Used in the figure title.
country_name = "UK"
#country_name = "USA"
#country_name = "Europe"

# Only used for the y-axis label text.
startdate = "2020-03-01"

plt.xlabel("Hamming distance")
plt.ylabel(f"Day (since {startdate})")
plt.title(f"Time-dependent Hamming heatmap ({country_name})")