In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Length of the query sequence, measured as the raw character count of
# "omicron.seq" (any newline characters in the file are counted as well).
# Reading inside `with` guarantees the file handle is closed afterwards.
with open("omicron.seq", "r") as seq_file:
    seq_length = len(seq_file.read())
seq_length

1270

In [3]:
# Column names assigned, in order, to the whitespace-separated fields of each
# kept BLAST output line.
columns = [
    "query", "subject", "identity", "alignment_length", "mismatches", "gap_opens",
    "q_start", "q_end", "s_start", "s_end", "evalue", "bit_score",
]


def read_blast_output(virus, offset):
    """Load the BLAST hits for one virus/offset pair into a DataFrame.

    Reads ``{virus}/blast_output_offset_{offset}.txt``, keeps only the lines
    that start with ``"Query"`` and splits each of them on whitespace. The
    first two fields (query, subject) stay strings; every remaining field is
    converted to ``float``.

    Parameters
    ----------
    virus : str
        Name of the directory holding the BLAST output; also echoed to stdout.
    offset : int
        Offset used in the output file name.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, with the module-level ``columns`` as header.

    Raises
    ------
    OSError
        If the output file cannot be opened (e.g. it does not exist).
    ValueError
        If a field after the first two cannot be parsed as a float.
    """
    print(f"{virus } offset {offset}")

    output_path = f"{virus}/blast_output_offset_{offset}.txt"

    rows = []
    # The context manager closes the file even if parsing fails part-way;
    # iterating over a bare open() call left the handle open.
    with open(output_path, "r") as blast_file:
        for line in blast_file:
            # Anything that is not a hit line (headers, comments) is skipped.
            if not line.startswith("Query"):
                continue
            parts = line.split()
            rows.append(parts[:2] + [float(value) for value in parts[2:]])

    return pd.DataFrame(rows, columns=columns)

In [4]:
def aggregate_df(virus, df, query_length=None):
    """Aggregate BLAST hits per subject and prefix the result columns with ``virus``.

    Rows of ``df`` are grouped by ``"subject"``. The returned frame is indexed
    by subject and has, in this order, the columns:

    * ``{virus}_identity``          - sum of ``identity`` over the group
    * ``{virus}_count``             - number of non-null ``identity`` values
    * ``{virus}_alignment_length``  - sum of ``alignment_length``
    * ``{virus}_mismatches``        - sum of ``mismatches``
    * ``{virus}_matches``           - alignment length minus mismatches
    * ``{virus}_match_percent``     - matches divided by the query length
      (a ratio; it is not multiplied by 100)

    Parameters
    ----------
    virus : str
        Prefix used for every output column name.
    df : pandas.DataFrame
        Must contain ``subject``, ``identity``, ``alignment_length`` and
        ``mismatches`` columns.
    query_length : number, optional
        Denominator for ``{virus}_match_percent``. When omitted, the
        notebook-level ``seq_length`` is used, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The per-subject aggregate described above.
    """
    if query_length is None:
        # Backward-compatible default: fall back to the notebook global.
        query_length = seq_length

    # Named aggregation builds all summed/counted columns in one pass and
    # avoids the in-place rename of an intermediate frame.
    adf = df.groupby("subject").agg(**{
        f"{virus}_identity": ("identity", "sum"),
        f"{virus}_count": ("identity", "count"),
        f"{virus}_alignment_length": ("alignment_length", "sum"),
        f"{virus}_mismatches": ("mismatches", "sum"),
    })

    # NOTE(review): if alignment_length includes gap columns, they are counted
    # as matches here - confirm this is the intended metric.
    adf[f"{virus}_matches"] = adf[f"{virus}_alignment_length"] - adf[f"{virus}_mismatches"]
    adf[f"{virus}_match_percent"] = adf[f"{virus}_matches"] / query_length

    return adf

In [5]:
viruses = ['omicron', 'lambda', 'kappa', 'delta', 'gamma', 'beta', 'alpha']

# Every aggregated frame, in (offset, virus) order, so the combined frame can
# be built with a single concat after the loop instead of growing it
# pairwise on every iteration.
all_adfs = []
for offset in range(1, 10):
    # Collected afresh for each offset: previously the accumulator was never
    # reset, so adf_offset_{offset}.csv also contained the columns of every
    # earlier offset under duplicate names.
    offset_adfs = []

    for virus in viruses:
        df = read_blast_output(virus, offset)
        adf = aggregate_df(virus, df)
        offset_adfs.append(adf)

    # Each CSV now holds only the columns computed for its own offset.
    pd.concat(offset_adfs, axis=1).to_csv(f"adf/adf_offset_{offset}.csv")
    all_adfs.extend(offset_adfs)

# total_adf stays cumulative across all offsets (column names repeat once per
# offset), which is what later cells received from the original loop.
total_adf = pd.concat(all_adfs, axis=1)
        

omicron offset 1
lambda offset 1
kappa offset 1
delta offset 1
gamma offset 1
beta offset 1
alpha offset 1
omicron offset 2
lambda offset 2
kappa offset 2
delta offset 2
gamma offset 2
beta offset 2
alpha offset 2
omicron offset 3
lambda offset 3
kappa offset 3
delta offset 3
gamma offset 3
beta offset 3
alpha offset 3
omicron offset 4
lambda offset 4
kappa offset 4
delta offset 4
gamma offset 4
beta offset 4
alpha offset 4
omicron offset 5
lambda offset 5
kappa offset 5
delta offset 5
gamma offset 5
beta offset 5
alpha offset 5
omicron offset 6
lambda offset 6
kappa offset 6
delta offset 6
gamma offset 6
beta offset 6
alpha offset 6
omicron offset 7
lambda offset 7
kappa offset 7
delta offset 7
gamma offset 7
beta offset 7
alpha offset 7
omicron offset 8
lambda offset 8
kappa offset 8
delta offset 8
gamma offset 8
beta offset 8
alpha offset 8
omicron offset 9
lambda offset 9
kappa offset 9
delta offset 9
gamma offset 9
beta offset 9
alpha offset 9


In [8]:
# Restrict the combined frame to the "<virus>_match_percent" columns, taken in
# the order the viruses are listed.
sub_df = total_adf.loc[:, [f"{name}_match_percent" for name in viruses]]

In [10]:
# plt.figure(figsize=(15,10))
# sns.histplot(sub_df.melt(), x='value', hue='variable', multiple='dodge', shrink=.75, bins=20);