In [None]:
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import pyarrow as pyarrow

import glob 
# Load the tab-separated Archaic fragment table; the first row is read as the header.
archaic_df = pl.read_csv("archaic_df.csv", separator="\t", has_header=True)

In [None]:
# archaic_df = dataframe with Archaic fragments and added posterior probability interval column 

def interval(prob: float) -> str:
    """Map a posterior probability onto the label of its bin.

    Bins are half-open, [lower, upper), and together cover 0.5 up to (but not
    including) 1; a value of exactly 1 gets its own label "1".

    Args:
        prob: posterior probability to classify.

    Returns:
        The label of the bin containing ``prob``. ``None`` is returned when
        ``prob`` is below 0.5, above 1, or NaN (no bin matches).
    """
    # Exact 1 is its own category, separate from the half-open bins below.
    if prob == 1:
        return "1"

    bins = (
        (0.5, 0.6, "[0.5-0.6)"),
        (0.6, 0.7, "[0.6-0.7)"),
        (0.7, 0.8, "[0.7-0.8)"),
        (0.8, 0.9, "[0.8-0.9)"),
        (0.9, 0.95, "[0.9-0.95)"),
        (0.95, 0.99, "[0.95-0.99)"),
        (0.99, 1, "[0.99-1)"),
    )
    for lower, upper, label in bins:
        if lower <= prob < upper:
            return label

    # Nothing matched: value lies outside [0.5, 1] or is NaN.
    return None

# Build the Archaic-only frame used by the cells below: keep rows whose state is
# "Archaic", keep only pop / ID / length / mean_prob (SNP count and coordinates
# are dropped), and add two derived columns:
#   interval_prob     - bin label returned by interval() for mean_prob
#   rounded_mean_prob - mean_prob rounded to one decimal place
# NOTE(review): decoded_df is not defined in the cells visible here - confirm it
# is created before this cell runs on a fresh kernel.
archaic_df = (
    decoded_df
    .filter(pl.col("state") == "Archaic")
    .select(["pop", "ID", "length", "mean_prob"])
    .with_columns([
        pl.col("mean_prob").apply(interval).alias("interval_prob"),
        pl.col("mean_prob").round(1).alias("rounded_mean_prob"),
    ])
)
print(archaic_df)

In [None]:
# Split the Archaic fragments by population (GBR and CHS) and print each subset
archaic_df_GBR, archaic_df_CHS = (
    archaic_df.filter(pl.col("pop") == population)
    for population in ("GBR", "CHS")
)
print(archaic_df_GBR)
print(archaic_df_CHS)

In [None]:
# Box plot of Archaic fragment length per posterior-probability interval, coloured by population.
# category_orders pins the x axis to ascending interval order; without it the categories
# appear in the order they are first encountered in the data. The labels listed here
# must stay in sync with the strings returned by interval().
px.box(
    archaic_df,
    x="interval_prob",
    y="length",
    color="pop",
    category_orders={
        "interval_prob": [
            "[0.5-0.6)",
            "[0.6-0.7)",
            "[0.7-0.8)",
            "[0.8-0.9)",
            "[0.9-0.95)",
            "[0.95-0.99)",
            "[0.99-1)",
            "1",
        ]
    },
    labels={
        "pop": "populations",
        "interval_prob": "posterior probability interval",
        "length": "archaic fragment length",
    },
    width=1000,
)

In [None]:
# Histogram of Archaic fragment length on a log-scaled x axis, coloured by the
# rounded mean_prob value; the legend order is fixed from 0.5 up to 1
px.histogram(
    archaic_df,
    x="length",
    log_x=True,
    color="rounded_mean_prob",
    category_orders=dict(rounded_mean_prob=[0.5, 0.6, 0.7, 0.8, 0.9, 1]),
    labels=dict(rounded_mean_prob="rounded posterior probability"),
    title="Archaic fragment length distribution (log-scaled) based on posterior probability cut-off values",
    width=1000,
    opacity=0.8,
)

In [None]:
# Histogram of fragment counts over mean_prob (50 bins, x axis limited to 0.5-1),
# coloured by population
px.histogram(
    archaic_df,
    x="mean_prob",
    color="pop",
    nbins=50,
    range_x=(0.5, 1),
    labels=dict(mean_prob="posterior probability", pop="populations"),
    title="Archaic fragment count distribution across posterior probability cut-off values",
    opacity=0.8,
    width=1000,
)

In [None]:
# Per-population summary: for every (posterior probability interval, population) pair,
# compute the Archaic fragment count and the mean / median / min / max fragment length.
# The result is sorted on both grouping keys so the row order is fully determined;
# sorting on interval_prob alone leaves the order of populations within an interval
# dependent on the groupby output order.
# The last column is named "max_length" (the earlier spelling "max_lenght" was a typo).
interval_archaic = (
    archaic_df
    .groupby(["interval_prob", "pop"])
    .agg(
        pl.count("interval_prob").alias("fragment_count"),
        pl.mean("length").alias("mean_length"),
        pl.median("length").alias("median_length"),
        pl.min("length").alias("min_length"),
        pl.max("length").alias("max_length"),
    )
    .sort(["interval_prob", "pop"])
)
print(interval_archaic)

# Saving interval_archaic dataframe to excel worksheet to produce a table figure 
# interval_archaic.write_excel("pos_ct/excel_output.xlsx", "probability_interval_stat")

In [None]:
# Calculating total Archaic fragment count and fragment length stats in defined
# posterior probability cutoff intervals (all populations pooled together).
# Grouping is on interval_prob only: grouping on "pop" as well would just repeat
# the per-population table instead of giving totals.
# NOTE(review): this rebinds interval_archaic, replacing the per-population table
# computed earlier - confirm nothing later still needs the per-population version.
# The last column is named "max_length" (the earlier spelling "max_lenght" was a typo).
interval_archaic = (
    archaic_df
    .groupby("interval_prob")
    .agg(
        pl.count("interval_prob").alias("fragment_count"),
        pl.mean("length").alias("mean_length"),
        pl.median("length").alias("median_length"),
        pl.min("length").alias("min_length"),
        pl.max("length").alias("max_length"),
    )
    .sort("interval_prob")
)
print(interval_archaic)

# Saving interval_archaic dataframe to excel worksheet to produce a table figure 
# interval_archaic.write_excel("pos_ct/excel_output.xlsx", "probability_interval_stat")