In [1]:
import polars as pl
import glob 
import seaborn as sns
import plotly.express as px
import pyarrow as pyarrow

decoded_files = glob.glob("pos_ct/*/*.decoded.hap*.txt")

In [4]:
# create list for storing df from each file 
dfs = []

# for each file extract pop and ID from the file name and add to each df 
for file in decoded_files:
    file_name = file.split('.')[0]
    ind_id = file_name.split('/')[2]
    pop = file_name.split('/')[1]
    
    df = pl.read_csv(file, has_header=True, separator='\t')
    
    # Adding ID and pop to each df
    df_1 = df.with_columns( # with _columns to add columns to a data frame 
        (pl.lit(ind_id).alias("ID")), # pl.lit returns literal value pl.alias to name a column
        (pl.lit(pop).alias("pop")),
        (pl.col("end") + 1000) # adding 1000 to the end coordinate to match fragment length 
        )
    # print(df_1.head(5))

    dfs.append(df_1) # adding df to the dfs list 

# concatenating dfs from the list using align option to align dfs by column names 
decoded_df = pl.concat(dfs, how='align')
decoded_df.head(5)

chrom,start,end,length,state,mean_prob,snps,ID,pop
i64,i64,i64,i64,str,f64,i64,str,str
1,0,1000,1000,"""Archaic""",0.51065,0,"""HG00698""","""CHS"""
1,0,2000,2000,"""Archaic""",0.51407,0,"""HG00674""","""CHS"""
1,0,7000,7000,"""Archaic""",0.54458,0,"""HG00442""","""CHS"""
1,0,11000,11000,"""Archaic""",0.56568,0,"""HG00566""","""CHS"""
1,0,15000,15000,"""Archaic""",0.61127,0,"""HG00707""","""CHS"""


In [4]:
def interval(prob: float) -> str:
    """Define a function for binning mean_prob into intervals 
    """
    if prob >= 0.5 and prob < 0.6:
        return("[0.5-0.6)")
    elif prob >= 0.6 and prob < 0.7:
        return("[0.6-0.7)")
    elif prob >= 0.7 and prob < 0.8:
        return("[0.7-0.8)")
    elif prob >= 0.8 and prob < 0.9:
        return("[0.8-0.9)")
    elif prob >= 0.9 and prob < 0.95:
        return("[0.9-0.95)")
    elif prob >= 0.95 and prob < 0.99:
        return("[0.95-0.99)")
    elif prob >= 0.99 and prob < 1:
        return("[0.99-1)")
    elif prob == 1:
        return("1")


In [None]:
# Archaic only df with added column of posterior probability intervals for further processing. Without snp count and coordinates. 
archaic_df = (decoded_df.filter(pl.col("state") == "Archaic")
              .select(["pop", "ID", "length", "mean_prob"])
              .with_columns(
                  (pl.col("mean_prob").apply(lambda x: interval(x)).alias("interval_prob"),
                   pl.col("mean_prob").round(1).alias("rounded_mean_prob"))
))
# print(archaic_df.head(20))

# Plotting a histogram of fragment length against rounded mean_prob
# px.histogram(archaic_df, x="length", color="rounded_mean_prob", log_x=True, 
#             labels={"rounded_mean_prob":"rounded posterior probability"}, 
#             title="Fragment length distribution (log-scaled) based on posterior probability cut-off",
#             opacity=0.8)

# Plotting a histogram of fragment count against mean_prob
# px.histogram(archaic_df, x="mean_prob", nbins=50, color="pop", marginal="box", hover_data=archaic_df.columns, 
#              title="Archaic fragment distribution across posterior probability cut-off values", 
#              labels={"mean_prob":"posterior probability", "pop":"populations"}, range_x=(0.5, 1))
 
# Calculating fragment count and fragment length stats in defined posterior probability cutoff interavals
# interval_archaic = archaic_df.groupby(["interval_prob", "pop"]).agg(
#     (pl.count("interval_prob").alias("fragment_count")), 
#     (pl.mean("length").alias("mean_length")), 
#     (pl.median("length").alias("median_length")), 
#     (pl.min("length").alias("min_length")), 
#     (pl.max("length").alias("max_lenght"))
#     ).sort("interval_prob")
# print(interval_archaic)

# Saving interval_archaic dataframe to excel worksheet to produce a table figure 
# interval_archaic.write_excel("pos_ct/excel_output.xlsx", "probability_interval_stat")

In [5]:
# Filter Archaic fragments with posterior probability above (and including) 0.9 
filtered_df = (decoded_df.filter(pl.col("state") == "Archaic")
              .filter(pl.col("mean_prob") >= 0.9)
              .select(["pop", "ID", "chrom", "start", "end", "length"])
)
print(filtered_df.head(20))


shape: (20, 6)
┌─────┬─────────┬───────┬─────────┬─────────┬────────┐
│ pop ┆ ID      ┆ chrom ┆ start   ┆ end     ┆ length │
│ --- ┆ ---     ┆ ---   ┆ ---     ┆ ---     ┆ ---    │
│ str ┆ str     ┆ i64   ┆ i64     ┆ i64     ┆ i64    │
╞═════╪═════════╪═══════╪═════════╪═════════╪════════╡
│ GBR ┆ HG00234 ┆ 1     ┆ 1104000 ┆ 1128000 ┆ 24000  │
│ GBR ┆ HG00115 ┆ 1     ┆ 1107000 ┆ 1129000 ┆ 22000  │
│ GBR ┆ HG00109 ┆ 1     ┆ 1495000 ┆ 1521000 ┆ 26000  │
│ GBR ┆ HG00128 ┆ 1     ┆ 1495000 ┆ 1521000 ┆ 26000  │
│ …   ┆ …       ┆ …     ┆ …       ┆ …       ┆ …      │
│ CHS ┆ HG00654 ┆ 1     ┆ 1520000 ┆ 1559000 ┆ 39000  │
│ CHS ┆ HG00705 ┆ 1     ┆ 1521000 ┆ 1542000 ┆ 21000  │
│ CHS ┆ HG00598 ┆ 1     ┆ 1521000 ┆ 1559000 ┆ 38000  │
│ GBR ┆ HG00254 ┆ 1     ┆ 1521000 ┆ 1559000 ┆ 38000  │
└─────┴─────────┴───────┴─────────┴─────────┴────────┘


In [20]:
# Group by pop and ID to calculate average fragment length for each individual
ind_mean_length = filtered_df.groupby(["pop", "ID"]).agg( 
                  avg_length=pl.mean("length"), 
                  fragment_count=pl.count("ID"))
print(ind_mean_length.head(20))

# Group by pop and calculate mean fragment count and length per haploid genome for each population 
pop_average = ind_mean_length.groupby("pop").agg(
    avg_fragment_length=pl.col("avg_length").mean(),
    avg_fragment_count=pl.col("fragment_count").mean(),
    median_fragment_count=pl.col("fragment_count").median(),
    min_fragment_count=pl.col("fragment_count").min(),
    max_fragment_count=pl.col("fragment_count").max(),
    sd_fragment_count=pl.col("fragment_count").std()
)
# Export to excel for table figure creation
# pop_average.write_excel("pos_ct/avg_lenght_and_count.xlsx", "avg_length_and_count_per_genome")

shape: (20, 4)
┌─────┬─────────┬──────────────┬────────────────┐
│ pop ┆ ID      ┆ avg_length   ┆ fragment_count │
│ --- ┆ ---     ┆ ---          ┆ ---            │
│ str ┆ str     ┆ f64          ┆ u32            │
╞═════╪═════════╪══════════════╪════════════════╡
│ CHS ┆ HG00622 ┆ 95498.595506 ┆ 1424           │
│ CHS ┆ HG00699 ┆ 90361.239288 ┆ 1517           │
│ CHS ┆ HG00675 ┆ 97417.232022 ┆ 1474           │
│ CHS ┆ HG00674 ┆ 90645.653616 ┆ 1507           │
│ …   ┆ …       ┆ …            ┆ …              │
│ CHS ┆ HG00632 ┆ 95488.103821 ┆ 1387           │
│ GBR ┆ HG00254 ┆ 86224.306688 ┆ 1226           │
│ CHS ┆ HG00662 ┆ 90977.303071 ┆ 1498           │
│ CHS ┆ HG00653 ┆ 88356.521739 ┆ 1495           │
└─────┴─────────┴──────────────┴────────────────┘


In [22]:
# Calculate unique sequence: using entire population find overlapping fragments and separate them into continuous fragments with cumulative frequency 

# filtered_df.head(20)

# Group by population and fragment coordinates and length, counts in how many individuals the same fragment is encountered
# Can check with .all() after groupby. also converts groupby object to a dataframe
unique_count=filtered_df.groupby(["pop", "chrom", "start", "end", "length"]).agg(
    pl.all().sort_by(["chrom", "start"]),
    frequency=pl.count("ID")
)

unique_count.head(20)

# Find overlapping fragments 
# [start, end) - fragments include start base and up to, not including end base
unique_count

pop,chrom,start,end,length,ID,frequency
str,i64,i64,i64,i64,list[str],u32
"""CHS""",1,2790000,2885000,95000,"[""HG00472""]",1
"""CHS""",1,2878000,3041000,163000,"[""HG00478""]",1
"""GBR""",1,2896000,3000000,104000,"[""HG00146"", ""HG00137""]",2
"""CHS""",1,3011000,3041000,30000,"[""HG00556"", ""HG00446""]",2
"""CHS""",1,3103000,3259000,156000,"[""HG00650""]",1
"""CHS""",1,3385000,3413000,28000,"[""HG00626""]",1
"""CHS""",1,3418000,3455000,37000,"[""HG00729""]",1
"""CHS""",1,3579000,3681000,102000,"[""HG00584""]",1
"""CHS""",1,3682000,3804000,122000,"[""HG00596""]",1
"""CHS""",1,6879000,6976000,97000,"[""HG00442"", ""HG00533"", ""HG00654""]",3


In [None]:
from typing import List

def overlap(x: List[pl.Series]) -> float:
    for pop, chrom, start, end, lenght, frequency in x:
        if pop

In [34]:

a = pl.DataFrame(
    {
     "pop": ["GBR", "GBR", "GBR", "GBR", "GBR"],
     "chrom": [1, 1, 2, 2, 2],
     "start": [2, 5, 5, 1, 2],
     "end": [7, 10, 10, 8, 7],
     "length": [5, 5, 5, 7, 5],
     "frequency": [1, 2, 2, 3, 1]}
)

row_count = a.select(pl.count()).item()

# a.select(
#     pl.col("*").rolling_apply(lambda s: s.sum(), window_size=2)
# )

a["chrom"] == a["chrom"].shift(-1) 
# print(a.select(pl.all().take([1])))

# print(a.filter(pl.col("chrom") == 1)[0])

# for x, y, z in a:
#     print("x:", x, "y:", y, "z:", z, "\n")

# for x in a:
#     print(x[0])

# for x in a.iter_rows(named=True):
#     print(x["chrom"])

process_overlaps(a)

# df = a.select("*").with_columns(
#     overlap=(pl.col("start").shift(-1) < pl.col("end"))
    # (df["pop"] == df["pop"].shift(-1)),
    # (df["chrom"] == df["chrom"].shift(-1))
    # )

# df = a.select("*").with_columns(
#     overlap=((pl.col("start").shift(-1) < pl.col("end")) &
#     (pl.col("pop") == pl.col("pop").shift(-1)) &
#     (pl.col("chrom") == pl.col("chrom").shift(-1))
# ))



ComputeError: series length 3 doesn't match the dataframe height of 5

In [33]:
def process_overlaps(df):
    # Create a new DataFrame to store the modified rows
    new_df = pl.DataFrame(
        {
            "pop": pl.Series([], dtype=str), # check dtype, might need to change 
            "chrom": pl.Series([], dtype=pl.UInt64), 
            "start": pl.Series([], dtype=pl.UInt64),
            "end": pl.Series([], dtype=pl.UInt64),
            "lenght": pl.Series([], dtype=pl.UInt64),
            "frequency": pl.Series([], dtype=pl.UInt32),
        }
    )

    # Create a temporary column 'overlap' that holds 1 if the next row overlaps the current one
    df = a.select("*").with_columns(
        overlap=((pl.col("start").shift(-1) < pl.col("end")) &
        (pl.col("pop") == pl.col("pop").shift(-1)) &
        (pl.col("chrom") == pl.col("chrom").shift(-1))
        ))

    # Calculate the start, end, and frequency values for the new DataFrame
    new_df = new_df.vstack(
        [
            df.filter(pl.col("overlap")).select(
                "start", "end", "frequency"
            ),  # Rows without overlap
            df.filter(~pl.col("overlap")).select(
                "start",
                df["end"].shift(-1) - 1,
                "frequency",
            ),  # Rows with overlap, modified end
            df.filter(pl.col("overlap")).select(
                df["start"].shift(-1),
                "end",
                df["frequency"].shift(-1),
            ),  # Rows with overlap, modified start
            df.filter(pl.col("overlap")).select(
                df["start"].shift(-1),
                df["end"].shift(-1),
                df["frequency"] + df["frequency"].shift(-1),
            ),  # Rows with overlap, new row for overlap
        ]
    )

    # Drop the temporary column 'overlap' from the new DataFrame
    new_df = new_df.drop("overlap")

    return new_df
