In [1]:
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import pyarrow as pyarrow

import glob 

In [None]:
# Only run if changed decoded_df and need to change filtered_df. Otherwise go to next cell

# Read saved csv file containing all decoded files and filter 
decoded_df = pl.read_csv("decoded_df.csv", separator="\t", has_header=True)
decoded_df

# Filter Archaic fragments with posterior probability above (and including) 0.9 
filtered_df = (decoded_df.filter(pl.col("state") == "Archaic")
              .filter(pl.col("mean_prob") >= 0.9)
              .select(["pop", "ID", "chrom", "start", "end", "length"])
)
print(filtered_df)

# Save to filtered_df.csv
# filtered_df.write_csv("filtered_df.csv", separator="\t", has_header=True)

In [2]:
# Read diltered_df.csv
filtered_df = pl.read_csv("filtered_df.csv", separator="\t", has_header=True)

filtered_df

pop,ID,chrom,start,end,length
str,str,i64,i64,i64,i64
"""GBR""","""HG00265""",1,3421000,3452000,31000
"""GBR""","""HG00265""",1,4526000,4553000,27000
"""GBR""","""HG00265""",1,5021000,5083000,62000
"""GBR""","""HG00265""",1,6157000,6196000,39000
"""GBR""","""HG00265""",1,10212000,10239000,27000
"""GBR""","""HG00265""",1,12156000,12179000,23000
"""GBR""","""HG00265""",1,14326000,14393000,67000
"""GBR""","""HG00265""",1,20774000,20886000,112000
"""GBR""","""HG00265""",1,22482000,22547000,65000
"""GBR""","""HG00265""",1,31384000,31573000,189000


In [None]:
# Get total filtered fragment count for each pop
filtered_df_GBR = filtered_df.filter(pl.col("pop") == "GBR")
print(filtered_df_GBR)

filtered_df_CHS = filtered_df.filter(pl.col("pop") == "CHS")
print(filtered_df_CHS)

In [None]:
# Group by pop and ID to calculate mean fragment length for each individual. Save excel
ind_mean_length = filtered_df.groupby(["pop", "ID"]).agg( 
                  avg_length=pl.mean("length"), 
                  fragment_count=pl.count("ID"))
print(ind_mean_length)

# Group by pop and calculate mean fragment count and length per haploid genome for each population 
pop_average = ind_mean_length.groupby("pop").agg(
    total_fragment_count=pl.col("fragment_count").sum(),
    avg_fragment_length=pl.col("avg_length").mean(),
    avg_fragment_count=pl.col("fragment_count").mean(),
    median_fragment_count=pl.col("fragment_count").median(),
    min_fragment_count=pl.col("fragment_count").min(),
    max_fragment_count=pl.col("fragment_count").max(),
    sd_fragment_count=pl.col("fragment_count").std()
)

print(pop_average)

# Export to excel for table figure creation
# pop_average.write_excel("pos_ct/avg_lenght_and_count_per_genome.xlsx", "avg_length_and_count_per_genome")

In [7]:
# unique_count = df with only unique filtered Archaic fragments in both populations 

# Group by population and fragment coordinates and length, counts in how many individuals the same fragment is encountered
# Can check with .all() after groupby. also converts groupby object to a dataframe
unique_count=filtered_df.groupby(["pop", "chrom", "start", "end", "length"]).agg(
    pl.all().sort_by(["chrom", "start"]),
    frequency=pl.count("ID")
)
unique_count

pop,chrom,start,end,length,ID,frequency
str,i64,i64,i64,i64,list[str],u32
"""GBR""",4,153914000,153942000,28000,"[""HG00265"", ""HG00111"", … ""HG00113""]",25
"""GBR""",4,181899000,181974000,75000,"[""HG00265""]",1
"""GBR""",6,81227000,81307000,80000,"[""HG00265""]",1
"""GBR""",8,76339000,76822000,483000,"[""HG00265"", ""HG00236""]",2
"""GBR""",8,129494000,129533000,39000,"[""HG00265""]",1
"""GBR""",11,26279000,26310000,31000,"[""HG00265""]",1
"""GBR""",11,26586000,26703000,117000,"[""HG00265""]",1
"""GBR""",12,11970000,12008000,38000,"[""HG00265""]",1
"""GBR""",12,23641000,23748000,107000,"[""HG00265""]",1
"""GBR""",20,15762000,15936000,174000,"[""HG00265""]",1


In [11]:
unique_count1 = unique_count.select(pl.col("pop", "chrom", "start", "end", "length", "frequency"))

unique_count1.write_csv("unique_overlapping_fragments.csv", has_header=True, separator="\t")
unique_count1

pop,chrom,start,end,length,frequency
str,i64,i64,i64,i64,u32
"""GBR""",4,153914000,153942000,28000,25
"""GBR""",4,181899000,181974000,75000,1
"""GBR""",6,81227000,81307000,80000,1
"""GBR""",8,76339000,76822000,483000,2
"""GBR""",8,129494000,129533000,39000,1
"""GBR""",11,26279000,26310000,31000,1
"""GBR""",11,26586000,26703000,117000,1
"""GBR""",12,11970000,12008000,38000,1
"""GBR""",12,23641000,23748000,107000,1
"""GBR""",20,15762000,15936000,174000,1


In [None]:
# Get unique archaic fragment count for each population 

unique_count_GBR = unique_count.filter(pl.col("pop") == "GBR")
# print(unique_count_GBR.describe())

unique_count_CHS = unique_count.filter(pl.col("pop") == "CHS")
# print(unique_count_CHS.describe())

In [29]:
px.box(unique_count, x="chrom", y="length", color="pop", 
        labels={"pop":"populations", "lenght":"unique fragment length", "chrom":"chromosome"},
        title="Unique archaic fragment lengths across autosomes in two populations",
        category_orders={"chrom":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]},
        width=1000)

In [13]:
# Separate overlapping fragments and assign cumulative frequency 
from Intervals import Interval, IntervalFrame

# Partition df by pop and chrom and store them all in a list
partitioned_df = unique_count1.partition_by(["pop", "chrom"])
# print(len(partitioned_df))

# Empty list to store processed dataframes and concatenate them later
processed_list = []

# Iterate through partitioned dfs and run the preprocess function on them 
for df in partitioned_df:
    # Create df with only start, end and frequency
    df_to_pandas = df.drop("pop", "chrom", "length") #"ID", 
   
    # Convert to pandas df
    df_to_process = df_to_pandas.to_pandas()
    # Reset index
    df_to_process.reset_index(inplace=True, drop=True)
    # Sort values by start then end
    df_to_process.sort_values(by=["start", "end"], inplace=True)
    
    frame = IntervalFrame(
        [
            Interval(
                start=row[1]["start"],
                end=row[1]["end"],
                freq=row[1]["frequency"]
            ) for row in df_to_process.iterrows()
        ]
    )
    
    # Preprocess - separating overlaps
    frame.preprocess()

    pd_preprocessed = frame.to_dataframe()

     # Convert back to polars df
    preprocessed = pl.from_pandas(pd_preprocessed)
    
    # Extract pop and chrom from original df 
    pop = df.item(0, "pop")
    chrom = df.item(0, "chrom")
    
    # Use extracted values to create new column in the preprocessed polars df and add length column with new fragment lengths
    processed = preprocessed.with_columns(
        length=pl.col("end") - pl.col("start"),
        pop=pl.lit(pop), 
        chrom=pl.lit(chrom)   
    )
    # Append df to frames list 
    processed_list.append(processed)
    # print(processed.describe()) # - processed = df with unique non-overlapping archaic fragments for each chrom and pop

# Concatenate processed dfs
no_overlaps = pl.concat(processed_list)
print(no_overlaps)

# Saving to csv -> no_overlaps.csv -> no_overlaps_processing.ipynb
# no_overlaps.write_csv("no_overlaps.csv", separator="\t", has_header=True)

shape: (159_209, 6)
┌──────────┬──────────┬───────────┬────────┬─────┬───────┐
│ start    ┆ end      ┆ frequency ┆ length ┆ pop ┆ chrom │
│ ---      ┆ ---      ┆ ---       ┆ ---    ┆ --- ┆ ---   │
│ i64      ┆ i64      ┆ i64       ┆ i64    ┆ str ┆ i32   │
╞══════════╪══════════╪═══════════╪════════╪═════╪═══════╡
│ 61000    ┆ 327000   ┆ 1         ┆ 266000 ┆ GBR ┆ 4     │
│ 470000   ┆ 471000   ┆ 3         ┆ 1000   ┆ GBR ┆ 4     │
│ 471000   ┆ 493000   ┆ 6         ┆ 22000  ┆ GBR ┆ 4     │
│ 704000   ┆ 705000   ┆ 7         ┆ 1000   ┆ GBR ┆ 4     │
│ …        ┆ …        ┆ …         ┆ …      ┆ …   ┆ …     │
│ 57345000 ┆ 57370000 ┆ 11        ┆ 25000  ┆ CHS ┆ 19    │
│ 57370000 ┆ 57372000 ┆ 7         ┆ 2000   ┆ CHS ┆ 19    │
│ 57372000 ┆ 57373000 ┆ 6         ┆ 1000   ┆ CHS ┆ 19    │
│ 57373000 ┆ 57375000 ┆ 1         ┆ 2000   ┆ CHS ┆ 19    │
└──────────┴──────────┴───────────┴────────┴─────┴───────┘


In [37]:
# Calculating total archaic sequence per chromosome 
total_length_chrom = no_overlaps.groupby(["pop", "chrom"]).all() #.agg(
    # archaic_sequence=pl.col("length").sum() # total archaic sequence per chromosome
# )
print(total_length_chrom)

shape: (44, 6)
┌─────┬───────┬──────────────────────┬──────────────────────┬───────────────┬──────────────────────┐
│ pop ┆ chrom ┆ start                ┆ end                  ┆ frequency     ┆ length               │
│ --- ┆ ---   ┆ ---                  ┆ ---                  ┆ ---           ┆ ---                  │
│ str ┆ i32   ┆ list[i64]            ┆ list[i64]            ┆ list[i64]     ┆ list[i64]            │
╞═════╪═══════╪══════════════════════╪══════════════════════╪═══════════════╪══════════════════════╡
│ CHS ┆ 16    ┆ [284000, 285000, …   ┆ [285000, 323000, …   ┆ [10, 26, … 2] ┆ [1000, 38000, …      │
│     ┆       ┆ 90123000]            ┆ 90124000]            ┆               ┆ 1000]                │
│ CHS ┆ 10    ┆ [484000, 845000, …   ┆ [525000, 1111000, …  ┆ [2, 1, … 2]   ┆ [41000, 266000, …    │
│     ┆       ┆ 135051000]           ┆ 135054000]           ┆               ┆ 3000]                │
│ GBR ┆ 14    ┆ [20334000, 20390000, ┆ [20390000, 20391000, ┆ [2, 1, … 1]   

In [8]:
df = pd.DataFrame(
        {
            "start": [2, 5, 7, 11, 13],
            "end": [7, 10, 10, 14, 17],
            "frequency": [1, 2, 2, 3, 1]
        }
    )

for i, row in enumerate(df.iterrows()):
        # Append the current row to the modified rows list
        # row = row[1]
        print(type(row))
        # print(df.iloc[i, 0])


<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
