In [1]:
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pyarrow as pyarrow

import glob 

In [2]:
# Load no_overlaps.csv (tab-separated, with a header row): the unique,
# non-overlapping fragments for both populations.
no_overlaps = pl.read_csv(
    "no_overlaps.csv",
    separator="\t",
    has_header=True,
)

# Quick look at the data: rows ordered by chromosome, then summary statistics.
print(no_overlaps.sort(by="chrom"))
print(no_overlaps.describe())

shape: (159_209, 6)
┌──────────┬──────────┬───────────┬────────┬─────┬───────┐
│ start    ┆ end      ┆ frequency ┆ length ┆ pop ┆ chrom │
│ ---      ┆ ---      ┆ ---       ┆ ---    ┆ --- ┆ ---   │
│ i64      ┆ i64      ┆ i64       ┆ i64    ┆ str ┆ i64   │
╞══════════╪══════════╪═══════════╪════════╪═════╪═══════╡
│ 1104000  ┆ 1107000  ┆ 1         ┆ 3000   ┆ GBR ┆ 1     │
│ 1107000  ┆ 1128000  ┆ 2         ┆ 21000  ┆ GBR ┆ 1     │
│ 1128000  ┆ 1129000  ┆ 1         ┆ 1000   ┆ GBR ┆ 1     │
│ 1495000  ┆ 1521000  ┆ 2         ┆ 26000  ┆ GBR ┆ 1     │
│ …        ┆ …        ┆ …         ┆ …      ┆ …   ┆ …     │
│ 51056000 ┆ 51078000 ┆ 1         ┆ 22000  ┆ CHS ┆ 22    │
│ 51078000 ┆ 51101000 ┆ 3         ┆ 23000  ┆ CHS ┆ 22    │
│ 51101000 ┆ 51104000 ┆ 2         ┆ 3000   ┆ CHS ┆ 22    │
│ 51104000 ┆ 51113000 ┆ 1         ┆ 9000   ┆ CHS ┆ 22    │
└──────────┴──────────┴───────────┴────────┴─────┴───────┘
shape: (9, 7)
┌────────────┬───────────┬───────────┬───────────┬──────────────┬────────┬───────

In [3]:
# Need to create two bedGraph files with unique non-overlapping intervals 

def add_chr(chrom):
    """Return ``chrom`` converted to a string and prefixed with "chr" (e.g. 1 -> "chr1")."""
    return "chr" + str(chrom)

def _bedgraph_for_pop(fragments, pop):
    """Build the bedGraph frames for one population.

    Keeps the rows of ``fragments`` whose "pop" column equals ``pop`` and
    rewrites "chrom" as a "chr"-prefixed string column.

    Returns a tuple ``(prefixed, bedgraph)`` where ``prefixed`` keeps every
    column and ``bedgraph`` holds only chrom, start, end and frequency.
    """
    prefixed = fragments.filter(pl.col("pop") == pop).with_columns(
        # Native string expression instead of calling a Python lambda per row
        # through .apply; alias keeps the result in the "chrom" column.
        (pl.lit("chr") + pl.col("chrom").cast(pl.Utf8)).alias("chrom")
    )
    bedgraph = prefixed.select(["chrom", "start", "end", "frequency"])
    return prefixed, bedgraph


# For each population: filter, prefix chrom with "chr", then keep only
# chrom, start, end, frequency. Writing to disk is left commented out.
bedGraph_GBR1, bedGraph_GBR = _bedgraph_for_pop(no_overlaps, "GBR")

# bedGraph_GBR.write_csv("pos_ct/GBR_bedGraph_no_overlaps.csv", separator="\t", has_header=True)


bedGraph_CHS1, bedGraph_CHS = _bedgraph_for_pop(no_overlaps, "CHS")

# bedGraph_CHS.write_csv("pos_ct/CHS_bedGraph_no_overlaps.csv", separator="\t", has_header=True)

In [4]:
# Total archaic sequence per population: sum of fragment lengths within each pop
total_length = no_overlaps.groupby("pop").agg(
    total_archaic_sequence=pl.col("length").sum()
)
total_length

pop,total_archaic_sequence
str,i64
"""CHS""",1022025000
"""GBR""",989609000


In [5]:
# Total archaic sequence for every (population, chromosome) pair
total_length_chrom = no_overlaps.groupby(["pop", "chrom"]).agg(
    pl.col("length").sum().alias("archaic_sequence")
)
total_length_chrom

pop,chrom,archaic_sequence
str,i64,i64
"""GBR""",5,60390000
"""GBR""",6,73622000
"""CHS""",14,43400000
"""GBR""",18,26242000
"""GBR""",16,21455000
"""GBR""",8,45182000
"""CHS""",8,39156000
"""CHS""",5,65621000
"""GBR""",9,46784000
"""CHS""",4,72074000


In [6]:
# Autosome lengths for GRCh37, one [chromosome, length] pair per entry
# (https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37)
chrom_lengths_1 = [
    [1, 249250621], [2, 243199373], [3, 198022430], [4, 191154276],
    [5, 180915260], [6, 171115067], [7, 159138663], [8, 146364022],
    [9, 141213431], [10, 135534747], [11, 135006516], [12, 133851895],
    [13, 115169878], [14, 107349540], [15, 102531392], [16, 90354753],
    [17, 81195210], [18, 78077248], [19, 59128983], [20, 63025520],
    [21, 48129895], [22, 51304566],
]
# Each inner list is one row. State the orientation explicitly instead of
# relying on polars inferring it from the schema length.
chrom_lengths = pl.DataFrame(
    chrom_lengths_1,
    schema=["chrom", "chrom_length"],
    orient="row",
)

# Concatenate with total_length_chrom; how="align" matches rows on the shared
# "chrom" column, so each (pop, chrom) row gains its chrom_length.
archaic_and_total = pl.concat([total_length_chrom, chrom_lengths], how="align")

archaic_and_total

pop,chrom,archaic_sequence,chrom_length
str,i64,i64,i64
"""GBR""",1,86618000,249250621
"""CHS""",1,93036000,249250621
"""GBR""",2,91429000,243199373
"""CHS""",2,87498000,243199373
"""CHS""",3,76818000,198022430
"""GBR""",3,73266000,198022430
"""CHS""",4,72074000,191154276
"""GBR""",4,69141000,191154276
"""GBR""",5,60390000,180915260
"""CHS""",5,65621000,180915260


In [7]:
# Percentage of each chromosome covered by archaic sequence
archaic_coverage = archaic_and_total.with_columns(
    (pl.col("archaic_sequence") / pl.col("chrom_length") * 100).alias("percent_coverage")
)
archaic_coverage

pop,chrom,archaic_sequence,chrom_length,percent_coverage
str,i64,i64,i64,f64
"""GBR""",1,86618000,249250621,34.751368
"""CHS""",1,93036000,249250621,37.326286
"""GBR""",2,91429000,243199373,37.594258
"""CHS""",2,87498000,243199373,35.977889
"""CHS""",3,76818000,198022430,38.792575
"""GBR""",3,73266000,198022430,36.998839
"""CHS""",4,72074000,191154276,37.704623
"""GBR""",4,69141000,191154276,36.170261
"""GBR""",5,60390000,180915260,33.380269
"""CHS""",5,65621000,180915260,36.271678


In [16]:
# Grouped bar chart of percent chromosome coverage, coloured by population
px.bar(
    archaic_coverage,
    x="chrom",
    y="percent_coverage",
    color="pop",
    barmode="group",
    labels={"percent_coverage":"Percent coverage", "pop":"populations", "chrom":"chromosome"},
)


In [None]:
# Boxplot of unique archaic fragment lengths per chromosome.
# NOTE(review): the title mentions two populations, but boxes are coloured by
# chromosome rather than by "pop" - confirm whether color="pop" was intended.
px.box(
    no_overlaps,
    x="chrom",
    y="length",
    color="chrom",
    # Keys of `labels` must match column names exactly for the label to apply.
    labels={"pop":"populations", "length":"fragment length", "chrom":"chromosome"},
    title="Unique archaic fragment lengths across autosomes in two populations",
    # Show autosomes 1..22 in numeric order.
    category_orders={"chrom": list(range(1, 23))},
    width=1000,
)