In [1]:
import polars as pl
import glob 
import seaborn as sns

decoded_files = glob.glob("pos_ct/*/*.decoded.hap*.txt")

In [2]:
# create list for storing df from each file 
dfs = []

# for each file extract pop and ID from the file name and add to each df 
for file in decoded_files:
    file_name = file.split('.')[0]
    ind_id = file_name.split('/')[2]
    pop = file_name.split('/')[1]
    
    df = pl.read_csv(file, has_header=True, separator='\t')
    
    # Adding ID and pop to each df
    df_1 = df.with_columns( # with _columns to add columns to a data frame 
        (pl.lit(ind_id).alias("ID")), # pl.lit returns literal value pl.alias to name a column
        (pl.lit(pop).alias("pop")),
        (pl.col("end") + 1000) # adding 1000 to the end coordinate to match fragment length 
        )
    # print(df_1.head(5))

    dfs.append(df_1) # adding df to the dfs list 

# concatenating dfs from the list using align option to align dfs by column names 
decoded_df = pl.concat(dfs, how='align')
decoded_df.head(5)

chrom,start,end,length,state,mean_prob,snps,ID,pop
i64,i64,i64,i64,str,f64,i64,str,str
1,0,1000,1000,"""Archaic""",0.51065,0,"""HG00698""","""CHS"""
1,0,2000,2000,"""Archaic""",0.51407,0,"""HG00674""","""CHS"""
1,0,7000,7000,"""Archaic""",0.54458,0,"""HG00442""","""CHS"""
1,0,11000,11000,"""Archaic""",0.56568,0,"""HG00566""","""CHS"""
1,0,15000,15000,"""Archaic""",0.61127,0,"""HG00707""","""CHS"""


In [3]:
def interval(prob):
    if prob >= 0.5 and prob < 0.6:
        return("[0.5-0.6)")
    elif prob >= 0.6 and prob < 0.7:
        return("[0.6-0.7)")
    elif prob >= 0.7 and prob < 0.8:
        return("[0.7-0.8)")
    elif prob >= 0.8 and prob < 0.9:
        return("[0.8-0.9)")
    elif prob >= 0.9 and prob < 0.95:
        return("[0.9-0.95)")
    elif prob >= 0.95 and prob < 0.99:
        return("[0.95-0.99)")
    elif prob >= 0.99 and prob < 1:
        return("[0.99-1)")
    elif prob == 1:
        return("1")


In [10]:
# Archaic only df
archaic_df = (decoded_df.filter(pl.col("state") == "Archaic")
              .select(["pop", "ID", "length", "mean_prob"])
              .with_columns(
                  (pl.col("mean_prob").apply(lambda x: interval(x)).alias("interval_prob"))
))
# print(archaic_df.head(20))
 

interval_archaic = archaic_df.groupby(["pop", "interval_prob"]).agg(
    (pl.count("interval_prob").alias("fragment_count")), 
    (pl.mean("length").alias("mean_length")), 
    (pl.median("length").alias("median_length")), 
    (pl.min("length").alias("min_length")), 
    (pl.max("length").alias("max_lenght"))
    ).sort("interval_prob")
print(interval_archaic)

# sns.histplot(result_df, x="interval_prob", y="fragment_count", hue="pop")


shape: (14, 7)
┌─────┬───────────────┬────────────────┬───────────────┬───────────────┬────────────┬────────────┐
│ pop ┆ interval_prob ┆ fragment_count ┆ mean_length   ┆ median_length ┆ min_length ┆ max_lenght │
│ --- ┆ ---           ┆ ---            ┆ ---           ┆ ---           ┆ ---        ┆ ---        │
│ str ┆ str           ┆ u32            ┆ f64           ┆ f64           ┆ i64        ┆ i64        │
╞═════╪═══════════════╪════════════════╪═══════════════╪═══════════════╪════════════╪════════════╡
│ CHS ┆ [0.5-0.6)     ┆ 59237          ┆ 8378.9017     ┆ 6000.0        ┆ 1000       ┆ 107000     │
│ GBR ┆ [0.5-0.6)     ┆ 43691          ┆ 8765.329244   ┆ 7000.0        ┆ 1000       ┆ 133000     │
│ CHS ┆ [0.6-0.7)     ┆ 47523          ┆ 18487.469225  ┆ 15000.0       ┆ 1000       ┆ 186000     │
│ GBR ┆ [0.6-0.7)     ┆ 36782          ┆ 18477.434615  ┆ 15000.0       ┆ 1000       ┆ 162000     │
│ …   ┆ …             ┆ …              ┆ …             ┆ …             ┆ …          ┆ …       

In [66]:
# Archaic only df
archaic_df = decoded_df.filter(pl.col("state") == "Archaic").select(["pop", "ID", "length", "mean_prob"]) #.groupby(["pop", "ID"]) #.agg(pl.count("mean_prob").alias("count"))

def interval(prob):
    if prob >= 0.5 and prob < 0.6:
        return("0.5-0.6")
    elif prob >= 0.6 and prob < 0.7:
        return("0.6-0.7")
    elif prob >= 0.7 and prob < 0.8:
        return("0.7-0.8")
    elif prob >= 0.8 and prob < 0.9:
        return("0.8-0.9")
    elif prob >= 0.9 and prob < 0.95:
        return("0.9-0.95")
    elif prob >= 0.95 and prob < 0.99:
        return("0.95-0.99")
    elif prob >= 0.99 and prob < 1:
        return("0.99-1")
    elif prob == 1:
        return("1")

with_interval = archaic_df.with_columns(
    pl.col("mean_prob").apply(lambda x: interval(x)).alias("interval_prob")
).groupby(["pop", "ID", "interval_prob"]).agg(pl.count("mean_prob").alias("count"))
with_interval.head(10) 
# for id, data in archaic_df:
#     print(id)
#     print(data)

# intervals = [[0.5-0.6], [0.6-0.7], [0.7-0.8], [0.8-0.9], [0.9-0.95], [0.95-0.99], [0.99-1]]
# archaic_df_with_intervals = archaic_df.with_columns(
#     pl.when(pl.col("mean_prob").is_in(intervals[0]))
#     .then(pl.lit("0.5-0.6"))
#     .when(pl.col("mean_prob").is_in(intervals[1]))
#     .then(pl.lit("0.6-0.7"))
#     .when(pl.col("mean_prob").is_in(intervals[2]))
#     .then(pl.lit("0.7-0.8"))
#     .when(pl.col("mean_prob").is_in(intervals[3]))
#     .then(pl.lit("0.8-0.9"))
#     .when(pl.col("mean_prob").is_in(intervals[4]))
#     .then(pl.lit("0.9-0.95"))
#     .when(pl.col("mean_prob").is_in(intervals[5]))
#     .then(pl.lit("0.95-0.99"))
#     .when(pl.col("mean_prob").is_in(intervals[6]))
#     .then(pl.lit("0.99-1"))
#     .alias("prob_interval")
# )
# archaic_df_with_intervals.head(10)

# sns.histplot(archaic_df)


pop,ID,interval_prob,count
str,str,str,u32
"""GBR""","""HG00128""","""0.9-0.95""",423
"""GBR""","""HG00265""","""0.6-0.7""",195
"""GBR""","""HG00111""","""0.99-1""",8
"""GBR""","""HG00154""","""0.6-0.7""",203
"""GBR""","""HG00264""","""0.7-0.8""",279
"""GBR""","""HG00111""","""0.6-0.7""",214
"""GBR""","""HG00265""","""0.99-1""",5
"""GBR""","""HG00128""","""0.8-0.9""",528
"""GBR""","""HG00096""","""0.5-0.6""",228
"""GBR""","""HG00154""","""0.99-1""",6
