In [None]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import polars as pl
import pyarrow as pyarrow
import seaborn as sns

In [None]:
# Only run when loading files for the first time (or when the decoded files change).
# Builds one combined table from every per-individual decoded file and caches it
# on disk as "decoded_df.csv" so later sessions can skip this step.

# Offset added to the "end" coordinate so it matches the fragment length.
FRAGMENT_END_OFFSET = 1000

# Getting all decoded files. Sorted because glob returns matches in arbitrary
# order, which would otherwise make the row order of the cached table vary.
decoded_files = sorted(glob.glob("pos_ct/*/*.decoded.hap*.txt"))
if not decoded_files:
    # Fail early with a clear message instead of letting pl.concat choke on an empty list.
    raise FileNotFoundError(
        "No files matched 'pos_ct/*/*.decoded.hap*.txt' relative to the current working directory"
    )

# List for storing the data frame built from each file
dfs = []

# For each file, extract pop and ID from the file path and add them to its data frame
for file in decoded_files:
    # Expected layout: pos_ct/<pop>/<ID>.decoded.hap<...>.txt
    # Path copes with both POSIX and Windows separators, and cutting the file name
    # at ".decoded." keeps IDs (or directory names) that contain a dot intact.
    decoded_path = Path(file)
    pop = decoded_path.parent.name
    ind_id = decoded_path.name.split(".decoded.", 1)[0]
    # NOTE(review): the hap* part of the file name is not stored, so rows from
    # different hap files of the same individual share the same ID - confirm
    # this is intended.

    df = pl.read_csv(file, has_header=True, separator="\t")

    # Adding ID and pop to each df
    df_1 = df.with_columns(
        pl.lit(ind_id).alias("ID"),  # pl.lit broadcasts a constant; alias names the column
        pl.lit(pop).alias("pop"),
        # The expression keeps the name "end", so it overwrites the existing column.
        pl.col("end") + FRAGMENT_END_OFFSET,
    )

    dfs.append(df_1)  # adding df to the dfs list

# Concatenating the data frames from the list
decoded_df = pl.concat(dfs)

# Saving to a tab-separated file to avoid running the code every time.
# The header row is written by default; the explicit keyword is omitted because
# its name differs between Polars versions (has_header was renamed include_header).
decoded_df.write_csv("decoded_df.csv", separator="\t")

In [None]:
# Reload the combined decoded table from the tab-separated cache on disk.
# The bare name on the last line makes it the cell's displayed output.
decoded_df = pl.read_csv(
    "decoded_df.csv",
    has_header=True,
    separator="\t",
)
decoded_df

In [None]:
# Split the combined table by population and print each subset (GBR, then CHS).
# The printed frame starts with its shape, which is where the number of rows
# (fragments) for that population can be read off.
gbr_rows = pl.col("pop") == "GBR"
decoded_df_GBR = decoded_df.filter(gbr_rows)
print(decoded_df_GBR)

chs_rows = pl.col("pop") == "CHS"
decoded_df_CHS = decoded_df.filter(chs_rows)
print(decoded_df_CHS)