# PoPoolation preparation

In [31]:
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

In [32]:
# Load the per-site nucleotide counts for this chromosome. As the displayed
# table shows, each row carries chr, pos, index and ref_n followed by
# repeated A/C/G/T count columns.
maf_csv_path = r"maf_LR537144.csv"
biallelic_fish_maf = pd.read_csv(maf_csv_path, sep=",")
biallelic_fish_maf

Unnamed: 0,chr,pos,index,ref_n,A,C,G,T,A.1,C.1,...,G.20,T.20,A.21,C.21,G.21,T.21,A.22,C.22,G.22,T.22
0,LR537144.1,10000025,6430776,C,0,67,0,92,0,75,...,0,7,0,56,0,11,0,35,0,30
1,LR537144.1,10000130,6430850,G,0,98,62,0,0,48,...,56,0,0,15,62,0,0,23,40,0
2,LR537144.1,10000201,6430923,T,0,15,0,152,0,27,...,0,56,0,17,0,77,0,17,0,55
3,LR537144.1,10000306,6431037,G,0,0,132,0,0,0,...,77,0,0,3,65,0,0,0,67,0
4,LR537144.1,10000606,6431194,T,0,0,0,122,0,0,...,0,75,0,0,0,77,0,0,0,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160545,LR537144.1,9999619,413689,A,41,0,142,0,48,0,...,41,0,42,0,32,0,32,0,37,0
160546,LR537144.1,9999796,413868,T,0,114,0,33,0,59,...,0,13,0,25,0,53,0,37,0,25
160547,LR537144.1,9999827,413895,a,131,0,11,0,91,0,...,0,0,92,0,0,0,61,0,2,0
160548,LR537144.1,9999900,413963,G,73,0,72,0,50,0,...,48,0,17,0,64,0,34,0,44,0


Drop the first 4 columns, leaving only the nucleotide columns in a new dataframe

In [33]:
# The four leading columns (chr, pos, index, ref_n) describe the site itself;
# everything to their right is a nucleotide count column.
n_site_columns = 4
nuc_data = biallelic_fish_maf.iloc[:, n_site_columns:]
nuc_data

Unnamed: 0,A,C,G,T,A.1,C.1,G.1,T.1,A.2,C.2,...,G.20,T.20,A.21,C.21,G.21,T.21,A.22,C.22,G.22,T.22
0,0,67,0,92,0,75,0,54,0,79,...,0,7,0,56,0,11,0,35,0,30
1,0,98,62,0,0,48,76,0,0,50,...,56,0,0,15,62,0,0,23,40,0
2,0,15,0,152,0,27,0,104,0,13,...,0,56,0,17,0,77,0,17,0,55
3,0,0,132,0,0,0,114,0,0,3,...,77,0,0,3,65,0,0,0,67,0
4,0,0,0,122,0,0,0,152,0,0,...,0,75,0,0,0,77,0,0,0,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160545,41,0,142,0,48,0,86,0,32,0,...,41,0,42,0,32,0,32,0,37,0
160546,0,114,0,33,0,59,0,37,0,86,...,0,13,0,25,0,53,0,37,0,25
160547,131,0,11,0,91,0,5,0,115,0,...,0,0,92,0,0,0,61,0,2,0
160548,73,0,72,0,50,0,60,0,49,0,...,48,0,17,0,64,0,34,0,44,0


Create a list of population names, in the same order in which the populations' A/C/G/T column groups appear in the biallelic MAF file

In [34]:
# Population names, one per group of four count columns (A, C, G, T), listed
# in the same left-to-right order as those groups in the count table.
# NOTE(review): this list holds 20 names, but the table displayed above has
# count columns up to A.22..T.22 (23 groups) and the saved progress-bar output
# below reports 23/23 -- confirm whether three populations are missing here.
populations = [
    'wGRE_9', 'wGRE_13', 'wSPA_5', 'fSPA_3', 'fFRA_1',
    'fGRE_10', 'wTUR_14', 'wSPA_4', 'wITA_8', 'fSPA_2',
    'fGRE_9', 'fGRE_8', 'fCRO_5', 'wITA_7', 'wGRE_12',
    'wGRE_11', 'wGRE_10', 'fITA_4', 'fGRE_6', 'fGRE_7',
]

In [40]:
def process_population_data(biallelic_fish_maf, populations):
    """Build one colon-separated allele-count string per site for each population.

    Columns of ``biallelic_fish_maf`` are read purely by position: population
    ``i`` owns columns ``4*i .. 4*i + 3``, taken to hold the A, C, G and T
    counts in that order. Column labels are never inspected, so the caller
    must pass a frame that contains only count columns, in population order.

    Parameters
    ----------
    biallelic_fish_maf : pandas.DataFrame
        Count table with four consecutive columns (A, C, G, T) per population
        and no leading metadata columns.
    populations : sequence of str
        Population names, in the same order as the four-column groups.

    Returns
    -------
    dict
        Maps each population name to a ``pandas.Series`` of strings of the
        form ``"A:T:C:G:0:0"`` (one string per row). The last two fields are
        always written as ``0``; in the PoPoolation2 sync layout these
        positions hold the N and deletion counts.

    Raises
    ------
    IndexError
        If the frame has fewer than ``4 * len(populations)`` columns.

    Warns
    -----
    UserWarning
        If the frame has more columns than ``populations`` accounts for;
        the surplus columns are ignored.
    """
    n_expected = 4 * len(populations)
    n_available = biallelic_fish_maf.shape[1]

    # Fail before doing any work instead of part-way through the loop.
    if n_available < n_expected:
        raise IndexError(
            f"{len(populations)} populations need {n_expected} count columns, "
            f"but the table only has {n_available}"
        )
    # Surplus columns would otherwise be dropped without any trace, which
    # hides a population list that is shorter than the data.
    if n_available > n_expected:
        warnings.warn(
            f"the table has {n_available} count columns but {len(populations)} "
            f"populations only use the first {n_expected}; the remaining "
            f"{n_available - n_expected} columns are ignored",
            stacklevel=2,
        )

    pop_data = {}
    for i, pop in tqdm(enumerate(populations), total=len(populations), desc="Processing Populations"):
        base_col = i * 4
        # Source column order within a group is A, C, G, T.
        a, c, g, t = (
            biallelic_fish_maf.iloc[:, base_col + offset].astype(str)
            for offset in range(4)
        )
        # Output field order is A:T:C:G, so T moves ahead of C and G.
        pop_data[pop] = a + ':' + t + ':' + c + ':' + g + ':0:0'
    return pop_data


In [41]:
# Turn the nucleotide-count table into per-population colon-separated strings
pop_data = process_population_data(
    biallelic_fish_maf=nuc_data,
    populations=populations,
)

Processing Populations: 100%|██████████| 23/23 [00:14<00:00,  1.60it/s]


Build a dataframe from the `pop_data` dictionary, with its columns arranged in the `ordered_populations` order


In [42]:
# Output column order: all "farmed" (f-prefixed) populations first for ease
# of use, followed by the w-prefixed ones.
farmed_first = [
    'fSPA_3', 'fFRA_1', 'fGRE_10', 'fSPA_2', 'fGRE_9',
    'fGRE_8', 'fCRO_5', 'fITA_4', 'fGRE_6', 'fGRE_7',
]
remaining = [
    'wGRE_9', 'wGRE_13', 'wSPA_5', 'wTUR_14', 'wSPA_4',
    'wITA_8', 'wITA_7', 'wGRE_12', 'wGRE_11', 'wGRE_10',
]
ordered_populations = farmed_first + remaining

# One column per population, selected in the order defined above.
sync_fish = pd.DataFrame(pop_data)[ordered_populations]

Add the site columns (`chr`, `pos`, `ref_n`) to the front of the dataframe

In [43]:
# Select the site columns by name rather than by position: the source table
# is laid out as chr, pos, index, ref_n, so position 2 is the `index` column
# and not the reference nucleotide.
site_columns = biallelic_fish_maf[["chr", "pos", "ref_n"]]

# Build the final table as a new frame instead of inserting in place, so that
# re-running this cell does not fail with "cannot insert chr, already exists".
# Selecting the population columns explicitly discards any site columns left
# over from a previous run of this cell.
sync_fish = pd.concat([site_columns, sync_fish[ordered_populations]], axis=1)

# Both inputs come from the same source rows; a length change would mean their
# indexes did not line up during the concatenation.
assert len(sync_fish) == len(biallelic_fish_maf), "row count changed while adding site columns"


## File export

In [44]:
# Write the synchronized table used by the downstream analyses:
# space-separated, without a header row and without the index column.
sync_output_path = r"CHROMOSOME_NAME_popoolation_.csv"
sync_fish.to_csv(sync_output_path, sep=" ", header=False, index=False)