# Importing Libraries

In [41]:
import pandas as pd
import numpy as np
import os
import re
import plotly_express as px
import plotly.graph_objects as go
data_dir = "/home/inf-21-2024/binp29/population_genetic_project/data/01_raw_data/"


# Methodology

## Step 1: Preprocessing

### Read the merged.fam file

In [42]:
fam_df = pd.read_csv(os.path.join(data_dir, "LocalAncestry/merged.fam"), sep=" ", header=None)
fam_df = fam_df.rename({0:"SAMPLE_ID",1:'GROUP_ID'},axis=1)[['SAMPLE_ID','GROUP_ID']]
def update_sample_id(row):
    # Remove any number suffix in Sample_ID
    sample_base = re.sub(r'\d+$', '', row['SAMPLE_ID'])
    group_base = re.sub(r'\d+$', '', row['GROUP_ID'])
    
    # If the base of Sample_ID matches the base of GROUP_ID, update Sample_ID
    if sample_base == group_base:
        return row['GROUP_ID']
    return row['SAMPLE_ID']

# Apply the function to the DataFrame
fam_df['SAMPLE_ID'] = fam_df.apply(update_sample_id, axis=1)

# View the result
fam_df

Unnamed: 0,SAMPLE_ID,GROUP_ID
0,HGDP00001,Brahui
1,HGDP00003,Brahui
2,HGDP00005,Brahui
3,HGDP00007,Brahui
4,HGDP00011,Brahui
...,...,...
1064,SubsaharanAfrican11,SubsaharanAfrican11
1065,SubsaharanAfrican12,SubsaharanAfrican12
1066,SubsaharanAfrican13,SubsaharanAfrican13
1067,SubsaharanAfrican14,SubsaharanAfrican14


### Ectract Q file data from the LocalAncestry.zip

In [43]:
data_list = []
q_files_dir = os.path.join(data_dir, "LocalAncestry/Q_files")

# Extract and sort files numerically based on the window number
q_files = []
for file in os.listdir(q_files_dir):
    if file.endswith(".Q"):
        try:
            window = int(file.split('_')[1].split('.')[0])  # Extract window number
            q_files.append((window, file))  # Store as tuple (window_number, filename)
        except (IndexError, ValueError):
            print(f"Skipping {file}: Unexpected filename format")
            continue

# Sort files based on the window number
q_files.sort()  # Sorts by the first element in the tuple (window number)

for window, file in q_files:  
    file_path = os.path.join(q_files_dir, file)
    df = pd.read_csv(file_path, sep=" ", header=None)
    df['window'] = window  + 1 # We are parsing in ascending order of the files in each directory so adding one to the window size would not make a difference

    # Reset individual IDs per file (from 1 to 1069)
    df['individual'] = range(1, len(df) + 1)
    df['SAMPLE_ID'] = fam_df['SAMPLE_ID'].to_list()
    df['GROUP_ID'] = fam_df['GROUP_ID'].to_list()

    data_list.append(df)

# Combine all data
if data_list:  
    combined_df = pd.concat(data_list, ignore_index=True)
else:
    print("No valid .Q files found in directory.")

# Rename columns
combined_df = combined_df.rename({i: f'Admixture{i+1}' for i in range(9)}, axis=1)
combined_df


Unnamed: 0,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9,window,individual,SAMPLE_ID,GROUP_ID
0,0.166067,0.137783,0.035062,0.236366,0.132851,0.000011,0.012957,0.278893,0.000010,1,1,HGDP00001,Brahui
1,0.009162,0.052174,0.052450,0.246235,0.057680,0.000010,0.000010,0.416426,0.165853,1,2,HGDP00003,Brahui
2,0.189324,0.000010,0.000010,0.091682,0.015396,0.000010,0.132960,0.490148,0.080459,1,3,HGDP00005,Brahui
3,0.172040,0.000010,0.050372,0.193109,0.000013,0.007881,0.038743,0.537821,0.000010,1,4,HGDP00007,Brahui
4,0.277328,0.055216,0.000010,0.089922,0.000010,0.000010,0.143065,0.434429,0.000010,1,5,HGDP00011,Brahui
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59859,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1065,SubsaharanAfrican11,SubsaharanAfrican11
59860,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1066,SubsaharanAfrican12,SubsaharanAfrican12
59861,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1067,SubsaharanAfrican13,SubsaharanAfrican13
59862,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1068,SubsaharanAfrican14,SubsaharanAfrican14


In [44]:
# Create a list to hold all the 56 DataFrames
dfs = []
results = "/home/inf-21-2024/binp29/population_genetic_project/data/02_GPS/gps_file/"


# Loop through the DataFrame in chunks of 1069 rows
for i in range(0, len(combined_df), 1069):
    # Select the chunk (subset of rows)
    chunk_df = combined_df.iloc[i:i+1069]
    
    # Select the relevant columns (Sample_ID, Admixture1 to Admixture9, GROUP_ID)
    chunk_df = chunk_df[['SAMPLE_ID'] + [f'Admixture{j}' for j in range(1, 10)] + ['GROUP_ID']]
    
    # Append the chunk DataFrame to the list
    dfs.append(chunk_df)
    
    # Optionally, save each chunk as a CSV
    chunk_df.to_csv(f'{results}data_{i//1069}.csv',index=False)

print('Done!')

Done!


### Extract Chromosome position from merged.bim file

In [5]:
bim_df = pd.read_csv(os.path.join(data_dir,"LocalAncestry/merged.bim"),sep="\t",header=None, names=["chr","snp","cm","pos","a1","a2"])
bim_df

Unnamed: 0,chr,snp,cm,pos,a1,a2
0,1,rs11807848,0,1061166,2,4
1,1,rs2887286,0,1156131,2,4
2,1,rs6685064,0,1211292,4,2
3,1,rs1887284,0,1487059,1,3
4,1,rs263531,0,2164935,3,4
...,...,...,...,...,...,...
48264,23,rs7885463,0,150773472,1,3
48265,23,rs1061420,0,151122772,3,1
48266,23,rs1894356,0,152091689,4,3
48267,23,rs12558151,0,152602717,2,4


In [6]:
unique_chromosomes_per_chunk = {}

for i in range(0, len(bim_df), 500):
    chunk = bim_df.iloc[i:i+500]  # Extract 500-row chunk
    unique_chromosomes = chunk["chr"].unique()  
    start_pos = chunk["pos"].iloc[0]   # First position in chunk
    end_pos = chunk["pos"].iloc[-1]    # Last position in chunk

    unique_chromosomes_per_chunk[f"{i//500+1}"] = {
        "chromosomes": unique_chromosomes.tolist(),
        "start_pos": start_pos,
        "end_pos": end_pos
    }

In [7]:
# Create a mapping dictionary for chromosome, start, and end positions
mapped_info = {
    k: {
        "chromosome": ",".join(map(str, v["chromosomes"])),
        "start_pos": v["start_pos"],
        "end_pos": v["end_pos"]
    } 
    for k, v in unique_chromosomes_per_chunk.items()
}

# Convert 'window' column to string to match dictionary keys
combined_df['window'] = combined_df['window'].astype(str)

# Map each window to its corresponding chromosome info
combined_df['chromosome'] = combined_df['window'].map(lambda x: mapped_info.get(x, {}).get("chromosome", "NA"))
combined_df['start_pos'] = combined_df['window'].map(lambda x: mapped_info.get(x, {}).get("start_pos", "NA"))
combined_df['end_pos'] = combined_df['window'].map(lambda x: mapped_info.get(x, {}).get("end_pos", "NA"))

# Final dataframe with mapped values
combined_df


Unnamed: 0,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9,window,individual,SAMPLE_ID,GROUP_ID,chromosome,start_pos,end_pos
0,0.166067,0.137783,0.035062,0.236366,0.132851,0.000011,0.012957,0.278893,0.000010,1,1,HGDP00001,Brahui,1,1061166,22629057
1,0.009162,0.052174,0.052450,0.246235,0.057680,0.000010,0.000010,0.416426,0.165853,1,2,HGDP00003,Brahui,1,1061166,22629057
2,0.189324,0.000010,0.000010,0.091682,0.015396,0.000010,0.132960,0.490148,0.080459,1,3,HGDP00005,Brahui,1,1061166,22629057
3,0.172040,0.000010,0.050372,0.193109,0.000013,0.007881,0.038743,0.537821,0.000010,1,4,HGDP00007,Brahui,1,1061166,22629057
4,0.277328,0.055216,0.000010,0.089922,0.000010,0.000010,0.143065,0.434429,0.000010,1,5,HGDP00011,Brahui,1,1061166,22629057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59859,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1065,SubsaharanAfrican11,SubsaharanAfrican11,9,100674565,120088933
59860,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1066,SubsaharanAfrican12,SubsaharanAfrican12,9,100674565,120088933
59861,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1067,SubsaharanAfrican13,SubsaharanAfrican13,9,100674565,120088933
59862,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920,56,1068,SubsaharanAfrican14,SubsaharanAfrican14,9,100674565,120088933


## Step 2: Assign Local Ancetry to regions

In [8]:
gps_results = "/home/inf-21-2024/binp29/population_genetic_project/data/02_GPS/gps_results/"
gps_list = []

# Extract numeric suffix from filenames and sort numerically
gps_files = []
for file in os.listdir(gps_results):
    if file.startswith("output_data_") and file.endswith(".txt"):  # Ensure correct file format
        try:
            num = int(file.split('_')[-1].split('.')[0])  # Extract numeric part
            gps_files.append((num, file))  # Store as tuple (number, filename)
        except ValueError:
            print(f"Skipping {file}: Unexpected filename format")
            continue

# Sort files numerically by extracted number
gps_files.sort()

# Read and concatenate files in sorted order
for num, file in gps_files:
    file_path = os.path.join(gps_results, file)
    df = pd.read_csv(file_path, sep="\t")
    gps_list.append(df)

# Combine all data
if gps_list:
    gps_df = pd.concat(gps_list, ignore_index=True)
else:
    print("No valid files found.")

gps_df


Unnamed: 0,Population,Sample_no,Sample_id,Prediction,Lat,Lon
0,Brahui,1,HGDP00001,Tatars_2,53.161599,51.400428
1,Brahui,2,HGDP00003,Tatars_1,56.741362,32.101504
2,Brahui,3,HGDP00005,Finnish_1,60.015120,34.861537
3,Brahui,4,HGDP00007,Finnish_1,60.653373,32.265769
4,Brahui,5,HGDP00011,Tatars_2,58.284928,40.256009
...,...,...,...,...,...,...
59859,SubsaharanAfrican11,1,SubsaharanAfrican11,Yoruba_4,7.982425,4.024412
59860,SubsaharanAfrican12,1,SubsaharanAfrican12,Yoruba_4,7.982425,4.024412
59861,SubsaharanAfrican13,1,SubsaharanAfrican13,Yoruba_4,7.982425,4.024412
59862,SubsaharanAfrican14,1,SubsaharanAfrican14,Yoruba_4,7.982425,4.024412


In [9]:
full_df = pd.concat([combined_df,gps_df],axis=1)
full_df = full_df.loc[:, ~full_df.columns.duplicated()]
full_df.head()

Unnamed: 0,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9,window,...,GROUP_ID,chromosome,start_pos,end_pos,Population,Sample_no,Sample_id,Prediction,Lat,Lon
0,0.166067,0.137783,0.035062,0.236366,0.132851,1.1e-05,0.012957,0.278893,1e-05,1,...,Brahui,1,1061166,22629057,Brahui,1,HGDP00001,Tatars_2,53.161599,51.400428
1,0.009162,0.052174,0.05245,0.246235,0.05768,1e-05,1e-05,0.416426,0.165853,1,...,Brahui,1,1061166,22629057,Brahui,2,HGDP00003,Tatars_1,56.741362,32.101504
2,0.189324,1e-05,1e-05,0.091682,0.015396,1e-05,0.13296,0.490148,0.080459,1,...,Brahui,1,1061166,22629057,Brahui,3,HGDP00005,Finnish_1,60.01512,34.861537
3,0.17204,1e-05,0.050372,0.193109,1.3e-05,0.007881,0.038743,0.537821,1e-05,1,...,Brahui,1,1061166,22629057,Brahui,4,HGDP00007,Finnish_1,60.653373,32.265769
4,0.277328,0.055216,1e-05,0.089922,1e-05,1e-05,0.143065,0.434429,1e-05,1,...,Brahui,1,1061166,22629057,Brahui,5,HGDP00011,Tatars_2,58.284928,40.256009


In [10]:
temp_df = full_df.copy()
temp_df['row'] = temp_df.groupby('Sample_id').cumcount()  # Assign row index (0-55)
temp_df.drop(columns=['GROUP_ID'],axis=1,inplace=True)

# Set MultiIndex
temp_df = temp_df.set_index(['Sample_id', 'row']).sort_index()
temp_df = temp_df[~temp_df['chromosome'].astype(str).str.contains(',')]

# Display result
test_case = temp_df.head(60)  # First few rows

## Step 3: Merging Ancestry Segments

In [11]:
# Assuming temp_df is already loaded
temp_df["Group"] = (temp_df["Prediction"] != temp_df["Prediction"].shift()).cumsum()

# Define aggregation functions
agg_funcs = {
    "start_pos": "first",
    "end_pos": "last",
    "Lat": "mean",
    "Lon": "mean",
}

# Dynamically find all Admixture columns and set aggregation to mean
admixture_cols = temp_df.filter(like="Admixture").columns
for col in admixture_cols:
    agg_funcs[col] = "mean"

# Include other columns that should remain unchanged
other_cols = ["SAMPLE_ID", "window", "individual",
              "Population", "Sample_no", "Prediction"]

for col in other_cols:
    agg_funcs[col] = "first"

# Group and aggregate
df_merged = temp_df.groupby(["chromosome", "Group"], as_index=False).agg(agg_funcs)

# Drop temporary Group column
df_merged.drop(columns=["Group"], inplace=True)

# Define the desired column order
column_order = [
    "SAMPLE_ID", "individual", "chromosome", "start_pos", "end_pos", 
    "Prediction", "Population", "Lat", "Lon"
] + list(temp_df.filter(like="Admixture").columns)

# Reorder the dataframe
df_merged = df_merged[column_order]
df_merged = df_merged.sort_values(by=["individual", "chromosome","start_pos"], ascending=[True, True,True])

# Display the reordered dataframe
df_merged

Unnamed: 0,SAMPLE_ID,individual,chromosome,start_pos,end_pos,Prediction,Population,Lat,Lon,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9
0,HGDP00001,1,1,1061166,50722989,Tatars_2,Brahui,55.905289,45.338307,0.152809,0.089550,0.017739,0.216834,0.079136,0.031447,0.006483,0.364705,0.041295
1,HGDP00001,1,1,50757885,100050789,Altaians_4,Brahui,49.639363,77.548065,0.399954,0.023092,0.074883,0.030503,0.007920,0.012486,0.057318,0.319434,0.074411
2,HGDP00001,1,1,100051668,185344764,Tatars_2,Brahui,56.775267,49.379509,0.325240,0.021357,0.000010,0.078481,0.066444,0.026460,0.007491,0.399812,0.074706
3,HGDP00001,1,1,185410975,211776439,Altaians_4,Brahui,54.436350,62.700449,0.381519,0.000010,0.000010,0.000010,0.130543,0.000327,0.077793,0.389670,0.020118
4,HGDP00001,1,1,211785065,230894365,Finnish_1,Brahui,60.904016,21.054910,0.003562,0.000010,0.000010,0.056238,0.000010,0.000010,0.129396,0.810754,0.000010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18183,SubsaharanAfrican1,1055,5,15571736,175059713,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920
20553,SubsaharanAfrican1,1055,6,14329647,146960353,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920
22941,SubsaharanAfrican1,1055,7,3609712,146984783,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920
25431,SubsaharanAfrican1,1055,8,9375944,126554039,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.999920


In [12]:
df_merged.tail(80)

Unnamed: 0,SAMPLE_ID,individual,chromosome,start_pos,end_pos,Prediction,Population,Lat,Lon,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9
10020,Mediterranean1,935,2,12589208,229406167,Japan_4,Mediterranean1,41.454938,120.459924,0.99992,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
13874,Mediterranean1,935,3,3846526,193722330,Japan_4,Mediterranean1,41.454938,120.459924,0.99992,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
16307,Mediterranean1,935,4,18473310,185788762,Japan_4,Mediterranean1,41.454938,120.459924,0.99992,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
18175,Mediterranean1,935,5,15571736,175059713,Japan_4,Mediterranean1,41.454938,120.459924,0.99992,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
20545,Mediterranean1,935,6,14329647,146960353,Japan_4,Mediterranean1,41.454938,120.459924,0.99992,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18183,SubsaharanAfrican1,1055,5,15571736,175059713,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.99992
20553,SubsaharanAfrican1,1055,6,14329647,146960353,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.99992
22941,SubsaharanAfrican1,1055,7,3609712,146984783,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.99992
25431,SubsaharanAfrican1,1055,8,9375944,126554039,Yoruba_4,SubsaharanAfrican1,7.982425,4.024412,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.99992


## Step 4: Visualzation

## Chromosome Plots

In [13]:
test_df = df_merged[df_merged['SAMPLE_ID']=="HGDP00021"]
test_df

Unnamed: 0,SAMPLE_ID,individual,chromosome,start_pos,end_pos,Prediction,Population,Lat,Lon,Admixture1,Admixture2,Admixture3,Admixture4,Admixture5,Admixture6,Admixture7,Admixture8,Admixture9
57,HGDP00021,10,1,1061166,71556842,Tatars_2,Brahui,56.763733,47.830288,0.309244,0.001775,0.024306,0.035307,0.0772,0.028757,0.068404,0.383985,0.071021
58,HGDP00021,10,1,71590877,100050789,Altaians_4,Brahui,50.23312,86.588286,0.474894,0.027756,1e-05,0.157944,1e-05,0.048879,1e-05,0.257492,0.033006
59,HGDP00021,10,1,100051668,185344764,Tatars_2,Brahui,58.162374,41.570558,0.285486,1e-05,1e-05,0.166253,0.053178,0.003635,1e-05,0.472241,0.019176
60,HGDP00021,10,1,185410975,211776439,Finnish_1,Brahui,61.335769,26.266626,0.143214,1e-05,1e-05,0.016461,0.137687,0.040895,0.03751,0.624203,1e-05
61,HGDP00021,10,1,211785065,230894365,Tatars_4,Brahui,54.986374,42.745076,0.175145,0.209066,1.1e-05,0.131271,1e-05,0.065682,0.1033,0.315505,1e-05
62,HGDP00021,10,1,230906954,245770642,Tatars_2,Brahui,56.942701,46.779704,0.26967,1e-05,0.135518,0.162474,1e-05,1e-05,1e-05,0.364694,0.067605
5014,HGDP00021,10,2,12589208,50975185,Tatars_2,Brahui,57.643467,42.479475,0.160343,0.037613,1e-05,0.223073,0.015677,1e-05,0.114313,0.392222,0.056742
5015,HGDP00021,10,2,50976485,103123301,Finnish_1,Brahui,60.502,32.752626,0.260841,1e-05,1e-05,0.050314,0.056185,1e-05,0.054946,0.577673,1e-05
5016,HGDP00021,10,2,103176411,130665622,Tatars_2,Brahui,58.061589,42.956811,0.311196,1e-05,0.061522,0.08463,0.078176,1e-05,1e-05,0.461407,0.003039
5017,HGDP00021,10,2,131063169,153872845,Finnish_2,Brahui,60.279164,33.931368,0.10937,1e-05,1e-05,0.32514,1e-05,1e-05,0.044089,0.521351,1e-05


In [36]:
# Ensure chromosome is categorical
test_df['chromosome'] = test_df['chromosome'].astype(str)

# Assign unique colors for different predictions
unique_predictions = test_df['Prediction'].unique()
color_map = {pred: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, pred in enumerate(unique_predictions)}

fig = go.Figure()
first_occurrence = {}

for _, row in test_df.iterrows():
    prediction = row["Prediction"]
    sample_id = row['SAMPLE_ID']

    # Show legend only for the first occurrence of each prediction type
    show_legend = False if prediction in first_occurrence else True
    first_occurrence[prediction] = True  

    # Generate intermediate points for smoother hover
    x_values = np.linspace(row["start_pos"], row["end_pos"], num=20)  # Fewer points in range slider
    y_values = [row["chromosome"]] * len(x_values)  # Keep Y fixed for chromosome

    # Main chromosome segment (Main plot)
    fig.add_trace(go.Scatter(
        x=x_values,
        y=y_values,
        mode="lines+markers",
        marker=dict(size=3, opacity=0),  # Invisible markers for better hover effect
        line=dict(color=color_map[prediction], width=18),  # Width for main plot
        name=prediction if show_legend else None,
        showlegend=show_legend,
        hoveron="points+fills",
        hoverinfo="text",
        text=[f"Sample: {row['SAMPLE_ID']}<br>"
              f"Chromosome: {row['chromosome']}<br>"
              f"Position: {int(x)}<br>"
              f"Prediction: {row['Prediction']}<br>"
              f"Population: {row['Population']}"
              for x in x_values]
    ))


fig.update_layout(
    title=f"Chromosome Segments of {sample_id}",
    title_x=0.5,  # Centers the title
    title_font=dict(size=30),  # Increase title font size
    xaxis_title="Genomic Position",
    yaxis_title="Chromosome",
    hovermode="closest",
    height=800,
    dragmode="zoom",
    xaxis=dict(
        showgrid=True,
        zeroline=False,
        showline=True,
        rangeslider=dict(
            visible=True,
            thickness=0.30,
            borderwidth=1,
        ),
        type="linear",
        range=[0, None],
        title_font=dict(size=22),  # Increase x-axis title size
        tickfont=dict(size=16),  # Increase x-axis tick label size
    ),
    yaxis=dict(
        showgrid=True,
        zeroline=False,
        showline=True,
        categoryorder="category ascending",
        fixedrange=False,
        title_font=dict(size=22),  # Increase y-axis title size
        tickfont=dict(size=16),  # Increase y-axis tick label size
    ),
    legend_title="Samples and Predictions",
    legend_title_font=dict(size=24),  # Increase legend title size
    legend_font=dict(size=18),  # Increase legend label size
    hoverlabel=dict(
        font_size=16,  # Increase hover label font size
    ),
    legend=dict(
        font=dict(size=20)  # Increase legend text size
    ),
    legend_orientation="v",  # Vertical legend layout
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Worldmap plots

In [38]:
test_df.columns

Index(['SAMPLE_ID', 'individual', 'chromosome', 'start_pos', 'end_pos',
       'Prediction', 'Population', 'Lat', 'Lon', 'Admixture1', 'Admixture2',
       'Admixture3', 'Admixture4', 'Admixture5', 'Admixture6', 'Admixture7',
       'Admixture8', 'Admixture9'],
      dtype='object')

In [40]:

# Assuming the dataframe 'df' is already loaded with your data (replace with your actual DataFrame)

# Create the plotly map with scatter_geo
fig = px.scatter_geo(test_df,
                     lat='Lat',  # Latitude
                     lon='Lon',  # Longitude
                     color='Admixture1',  # Color based on Admixture1 (change to any of the admixtures)
                     hover_name='SAMPLE_ID',  # Display sample ID on hover
                     hover_data=['Population', 'Prediction', 'Admixture1'],  # Display additional data on hover
                     color_continuous_scale='Viridis',  # You can change the color scale
                     size_max=15,  # Adjust the max marker size
                     title="Admixture Proportions Across Samples",
                     projection="natural earth"  # A natural earth map projection
                    )

# Add details like map title, map center, and zoom
fig.update_geos(
    showcoastlines=True,
    coastlinecolor="Black",
    projection_type="natural earth",
    center={"lat": 20, "lon": 0},
    projection_scale=1.5,
)

# Customize layout for better clarity
fig.update_layout(
    geo=dict(
        showland=True,
        landcolor="lightgray",
        showlakes=True,
        lakecolor="white",
    ),
    title="Admixture Proportions and Geographical Distribution",
)

# Show the figure
fig.show()
