## Einstein Omics Workshop: Population Stratification Analysis 
**Author: David Yang**<br>
**Date: February 2024**


In [None]:
import warnings
warnings.filterwarnings('ignore')
#libraries used in this python script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from matplotlib.pyplot import figure




In [None]:
# Global plot styling: seaborn theme plus matplotlib font sizes.
sns.set_style('darkgrid')        # one of: darkgrid, whitegrid, dark, white, ticks
plt.rc('axes', titlesize=18)     # fontsize of the axes title
plt.rc('axes', labelsize=14)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=13)    # fontsize of the x tick labels
plt.rc('ytick', labelsize=13)    # fontsize of the y tick labels
plt.rc('legend', fontsize=13)    # legend fontsize
plt.rc('font', size=13)          # controls default text sizes

# Make the colorblind-friendly palette the default color cycle.
# sns.color_palette('colorblind') on its own only *returns* the palette;
# sns.set_palette is what actually applies it.
sns.set_palette('colorblind')

# Show the now-active palette as the cell output.
sns.color_palette()

In [None]:
# Load the PCA eigenvectors calculated by PLINK. The file is whitespace-delimited and is
# read without a header row, so the columns are labelled 0, 1, 2, ...
# sep=r"\s+" replaces delim_whitespace=True, which pandas deprecated in version 2.2.
pca_vals = pd.read_table("results/PCA_result.eigenvec", sep=r"\s+", header=None)

# Column 1 is the one later mapped to population labels and named 'IID'.
pca_ids = pca_vals[1]

# Quick look: rows 1-5 of the IDs (row 0 is not printed), then the first rows of the table.
print(pca_ids[1:6])
pca_vals.head(5)


In [None]:
# Load the sample metadata that is matched against the sample IDs in pca_vals.
# The first line of the file holds the column names, so let pandas parse it as the header
# (header=0) instead of reading it as data and promoting it by hand. This yields a clean
# 0-based index and lets pandas infer column dtypes from the data rows only.
# sep=r"\s+" replaces delim_whitespace=True, which pandas deprecated in version 2.2.
all_metadata = pd.read_table("txt_files/population_labels.txt", sep=r"\s+", header=0)

# Display the first rows as the cell output
all_metadata.head(5)
            

In [None]:
# Pull out the per-individual population label column and preview a slice of it
Population_labels = all_metadata['Population']
print(Population_labels[1:10])

# Dump the labels to a plain-text file, one label per line
# (labels are joined with newlines, so nothing follows the last label).
# NOTE(review): 'abels_ind2pop.txt' looks like a typo for 'labels_ind2pop.txt' -- confirm
# which name downstream tools expect before renaming.
with open('txt_files/abels_ind2pop.txt', 'w') as out_handle:
    label_lines = Population_labels.astype(str)
    out_handle.write('\n'.join(label_lines))
   

In [None]:
# Read the metadata, taking the column names from the first line of the file.
# sep=r"\s+" replaces delim_whitespace=True, which pandas deprecated in version 2.2.
all_metadata = pd.read_table("txt_files/population_labels.txt", sep=r"\s+", header=0)

# Unique population labels, in order of first appearance
admixture_list = all_metadata['Population'].unique().tolist()

# Dictionary mapping individual IDs (IID) to population labels.
# If an IID occurs more than once, the last occurrence wins.
pop_names = dict(zip(all_metadata['IID'], all_metadata['Population']))

# Display the extracted population labels and the individual-to-population mapping
print("Unique Population Labels:", admixture_list)
print("\nIndividual-to-Population Mapping:")
print(pop_names)

# Export pop_names as a tab-delimited file: one header row, then one row per individual.
with open('txt_files/pop_names.txt', 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    # The header names the two columns. Writing pop_names.keys() here would instead put
    # every individual ID on the header line.
    writer.writerow(['IID', 'Population'])
    writer.writerows(pop_names.items())

# Add the population label of each sample as the third column of the PCA table.
# DataFrame.insert raises ValueError if the column already exists, so skip it when this
# cell is re-run; the lookup on column 1 is then skipped as well.
if 'Population_name' not in pca_vals.columns:
    pca_vals.insert(2, 'Population_name', pca_vals[1].map(pop_names))
pca_vals.to_csv("txt_files/pca_eigenvec_pop_labels.csv")

In [None]:
# Give every column a descriptive name: the three leading columns become
# FID / IID / Population_name, and each remaining column becomes PC1, PC2, ...
n_components = pca_vals.shape[1] - 3
pc_labels = [f'PC{k}' for k in range(1, n_components + 1)]
pca_vals.columns = ['FID', 'IID', 'Population_name', *pc_labels]

# Show the first rows with the new column names
pca_vals.head()


In [None]:
# Colormap the population colors are drawn from
cmap = plt.cm.tab20

# Number of distinct labels in the Population_name column
num_population = pca_vals["Population_name"].nunique()

# Scales a group index in 0..num_population to the colormap's 0..1 input range
norm = plt.Normalize(0, num_population)

# PC1 vs PC2 scatter, one series per population label
fig, ax = plt.subplots(figsize=(8, 6), dpi=200)
groups = pca_vals.groupby("Population_name")

for i, (name, group) in enumerate(groups):
    if name == "OWN":
        # Samples labelled "OWN" are drawn as larger black crosses above the other points
        style = dict(marker="x", s=40, color='black', zorder=10)
    else:
        style = dict(marker="o", s=20, color=cmap(norm(i)), zorder=5)
    ax.scatter(group["PC1"], group["PC2"], label=name, **style)

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(bbox_to_anchor=(1, 1), prop={'size': 10}, ncol=2)

# Save the figure to disk, then render it in the notebook
fig.savefig('PCA_PC1-PC2.png', bbox_inches="tight")
plt.show()