In [2]:
### Preprocessing
# Check that only XX and XXY samples are present and that they are correctly labeled

import argparse
import glob
import subprocess
import pysam
import pandas as pd
import random
import matplotlib.pyplot as plt
from ast import literal_eval
import seaborn as sns
import scipy.stats as stats


In [3]:
# Load the NPC per-SNP table and the NPC sample list.
npc_df = pd.read_csv(
    "/net/noble/vol1/home/es1/ips_project/NPC_validation_combined_data_v2.csv"
)
npc_geno = pd.read_csv(
    "/net/noble/vol1/home/es1/ips_project/NPC_Cardio_genotype/NPC_sample_list.txt"
)

# Keep only rows whose cell_barcode appears as a Sample_name in the sample list.
npc_in_sample_list = npc_df['cell_barcode'].isin(npc_geno['Sample_name'])
filtered_npc_df = npc_df.loc[npc_in_sample_list]

# Left-join the sample list (cell_barcode == Sample_name) so that its columns,
# including 'Genotype', are attached to every retained row.
filtered_npc_df_geno = pd.merge(
    filtered_npc_df,
    npc_geno,
    how='left',
    left_on='cell_barcode',
    right_on='Sample_name',
)




In [4]:
# Load the cardiomyocyte per-SNP table and the cardiomyocyte sample list.
cardio_df = pd.read_csv(
    "/net/noble/vol1/home/es1/ips_project/Cardiomyocytes_validation_combined_data_v2.csv"
)
cardio_geno = pd.read_csv(
    "/net/noble/vol1/home/es1/ips_project/NPC_Cardio_genotype/Cardiomyocytes_sample_list.txt"
)

# Keep only rows whose cell_barcode appears as a Sample_name in the sample list.
cardio_in_sample_list = cardio_df['cell_barcode'].isin(cardio_geno['Sample_name'])
filtered_cardio_df = cardio_df.loc[cardio_in_sample_list]

# Left-join the sample list (cell_barcode == Sample_name) to attach its columns
# to every retained row.
# NOTE(review): presumably this brings in a 'Genotype' column as in the NPC
# table -- confirm against the sample list file.
filtered_cardio_df_geno = pd.merge(
    filtered_cardio_df,
    cardio_geno,
    how='left',
    left_on='cell_barcode',
    right_on='Sample_name',
)


In [34]:
# Save the sample-list-annotated NPC table as an intermediate CSV.
# index=False is not passed here, so the DataFrame index is written as the
# first column of the file.
# NOTE(review): this cell's execution count (34) is out of sequence with the
# cells around it -- confirm it still behaves as intended on a top-to-bottom run.
filtered_npc_df_geno.to_csv(
    path_or_buf="/net/noble/vol1/home/es1/ips_project/NPC_intermediate.csv"
)




In [5]:
# Display the column names of the sample-list-annotated NPC table.
filtered_npc_df_geno.columns

Index(['SNP_ID', 'CHR', 'POS', 'REF', 'ALT', 'cell_barcode', 'REFcount',
       'ALTcount', 'OTHcount', 'Region', 'Gene', 'ALL_Freq', 'Sample_name',
       'Genotype'],
      dtype='object')

In [6]:
# Sample_name pairs that are treated as replicates of one another.
# Each inner list is [sample1, sample2].
replicate_pairs = [
    ["XXY_X5_a_S2", "XXY_X5_b_S3"],
    ["F11_1_S7", "F11_1A_S7"],
    ["F12_1_S8", "F12_1A_S8"],
    ["X4-2_S1", "X4_2_new_S7"],
]



In [7]:
# Collapse replicate samples.
#
# For every [sample1, sample2] pair in replicate_pairs:
#   * SNP_IDs present in BOTH samples get their REFcount / ALTcount / OTHcount
#     summed onto the sample1 row, and the matching sample2 row is removed;
#   * SNP_IDs present in only one of the two samples are left untouched.
#
# The summed counts are written back by row label (not by position), so the
# result does not depend on the merge output and the boolean-mask selection
# happening to share the same row order.
count_columns = ['REFcount', 'ALTcount', 'OTHcount']

# Work on a copy so filtered_npc_df_geno itself is never modified.
npc_df_no_reps = filtered_npc_df_geno.copy()

for sample_pair in replicate_pairs:
    sample1, sample2 = sample_pair

    # Rows belonging to each replicate.
    df_sample1 = npc_df_no_reps[npc_df_no_reps['Sample_name'] == sample1]
    df_sample2 = npc_df_no_reps[npc_df_no_reps['Sample_name'] == sample2]

    # Inner-join on SNP_ID. Each sample1 row carries its index label along as
    # 'row_label' so the summed counts can be assigned to exactly that row.
    common_snps = pd.merge(
        df_sample1.rename_axis('row_label').reset_index(),
        df_sample2,
        on='SNP_ID',
        suffixes=('_1', '_2'),
    )

    # Nothing shared between the two samples (or one of them is absent):
    # there is nothing to sum and nothing to remove.
    if common_snps.empty:
        continue

    # A sample1 row that matched several sample2 rows means SNP_ID is repeated
    # within sample2; the summed value would be ambiguous, so fail loudly.
    if common_snps['row_label'].duplicated().any():
        raise ValueError(
            f"SNP_ID is not unique within replicate {sample2!r}; "
            f"cannot sum its counts into {sample1!r} unambiguously"
        )

    # Sum REFcount, ALTcount and OTHcount across the two replicates.
    for column in count_columns:
        common_snps[column] = common_snps[f'{column}_1'] + common_snps[f'{column}_2']

    # Write the summed counts onto the sample1 rows, addressed by row label.
    npc_df_no_reps.loc[common_snps['row_label'].to_numpy(), count_columns] = (
        common_snps[count_columns].to_numpy()
    )

    # Drop the sample2 rows whose counts were just folded into sample1.
    # The explicit .copy() keeps the next iteration's .loc assignment from
    # operating on a slice (which can emit SettingWithCopyWarning).
    is_merged_sample2_row = (
        (npc_df_no_reps['Sample_name'] == sample2)
        & npc_df_no_reps['SNP_ID'].isin(common_snps['SNP_ID'])
    )
    npc_df_no_reps = npc_df_no_reps[~is_merged_sample2_row].copy()

# Renumber the rows 0..n-1 after the removals.
npc_df_no_reps = npc_df_no_reps.reset_index(drop=True)

In [8]:
# Give both members of every replicate pair one shared Sample_name of the
# form "<sample1>_<sample2>".
for sample1, sample2 in replicate_pairs:
    combined_sample_name = f"{sample1}_{sample2}"

    # Rows currently labelled with either member of the pair
    belongs_to_pair = npc_df_no_reps['Sample_name'].isin([sample1, sample2])
    npc_df_no_reps.loc[belongs_to_pair, 'Sample_name'] = combined_sample_name


In [9]:
# Label every row of the replicate-collapsed NPC table with its cell type.
npc_df_no_reps = npc_df_no_reps.assign(Cell_type="NPC")

In [10]:
# Label every row of the cardiomyocyte table with its cell type.
filtered_cardio_df_geno = filtered_cardio_df_geno.assign(Cell_type="Cardiomyocyte")

In [11]:
# Stack the NPC and cardiomyocyte tables into one frame with a fresh 0..n-1 index.
combined_NPC_cardio = pd.concat(
    objs=[npc_df_no_reps, filtered_cardio_df_geno],
    ignore_index=True,
)



In [12]:
def extract_donor(cell_barcode):
    """Return the donor token of a sample barcode.

    Hyphens are treated as underscores and the barcode is split on
    underscores. The first token is returned, unless it is exactly 'XXY',
    in which case the second token is returned instead.
    """
    tokens = cell_barcode.replace('-', '_').split('_')
    if tokens[0] == 'XXY':
        return tokens[1]
    return tokens[0]

In [13]:
# Derive a donor label for every row from its cell_barcode.
donor_labels = combined_NPC_cardio['cell_barcode'].map(extract_donor)
combined_NPC_cardio['Donor'] = donor_labels



8

In [15]:
# Load the per-gene X-inactivation status table.
gene_xci_status = pd.read_csv(
    "/net/noble/vol1/home/es1/ips_project/Stranger_gene_XCI_status.csv"
)

# Drop any AP1S2 row whose status is 'inactivated'; all other rows are kept.
is_ap1s2 = gene_xci_status['Gene Name'] == 'AP1S2'
is_inactivated = gene_xci_status['X inactivation status'] == 'inactivated'
gene_xci_status = gene_xci_status.loc[~(is_ap1s2 & is_inactivated)]

# Left-join the status table onto the combined table (Gene == 'Gene Name').
# NOTE(review): if any other gene appears more than once in the status table,
# this join will duplicate the matching SNP rows -- confirm 'Gene Name' is unique.
combined_df_gene = pd.merge(
    combined_NPC_cardio,
    gene_xci_status,
    how='left',
    left_on='Gene',
    right_on='Gene Name',
)


In [46]:
# Save the gene-annotated combined table to CSV without the index column.
combined_df_gene.to_csv(
    path_or_buf="/net/noble/vol1/home/es1/ips_project/combined_NPC_cardio_v2.csv",
    index=False,
)

