In [1]:
import pandas as pd
import os

In [2]:
# Prerequisite: the sequences have already been aligned against IGHC; that
# alignment step is not part of this notebook.
# This notebook integrates the resulting IGHC table with all_info and writes
# a new all_info file and a new clones file.
# MYDIR is the directory that is scanned for the input files and that also
# receives the output files. Keep the trailing slash: output paths are built
# as MYDIR + filename.
MYDIR = "../final/"

In [3]:
# Map every sample name to its IGHC alignment file found in MYDIR
ighc_files = {}
for myfile in os.listdir(MYDIR):
    if not myfile.endswith("-IGHC_CH12_human-easy-import.txt"):
        continue
    # The sample name is everything in front of the first "_L001"
    mysample = myfile.partition("_L001")[0]
    ighc_files[mysample] = myfile
ighc_files

{'IgSub5-3_S144': 'IgSub5-3_S144_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt',
 'IgSub1-3_S100': 'IgSub1-3_S100_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt',
 'IgSub2-3_S104': 'IgSub2-3_S104_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt',
 'IgSub3-3_S122': 'IgSub3-3_S122_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt',
 'IgSub4-3_S126': 'IgSub4-3_S126_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt',
 'IgSub6-3_S148': 'IgSub6-3_S148_L001.assembled-ACGTACGT-IGHC_CH12_human-easy-import.txt'}

In [4]:
# Map every sample name to its all_info.csv file found in MYDIR
allinfo_files = {}
for myfile in os.listdir(MYDIR):
    if not myfile.endswith("-all_info.csv"):
        continue
    # The sample name is everything in front of the first "_L001"
    mysample = myfile.partition("_L001")[0]
    allinfo_files[mysample] = myfile
allinfo_files

{'IgSub5-3_S144': 'IgSub5-3_S144_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv',
 'IgSub1-3_S100': 'IgSub1-3_S100_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv',
 'IgSub2-3_S104': 'IgSub2-3_S104_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv',
 'IgSub3-3_S122': 'IgSub3-3_S122_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv',
 'IgSub4-3_S126': 'IgSub4-3_S126_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv',
 'IgSub6-3_S148': 'IgSub6-3_S148_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv'}

In [5]:
def integrateIghcAllinfo(sample, allinfo_files, ighc_files, mydir=None):
    """Add the IGHC alignment result of one sample to its all_info table.

    Parameters
    ----------
    sample : str
        Key that is present in both ``allinfo_files`` and ``ighc_files``.
    allinfo_files : dict
        Maps sample name -> file name of the tab-separated all_info table
        (must contain an ``acc`` column).
    ighc_files : dict
        Maps sample name -> file name of the tab-separated, header-less IGHC
        table with three columns: accession, alignment flag, reference name.
    mydir : str, optional
        Directory that contains both files. Defaults to the notebook-level
        ``MYDIR`` when omitted.

    Returns
    -------
    pandas.DataFrame
        The all_info table left-joined on ``acc`` with the columns
        ``bwa_flag`` and ``C_region``. Reads without an accepted IGHC
        alignment keep their row and get missing values in those columns.

    Raises
    ------
    KeyError
        If ``sample`` is missing from one of the two dictionaries.
    IndexError
        If the reference name of an accepted alignment contains no ``|``.
    """
    if mydir is None:
        mydir = MYDIR

    # Read all_info and IGHC tables
    df_a = pd.read_csv(os.path.join(mydir, allinfo_files[sample]), sep="\t", na_values=['None', ''])
    df_c = pd.read_csv(os.path.join(mydir, ighc_files[sample]), sep="\t", header=None, na_values=['None', ''])
    df_c.columns = ['acc', 'bwa_flag', 'C_region']

    # Filter on bwa_flag (should be 0 or 16) BEFORE touching the reference
    # names: rows that are discarded anyway (e.g. a read whose reference is
    # "*" or missing) must not be able to break the name clean-up below.
    # .copy() so that the assignment below works on an independent frame.
    df_c = df_c.loc[df_c['bwa_flag'].isin([0, 16])].copy()

    # Clean up the gene name from the alignment: keep the second "|"-separated
    # field. Missing values are passed through untouched.
    df_c["C_region"] = df_c["C_region"].map(lambda label: label.split("|")[1], na_action="ignore")

    # Merge the two tables, keeping every all_info row
    df = pd.merge(df_a, df_c, how="left", on="acc")

    return df

In [6]:
def concat_jc_gene(genes):
    """Join the distinct gene names of a group into one comma-separated string.

    Every value is converted to ``str`` first and only then de-duplicated, so
    values that are unequal as objects but print identically (for example
    several NaN entries of reads without an alignment) appear only once.

    Parameters
    ----------
    genes : iterable
        Gene names; may contain non-string values such as NaN.

    Returns
    -------
    str
        The unique string forms, sorted alphabetically and joined with ",".
        An empty input gives an empty string.
    """
    unique_names = {str(gene) for gene in genes}
    return ",".join(sorted(unique_names))

In [7]:
def create_clones(df):
    """Collapse the per-read table into one row per (cdr3pep, V_sub) clone.

    Reads are kept only when ``cdr3_qual_min`` is at least 30, ``V_sub`` and
    ``J_sub`` are present, and both ``V_flag`` and ``J_flag`` are 0 or 16.

    In the result, ``J_sub`` and ``C_region`` are aggregated with
    ``concat_jc_gene``; ``beforeMID`` and ``acc`` hold the number of distinct
    values per clone. Rows are ordered by ``beforeMID``, largest first.
    """
    # Build the read filter one criterion at a time
    quality_ok = df['cdr3_qual_min'] >= 30
    has_v = df['V_sub'].notna()
    has_j = df['J_sub'].notna()
    v_flag_ok = df['V_flag'].eq(0) | df['V_flag'].eq(16)
    j_flag_ok = df['J_flag'].eq(0) | df['J_flag'].eq(16)
    reads = df.loc[quality_ok & has_v & has_j & v_flag_ok & j_flag_ok]

    # Group the remaining reads into clones
    wanted_columns = ['cdr3pep', 'V_sub', 'J_sub', 'C_region', 'acc', 'beforeMID', 'cdr3nuc']
    clone_key = ['cdr3pep', 'V_sub']
    how_to_aggregate = {
        'J_sub': concat_jc_gene,
        'C_region': concat_jc_gene,
        'beforeMID': 'nunique',
        'acc': 'nunique',
    }
    per_clone = reads[wanted_columns].groupby(clone_key).agg(how_to_aggregate)

    return per_clone.sort_values(by='beforeMID', ascending=False).reset_index()

In [8]:
# For every sample with an IGHC alignment: write the integrated all_info table
# and the clones table derived from it, both into MYDIR
for sample in ighc_files:
    # all_info extended with the IGHC columns
    df = integrateIghcAllinfo(sample, allinfo_files, ighc_files)
    allinfonew = allinfo_files[sample].replace("-all_info.csv", "-ighc-all_info.csv")
    allinfo_path = MYDIR + allinfonew
    df.to_csv(allinfo_path, sep="\t", index=False)
    print("Wrote", allinfo_path, "to disk")

    # clones built from the integrated table
    clones = create_clones(df)
    clonesnew = allinfo_files[sample].replace("-all_info.csv", "-ighc-clones.csv")
    clones_path = MYDIR + clonesnew
    clones.to_csv(clones_path, sep="\t", index=False)
    print("Wrote", clones_path, "to disk")

Wrote ../final/IgSub5-3_S144_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub5-3_S144_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-clones.csv to disk
Wrote ../final/IgSub1-3_S100_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub1-3_S100_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-clones.csv to disk
Wrote ../final/IgSub2-3_S104_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub2-3_S104_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-clones.csv to disk
Wrote ../final/IgSub3-3_S122_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub3-3_S122_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-clones.csv to disk
Wrote ../final/IgSub4-3_S126_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub4-3_S126_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-clones.csv to disk
Wrote ../final/IgSub6-3_S148_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv to disk
Wrote ../final/IgSub6-3_S148_L0