In [1]:
import os
import pandas as pd
from Bio import SeqIO

In [2]:
# The input is read as tab-separated even though its name ends in ".csv".
# NOTE(review): this is an absolute path under one user's home directory, so
# the cell presumably only runs on that machine -- consider a configurable
# data directory.
f = "/home/barbera/Downloads/IgG/IgSub5-3_S144_L001.assembled-ACGTACGT-IGH_HUMAN-ighc-all_info.csv"
df = pd.read_csv(filepath_or_buffer=f, delimiter="\t")
# Preview the first rows.
df.head()

Unnamed: 0,acc,beforeMID,MID,afterMID,readingframe,cdr3pep,cdr3nuc,cdr3_qual_min,cdr3_qual_max,cdr3_qual_avg,...,nr_v_mains,nr_v_subs,nr_v_alleles,nr_j_subs,nr_j_alleles,acc:2,readingframe:1,nr_sites,bwa_flag,C_region
0,M02984:577:000000000-CFCGB:1:1101:10039:20569,ATTT,ACGTACGT,CGTAT,4,CARQGWGGIQFWDEGTNWGQGTLVT,TGTGCGAGACAGGGTTGGGGAGGTATTCAATTTTGGGATGAAGGGA...,30,40,39.0,...,1,1,1,1,1,M02984:577:000000000-CFCGB:1:1101:10039:20569,4.0,1.0,16.0,IGHG1*03
1,M02984:577:000000000-CFCGB:1:1101:10065:23604,TTTT,ACGTACGT,CGTAT,5,CARGRRSDWFDPWGQGTLVT,TGTGCGAGAGGGAGGAGGTCTGACTGGTTCGACCCCTGGGGCCAGG...,27,40,38.3,...,1,1,1,1,1,,,,16.0,IGHG1*05
2,M02984:577:000000000-CFCGB:1:1101:10098:2385,ACGA,ACGTACGT,CGTAT,5,CAGGRRSDWFDPWGQGTLVT,TGTGCGGGAGGGAGGAGGTCTGACTGGTTCGACCCCTGGGGCCAGG...,11,40,35.2,...,1,1,1,1,1,,,,16.0,IGHG1*03
3,M02984:577:000000000-CFCGB:1:1101:10098:9258,GGGT,ACGTACGT,CGTAT,5,CARGRRSDWFDPWGQGTLVT,TGTGCGAGAGGGAGGAGGTCTGACTGGTTCGACCCCTGGGGCCAGG...,9,40,39.5,...,1,1,1,1,1,,,,16.0,IGHG1*05
4,M02984:577:000000000-CFCGB:1:1101:10110:19181,ATAT,ACGTACGT,CGTAT,5,CARGRRSDWFDPWGQGTLVT,TGTGCGAGAGGGAGGAGGTCTGACTGGTTCGACCCCTGGGGCCAGG...,18,40,38.7,...,1,1,1,1,1,,,,16.0,IGHG1*01


In [3]:
# Display the column labels of df.
df.columns

Index(['acc', 'beforeMID', 'MID', 'afterMID', 'readingframe', 'cdr3pep',
       'cdr3nuc', 'cdr3_qual_min', 'cdr3_qual_max', 'cdr3_qual_avg',
       'cdr3_qual', 'nt_start', 'nt_end', 'seq_length', 'V_flag', 'V_gene',
       'J_flag', 'J_gene', 'readingframe_seq', 'seq', 'pep', 'qual', 'V_sub',
       'J_sub', 'V_main', 'acc:1', 'nr_v_mains', 'nr_v_subs', 'nr_v_alleles',
       'nr_j_subs', 'nr_j_alleles', 'acc:2', 'readingframe:1', 'nr_sites',
       'bwa_flag', 'C_region'],
      dtype='object')

In [4]:
# Which V/J combination occurs most frequently?
# Count the distinct accessions ('acc') per (V_gene, J_gene) pair, order the
# pairs from most to least frequent, and turn the group keys back into columns.
df_vj_count = (
    df.groupby(['V_gene', 'J_gene'])
    .agg({'acc': 'nunique'})
    .sort_values(by='acc', ascending=False)
    .reset_index()
)
df_vj_count.head()

Unnamed: 0,V_gene,J_gene,acc
0,IGHV1-3*01,IGHJ5*02,22720
1,IGHV3-7*01,IGHJ4*02,774
2,IGHV3-23*01,IGHJ4*02,527
3,IGHV3-23*04,IGHJ4*02,439
4,IGHV1-3*02,IGHJ5*02,381


In [5]:
# Select a frequently occurring V/J combination.
# df_vj_count is sorted by descending count and re-indexed from 0, so label 1
# is the second-ranked pair, not the first.
# NOTE(review): confirm that skipping row 0 is intentional given the `_top` names.
v_top, j_top = df_vj_count.loc[1, ['V_gene', 'J_gene']]
print(v_top, j_top)

IGHV3-7*01 IGHJ4*02


In [6]:
# Retrieve sequences for VJ combination and make these sequences unique for further analysis
def concatenate(x):
    """Join the distinct values of ``x`` into one "|"-separated string.

    The values are de-duplicated and then sorted so that the result is the
    same on every run; iterating a bare ``set`` of strings yields an order
    that can differ between interpreter sessions.

    Parameters
    ----------
    x : iterable of str
        Values to combine (for example the accessions of one group).

    Returns
    -------
    str
        The unique values in ascending order, separated by "|"; an empty
        string when ``x`` is empty.
    """
    return "|".join(sorted(set(x)))
# Keep only the reads assigned to the chosen V and J gene, then collapse
# identical sequences into one row each; 'acc' of that row is whatever
# `concatenate` returns for the accessions sharing the sequence.
df_selection = (
    df.loc[(df["V_gene"] == v_top) & (df["J_gene"] == j_top)]
    .groupby("seq")
    .agg({'acc': concatenate})
    .reset_index()
)
df_selection.head()

Unnamed: 0,seq,acc
0,AAAAACGTACGTCGTATCGCCTCCCTCGCGCCAGGCACCAAGCCTG...,M02984:577:000000000-CFCGB:1:2112:24380:4741
1,AAAAACGTACGTCGTATCGCCTCCCTCGCGCCAGTACGTTGACCTG...,M02984:577:000000000-CFCGB:1:1110:14353:7289
2,AAAGACGTACGTCGTACCGCCTCCCTCGCGCCATTGAGGACTCCTG...,M02984:577:000000000-CFCGB:1:1118:18353:18927
3,AAATACGTACGTCGTATCGCCTCCCTCGCGCCATTCTTGGCGCCTG...,M02984:577:000000000-CFCGB:1:1116:6498:21562
4,AAATACGTACGTCGTATCGCCTCCCTCGCGCCATTTTATTTTCCTG...,M02984:577:000000000-CFCGB:1:2114:17939:9317


In [7]:
# Create (or truncate) the two FASTA output files in the working directory.
# The handles stay open on purpose: later cells write to them and close them.
fhOutV = open("test.V.fasta", mode="w")
fhOutJ = open("test.J.fasta", mode="w")

In [8]:
# Retrieve the reference V and J sequences
def _reference_sequence(fasta_path, gene_name):
    """Look up a reference sequence by gene name in a FASTA file.

    Parameters
    ----------
    fasta_path : str
        Path of the reference FASTA file to scan.
    gene_name : str
        Text that must occur somewhere in the record id (substring match).

    Returns
    -------
    str
        Upper-cased sequence of the last record whose id contains
        ``gene_name``, or "" when no record matches.
    """
    sequence = ""
    # The context manager closes the reference file once it has been scanned;
    # passing a bare open(...) to SeqIO.parse left the handle open.
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if gene_name in record.id:
                sequence = str(record.seq).upper()
    return sequence


v_seq = _reference_sequence("../reference/IGHV_human.fasta", v_top)
j_seq = _reference_sequence("../reference/IGHJ_human.fasta", j_top)

# Show the references in the notebook ...
print(">" + v_top)
print(v_seq)
print(">" + j_top)
print(j_seq)
# ... and write each one as the first record of its FASTA output file.
print(">" + v_top, file=fhOutV)
print(v_seq, file=fhOutV)
print(">" + j_top, file=fhOutJ)
print(j_seq, file=fhOutJ)

>IGHV3-7*01
GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAGCTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAGCAAGATGGAAGTGAGAAATACTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGAGA
>IGHJ4*02
ACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG


In [9]:
# Append every unique sequence of the selection to both FASTA files, using
# the 'acc' value as the record header.
# Iterating the two columns directly avoids feeding index labels into the
# positional .iloc accessor (only correct for a default 0..n-1 index) and
# avoids rebuilding a row Series for every field access.
for accessions, sequence in zip(df_selection['acc'], df_selection['seq']):
    for fasta_out in (fhOutV, fhOutJ):
        print(">" + accessions, file=fasta_out)
        print(sequence, file=fasta_out)

In [10]:
# Close both FASTA handles (V first, then J) and report the files written.
for fasta_out in (fhOutV, fhOutJ):
    fasta_out.close()
print("Wrote test.V.fasta to disk")
print("Wrote test.J.fasta to disk")

Wrote test.V.fasta to disk
Wrote test.J.fasta to disk
