In [1]:
import sys
import pandas as pd

In [2]:
# Load the gene table from rhesus_ward_lab.csv and preview its first rows
df = pd.read_csv(filepath_or_buffer="rhesus_ward_lab.csv")
df.head(n=5)

Unnamed: 0,Gene Name,Other Names,Type,Conf,Animal,Animal Other,Gene Sequence,Protein Sequence,Gene Accession,Protein Accession,Citations
0,IGHV1-ABW*01,IGHV1-AGS*01,VH,1,MF-M0,,GAGGTCCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,EVQLVQSGAEVKKPGASVKISCKASGYTFTDHYLNWVRQAPGKGLE...,MF989481.1,ATV91137.1,29163486
1,IGHV1-AEP*01,IGHV1-C*01; LJI.Rh_IGHV1.124,VH,1,MF-M0,,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGATTAAGCAGCCTGGGG...,QVQLVQSGAEIKQPGASVKLSCKASGYTFTSYYMHWVRQAPGQGLE...,MF989475.1,ATV91118.1,29163486; 25858157; 31080066
2,IGHV1-Kc,IGHV1c; VH1.59,VH,1,17573,,GAGGTCCAGCTGGTGCAGTCTGGGGCTGAAGTGAAGAAGCCTGGGG...,EVQLVQSGAEVKKPGASVKVSCKVSGYTFTELSMHWVRQAPGKGLE...,NC-027905.1,,27525066; 25319552
3,IGHV1-Kl,IGHV1l; VH1.23,VH,1,17573,,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGT...,QVQLVQSGAEVKKPGSSVKVSCKASGYTFTDYYMHWVRQAPRQGLE...,NC-027905.1,,27525066; 25319553
4,IGHV1-Korf4,IGHV1orf-4; VH1.16; IGHV1-1,VH,1,17573,,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGATTAAGCAGCCTGGGG...,QVQLVQSGAEIKQPGASVKLSCKASGYTFTSYYMHWVRQAPGQGLE...,NC-027905.1,,27525066; 25319554


In [3]:
# Retrieve IGHV and IGHJ entries (plus the other loci) by substring match on
# the gene name. One filtered frame per tag, in the same order as the names on
# the left-hand side:
#   df_c  (IGHC) is empty
#   df_d  (IGHD) is not used
df_v, df_j, df_c, df_d, df_igk, df_igl = (
    df[df['Gene Name'].str.contains(locus_tag)]
    for locus_tag in ("IGHV", "IGHJ", "IGHC", "IGHD", "IGK", "IGL")
)

In [4]:
# Check length of the tables: count the non-null gene names in the full table
# and in every per-locus subset, then print them with the combined totals.
total = df['Gene Name'].count()
total_v, total_j, total_c, total_d, total_igk, total_igl = (
    subset['Gene Name'].count()
    for subset in (df_v, df_j, df_c, df_d, df_igk, df_igl)
)
print(
    f"Total {total}",
    f"V {total_v}",
    f"J {total_j}",
    f"C {total_c}",
    f"D {total_d}",
    f"IGK {total_igk}",
    f"IGL {total_igl}",
    f"VDJC {total_v + total_d + total_j + total_c}",
    f"VDJC + IGK + IGL {total_v + total_d + total_j + total_c + total_igk + total_igl}",
    sep="\n",
)

Total 632
V 198
J 10
C 0
D 70
IGK 193
IGL 161
VDJC 278
VDJC + IGK + IGL 632


In [5]:
def fasta_descr(df):
    '''
    Description: create fasta headers similar to IMGT files

    Missing values in every column are replaced by the string "unknown"
    before the headers are built. The input frame is left untouched; a new
    frame is returned with three extra columns:

    fasta_descr: "<Gene Accession>|<Gene Name>|Macaca mulatta"
    fasta_nuc:   ">" + fasta_descr, a newline, then "Gene Sequence"
    fasta_prot:  ">" + fasta_descr, a newline, then "Protein Sequence"
    '''
    filled = df.fillna("unknown")
    header = filled['Gene Accession'] + "|" + filled['Gene Name'] + "|" + "Macaca mulatta"
    record_start = ">" + header + "\n"
    return filled.assign(
        fasta_descr=header,
        fasta_nuc=record_start + filled["Gene Sequence"],
        fasta_prot=record_start + filled["Protein Sequence"],
    )

In [6]:
# Add the fasta_descr / fasta_nuc / fasta_prot columns to the V and J tables
df_v, df_j = (fasta_descr(table) for table in (df_v, df_j))

In [7]:
def writeFasta(myfasta, df, mycol):
    '''
    Write one column of ready-made FASTA records to a text file.

    myfasta: file name (created, or overwritten if it already exists)
    df: data frame
    mycol: column to write (fasta_nuc or fasta_prot); every value is expected
           to be a complete record: the ">" header line, a line break, and
           the sequence

    Each record is written verbatim followed by a single line break, so the
    file ends with exactly one newline. Missing values in the column are
    skipped instead of producing empty lines.
    '''
    # Write the strings directly rather than round-tripping through CSV:
    # CSV serialisation quotes every multi-line field, and stripping those
    # quotes afterwards would also delete any quote character in the data.
    records = df[mycol].dropna().astype(str)
    # The context manager closes the file even if a write fails part-way.
    with open(myfasta, "w") as fhOut:
        for record in records:
            fhOut.write(record + "\n")

In [8]:
# Write sequences to a file in fasta format:
# nucleotide records first, then the protein records, for the V and J tables.
writeFasta(myfasta="IGHV_rhesus.fasta", df=df_v, mycol="fasta_nuc")
writeFasta(myfasta="IGHJ_rhesus.fasta", df=df_j, mycol="fasta_nuc")
writeFasta(myfasta="IGHV_rhesus.prot.fasta", df=df_v, mycol="fasta_prot")
writeFasta(myfasta="IGHJ_rhesus.prot.fasta", df=df_j, mycol="fasta_prot")

In [9]:
# Create a "ref.table": V.gene,func,seq
def clean_name(x):
    """Return the part of the gene name before the first '*'."""
    return x.split("*")[0]

# V.gene is the gene name with everything from the first '*' onwards removed;
# func is set to the constant "F" for every row.
df_v["V.gene"] = df_v["Gene Name"].map(clean_name)
df_v["func"] = "F"
df_v.head(n=5)

Unnamed: 0,Gene Name,Other Names,Type,Conf,Animal,Animal Other,Gene Sequence,Protein Sequence,Gene Accession,Protein Accession,Citations,fasta_descr,fasta_nuc,fasta_prot,V.gene,func
0,IGHV1-ABW*01,IGHV1-AGS*01,VH,1,MF-M0,unknown,GAGGTCCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGG...,EVQLVQSGAEVKKPGASVKISCKASGYTFTDHYLNWVRQAPGKGLE...,MF989481.1,ATV91137.1,29163486,MF989481.1|IGHV1-ABW*01|Macaca mulatta,>MF989481.1|IGHV1-ABW*01|Macaca mulatta\nGAGGT...,>MF989481.1|IGHV1-ABW*01|Macaca mulatta\nEVQLV...,IGHV1-ABW,F
1,IGHV1-AEP*01,IGHV1-C*01; LJI.Rh_IGHV1.124,VH,1,MF-M0,unknown,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGATTAAGCAGCCTGGGG...,QVQLVQSGAEIKQPGASVKLSCKASGYTFTSYYMHWVRQAPGQGLE...,MF989475.1,ATV91118.1,29163486; 25858157; 31080066,MF989475.1|IGHV1-AEP*01|Macaca mulatta,>MF989475.1|IGHV1-AEP*01|Macaca mulatta\nCAGGT...,>MF989475.1|IGHV1-AEP*01|Macaca mulatta\nQVQLV...,IGHV1-AEP,F
2,IGHV1-Kc,IGHV1c; VH1.59,VH,1,17573,unknown,GAGGTCCAGCTGGTGCAGTCTGGGGCTGAAGTGAAGAAGCCTGGGG...,EVQLVQSGAEVKKPGASVKVSCKVSGYTFTELSMHWVRQAPGKGLE...,NC-027905.1,unknown,27525066; 25319552,NC-027905.1|IGHV1-Kc|Macaca mulatta,>NC-027905.1|IGHV1-Kc|Macaca mulatta\nGAGGTCCA...,>NC-027905.1|IGHV1-Kc|Macaca mulatta\nEVQLVQSG...,IGHV1-Kc,F
3,IGHV1-Kl,IGHV1l; VH1.23,VH,1,17573,unknown,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGT...,QVQLVQSGAEVKKPGSSVKVSCKASGYTFTDYYMHWVRQAPRQGLE...,NC-027905.1,unknown,27525066; 25319553,NC-027905.1|IGHV1-Kl|Macaca mulatta,>NC-027905.1|IGHV1-Kl|Macaca mulatta\nCAGGTGCA...,>NC-027905.1|IGHV1-Kl|Macaca mulatta\nQVQLVQSG...,IGHV1-Kl,F
4,IGHV1-Korf4,IGHV1orf-4; VH1.16; IGHV1-1,VH,1,17573,unknown,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGATTAAGCAGCCTGGGG...,QVQLVQSGAEIKQPGASVKLSCKASGYTFTSYYMHWVRQAPGQGLE...,NC-027905.1,unknown,27525066; 25319554,NC-027905.1|IGHV1-Korf4|Macaca mulatta,>NC-027905.1|IGHV1-Korf4|Macaca mulatta\nCAGGT...,>NC-027905.1|IGHV1-Korf4|Macaca mulatta\nQVQLV...,IGHV1-Korf4,F


In [10]:
# Export the reference table: the protein sequence column is renamed to "seq"
# and only the three ref.table columns are written, without the index.
cols = ["V.gene", "func", "seq"]
df_v = df_v.rename(columns={"Protein Sequence": "seq"})
df_v.loc[:, cols].to_csv("ref.table.rhesus.heavy.csv", index=False)