# Circos data 4R
This notebook calculates the weighted adjacency matrix between V and J usages from a given TCR chain for plotting the circos plot in R. Currently, usage filtering does not discriminate between V and J usages, so both chains are filtered with the same threshold; there is no possibility of selecting only one.<br>
Author: **Juan Sebastian Diaz Boada**<br>
29/10/2021<br>
Environment: *scMyositis*<br>

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on, including any warnings pandas would emit in later cells.
warnings.filterwarnings("ignore")

# Parameters

In [3]:
# Chain: which TCR chain's V-J pairing is summarised
chain = 'A'
chain_options = ['A','B','G','D']
if chain not in chain_options:
    # An unsupported option is a bad value, not an unresolved name -> ValueError
    raise ValueError("Invalid chain. Has to be either 'A','B','G' or 'D'.")

# Tissue: 'MUSL' or 'PB' selects one tissue, 'both' disables the tissue filter
tissue_options = ['MUSL','PB','both']
tissue = 'PB'
if tissue not in tissue_options:
    raise ValueError("Invalid tissue. Has to be either 'MUSL','PB' or 'both'.")
filter_tissue = tissue != 'both'

# Patient: 1-7 selects a single patient, 0 disables the patient filter
pat = 5
# The message promises an int, so enforce it (bool is a subclass of int and is
# rejected explicitly); otherwise e.g. 2.5 would silently become 'sc2.5'.
if isinstance(pat, bool) or not isinstance(pat, int) or pat < 0 or pat > 7:
    raise ValueError("Invalid patient number. Has to be an int between 0-7.")
filter_patient = pat != 0
# The 'patient' column is compared against labels of the form 'sc<N>';
# an empty string is stored when no patient filter is requested.
pat = 'sc' + str(pat) if filter_patient else ''

# Cell threshold: when filter_cell_number is True, only V-J combinations seen
# in strictly more than thresh_cells rows are kept by the filtering cell.
filter_cell_number = False
thresh_cells = 4

# Paths
infile = '../data/output_data/TCR_metadata.tsv'
output_path_prefix = '../data/output_data/circos_adj/'
# The output name records every active filter:
# circos_adjacency_<chain>[_<patient>][_<tissue>][_thresh_<n>].csv
outfile = output_path_prefix + 'circos_adjacency_' + chain
if filter_patient:
    outfile += '_' + pat
if filter_tissue:
    outfile += '_' + tissue
if filter_cell_number:
    outfile += '_thresh_' + str(thresh_cells)
outfile += '.csv'
outfile

'../data/output_data/circos_adj/circos_adjacency_A_sc5_PB.csv'

# Load data

In [4]:
# Read the tab-separated metadata table; the first column of the file is used
# as the row index.
DF = pd.read_csv(
    infile,
    sep='\t',
    index_col=0,
)
# Lift the column-display cap only inside this block so the preview shows
# every column of the table.
with pd.option_context('display.max_columns', None):
    display(DF)

Unnamed: 0,plate,tissue,well,treatment.status,biopsy,patient,age,autoantibody,dx2020,sex,datebirth,datedx,AgeOnset,CK,seq_batch,TRA_1_seq,TRA_2_seq,TRB_1_seq,TRB_2_seq,TRA_productive_number,TRB_productive_number,TRA_number,TRB_number,TRA_1_productive,TRA_1_V,TRA_1_J,TRA_1_descr,TRA_2_productive,TRA_2_V,TRA_2_J,TRA_2_descr,TRB_1_productive,TRB_1_V,TRB_1_D,TRB_1_J,TRB_1_descr,TRB_2_productive,TRB_2_V,TRB_2_D,TRB_2_J,TRB_2_descr,TRG_1_seq,TRG_2_seq,TRD_1_seq,TRD_2_seq,TRG_productive_number,TRD_productive_number,TRG_number,TRD_number,TRG_1_productive,TRG_1_V,TRG_1_J,TRG_1_descr,TRG_2_productive,TRG_2_V,TRG_2_J,TRG_2_descr,TRD_1_productive,TRD_1_V,TRD_1_D,TRD_1_J,TRD_1_descr,TRD_2_productive,TRD_2_V,TRD_2_D,TRD_2_J,TRD_2_descr,RNAexpr,A_1_CDR3,A_1_CDR3_cluster,A_1_CDR3_freq,A_1_CDR3_clones,A_1_CDR3_clone_freq,A_1_CDR3_freq_MUSL,A_1_CDR3_freq_PB,A_1_V,A_1_V_cluster,A_1_V_freq,A_1_V_clones,A_1_V_clone_freq,A_1_J,A_1_J_cluster,A_1_J_freq,A_1_J_clones,A_1_J_clone_freq,A_1_descr_cluster,A_1_descr_freq,A_1_descr_clones,A_1_descr_clone_freq,A_2_CDR3,A_2_CDR3_cluster,A_2_CDR3_freq,A_2_CDR3_clones,A_2_CDR3_clone_freq,A_2_CDR3_freq_MUSL,A_2_CDR3_freq_PB,A_2_V,A_2_V_cluster,A_2_V_freq,A_2_V_clones,A_2_V_clone_freq,A_2_J,A_2_J_cluster,A_2_J_freq,A_2_J_clones,A_2_J_clone_freq,A_2_descr_cluster,A_2_descr_freq,A_2_descr_clones,A_2_descr_clone_freq,B_1_CDR3,B_1_CDR3_cluster,B_1_CDR3_freq,B_1_CDR3_clones,B_1_CDR3_clone_freq,B_1_CDR3_freq_MUSL,B_1_CDR3_freq_PB,B_1_V,B_1_V_cluster,B_1_V_freq,B_1_V_clones,B_1_V_clone_freq,B_1_J,B_1_J_cluster,B_1_J_freq,B_1_J_clones,B_1_J_clone_freq,B_1_descr_cluster,B_1_descr_freq,B_1_descr_clones,B_1_descr_clone_freq,B_2_CDR3,B_2_CDR3_cluster,B_2_CDR3_freq,B_2_CDR3_clones,B_2_CDR3_clone_freq,B_2_CDR3_freq_MUSL,B_2_CDR3_freq_PB,B_2_V,B_2_V_cluster,B_2_V_freq,B_2_V_clones,B_2_V_clone_freq,B_2_J,B_2_J_cluster,B_2_J_freq,B_2_J_clones,B_2_J_clone_freq,B_2_descr_cluster,B_2_descr_freq,B_2_descr_clones,B_2_descr_clone_freq,G_1_CDR3,G_1_CDR3_cluster,G_1_CDR3_freq,
G_1_CDR3_clones,G_1_CDR3_clone_freq,G_1_CDR3_freq_MUSL,G_1_CDR3_freq_PB,G_1_V,G_1_V_cluster,G_1_V_freq,G_1_V_clones,G_1_V_clone_freq,G_1_J,G_1_J_cluster,G_1_J_freq,G_1_J_clones,G_1_J_clone_freq,G_1_descr_cluster,G_1_descr_freq,G_1_descr_clones,G_1_descr_clone_freq,G_2_CDR3,G_2_CDR3_cluster,G_2_CDR3_freq,G_2_CDR3_clones,G_2_CDR3_clone_freq,G_2_CDR3_freq_MUSL,G_2_CDR3_freq_PB,G_2_V,G_2_V_cluster,G_2_V_freq,G_2_V_clones,G_2_V_clone_freq,G_2_J,G_2_J_cluster,G_2_J_freq,G_2_J_clones,G_2_J_clone_freq,G_2_descr_cluster,G_2_descr_freq,G_2_descr_clones,G_2_descr_clone_freq,D_1_CDR3,D_1_CDR3_cluster,D_1_CDR3_freq,D_1_CDR3_clones,D_1_CDR3_clone_freq,D_1_CDR3_freq_MUSL,D_1_CDR3_freq_PB,D_1_V,D_1_V_cluster,D_1_V_freq,D_1_V_clones,D_1_V_clone_freq,D_1_J,D_1_J_cluster,D_1_J_freq,D_1_J_clones,D_1_J_clone_freq,D_1_descr_cluster,D_1_descr_freq,D_1_descr_clones,D_1_descr_clone_freq,D_2_CDR3,D_2_CDR3_cluster,D_2_CDR3_freq,D_2_CDR3_clones,D_2_CDR3_clone_freq,D_2_CDR3_freq_MUSL,D_2_CDR3_freq_PB,D_2_V,D_2_V_cluster,D_2_V_freq,D_2_V_clones,D_2_V_clone_freq,D_2_J,D_2_J_cluster,D_2_J_freq,D_2_J_clones,D_2_J_clone_freq,D_2_descr_cluster,D_2_descr_freq,D_2_descr_clones,D_2_descr_clone_freq,flag_2A2B,flag_all_1,flag_2A_1BGD,flag_2B_1AGD,flag_2G_1ABD,flag_2D_1ABG,flag_many_productives,CD4,CD8
Plate400_MUSL_A10,400,MUSL,A10,treated,14,sc6,81,negative,IMNM,M,1938-01-01,,,,p18562,agtcaacagggagaagaggatcctcaggccttgagcatccaggagg...,,gaagcccaagtgacccagaacccaagatacctcatcacagtgactg...,,1,1,1,1,1,TRAV17*01,TRAJ13*01,TRAV17_GGACGAAGGTTCTG_TRAJ13,0,,,,1,TRBV27*01,TRBD1*01,TRBJ1-2*01,TRBV27_CAGTTCCCTCGGGGCAGGGGGCTATG_TRBJ1-2,0,,,,,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,1.138727e+06,GGACGAAGGTTCTG,-1.0,1.0,unique,unique,1.0,,TRAV17,2.0,139.0,clone_002,clone_002_139,TRAJ13,22.0,59.0,clone_022,clone_022_59,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,CAGTTCCCTCGGGGCAGGGGGCTATG,-1.0,1.0,unique,unique,1.0,,TRBV27,6.0,135.0,clone_006,clone_006_135,TRBJ1-2,4.0,285.0,clone_004,clone_004_285,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0
Plate400_MUSL_A11,400,MUSL,A11,treated,14,sc6,81,negative,IMNM,M,1938-01-01,,,,p18562,gcccagtcggtgacccagcttggcagccacgtctctgtctctgaag...,,gatgctggagttatccagtcaccccgccatgaggtgacagagatgg...,,0,1,1,1,0,"TRAV8-4*01,TRAV8-4*04",TRAJ20*01,TRAV8-4_AGTGATTCCACGAC_TRAJ20,0,,,,1,TRBV12-3*01,"TRBD1*01,TRBD2*01,TRBD2*02,TRBD2*02",TRBJ2-3*01,TRBV12-3_CAGCACCCGGGACCCAGAAGATA_TRBJ2-3,0,,,,,gcaggtcacctagagcaacctcaaatttccagtactaaaacgctgt...,,,,1,0,1,0,1,TRGV9*01,"TRGJ1*02,TRGJ2*01",TRGV9_TGGGACGCTTTACTCT_TRGJ1,0,,,,0,,,,,0,,,,,1.013740e+06,AGTGATTCCACGAC,24.0,11.0,clone_024,clone_024_11,7.0,,TRAV8-4,11.0,87.0,clone_011,clone_011_87,TRAJ20,1.0,116.0,clone_001,clone_001_116,22.0,11.0,clone_022,clone_022_11,,,,,,,,,,,,,,,,,,,,,,CAGCACCCGGGACCCAGAAGATA,5.0,31.0,clone_005,clone_005_31,18.0,,TRBV12-3,4.0,150.0,clone_004,clone_004_150,TRBJ2-3,0.0,420.0,clone_000,clone_000_420,5.0,31.0,clone_005,clone_005_31,,,,,,,,,,,,,,,,,,,,,,TGGGACGCTTTACTCT,4.0,25.0,clone_004,clone_004_25,14.0,,TRGV9,2.0,155.0,clone_002,clone_002_155,TRGJ1,0.0,554.0,clone_000,clone_000_554,4.0,25.0,clone_004,clone_004_25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0
Plate400_MUSL_A12,400,MUSL,A12,treated,14,sc6,81,negative,IMNM,M,1938-01-01,,,,p18562,gaagaccaggtgacgcagagtcccgaggccctgagactccaggagg...,,gatggtggaatcactcagtccccaaagtacctgttcagaaaggaag...,,1,1,1,1,1,TRAV20*01,TRAJ9*01,TRAV20_GCTGTAAGTCCTAATAC_TRAJ9,0,,,,1,TRBV19*01,,TRBJ1-5*01,TRBV19_TGTAGGGATGAATCA_TRBJ1-5,0,,,,,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,1.022387e+06,GCTGTAAGTCCTAATAC,14.0,14.0,clone_014,clone_014_14,11.0,,TRAV20,29.0,48.0,clone_029,clone_029_48,TRAJ9,20.0,60.0,clone_020,clone_020_60,14.0,14.0,clone_014,clone_014_14,,,,,,,,,,,,,,,,,,,,,,TGTAGGGATGAATCA,23.0,12.0,clone_023,clone_023_12,8.0,,TRBV19,0.0,235.0,clone_000,clone_000_235,TRBJ1-5,5.0,270.0,clone_005,clone_005_270,20.0,12.0,clone_020,clone_020_12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0
Plate400_MUSL_A13,400,MUSL,A13,treated,14,sc6,81,negative,IMNM,M,1938-01-01,,,,p18562,,,gattctggagtcacacaaaccccaaagcacctgatcacagcaactg...,,0,1,0,1,0,,,,0,,,,1,TRBV9*01,,TRBJ1-2*01,TRBV9_CAGCGCCTTAAATGGC_TRBJ1-2,0,,,,,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,9.072714e+05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CAGCGCCTTAAATGGC,-1.0,1.0,unique,unique,1.0,,TRBV9,14.0,74.0,clone_014,clone_014_74,TRBJ1-2,4.0,285.0,clone_004,clone_004_285,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0
Plate400_MUSL_A14,400,MUSL,A14,treated,14,sc6,81,negative,IMNM,M,1938-01-01,,,,p18562,aaggaccaagtgtttcagccttccacagtggcatcttcagagggag...,aaacaggaggtgacgcagattcctgcagctctgagtgtcccagaag...,gatgctggagttatccagtcaccccggcacgaggtgacagagatgg...,,2,1,2,1,1,TRAV2*01,TRAJ43*01,TRAV2_CTGTGCATGGGGACAT_TRAJ43,1,TRAV21*01,TRAJ4*01,TRAV21_TGTGATATCGAGGTCTGG_TRAJ4,1,TRBV12-4*01,TRBD1*01,TRBJ1-2*01,TRBV12-4_GCAGTCCCGGACAGGGGTTTGGCT_TRBJ1-2,0,,,,,ttatcaaaagtggagcagttccagctatccatttccacggaagtca...,tcttccaacttggaagggagaacgaagtcagtcatcaggcagactg...,,,1,0,2,0,1,TRGV10*02,TRGJP2*01,TRGV10_TGGGACGGGGGGAGTGA_TRGJP2,0,TRGV2*01,TRGJP2*01,TRGV2_GGACGAGAAAGTAG_TRGJP2,0,,,,,0,,,,,9.600744e+05,CTGTGCATGGGGACAT,5.0,25.0,clone_005,clone_005_25,15.0,,TRAV2,22.0,58.0,clone_022,clone_022_58,TRAJ43,2.0,115.0,clone_002,clone_002_115,5.0,25.0,clone_005,clone_005_25,TGTGATATCGAGGTCTGG,2.0,22.0,clone_002,clone_002_22,12.0,,TRAV21,1.0,72.0,clone_001,clone_001_72,TRAJ4,5.0,41.0,clone_005,clone_005_41,2.0,22.0,clone_002,clone_002_22,GCAGTCCCGGACAGGGGTTTGGCT,3.0,37.0,clone_003,clone_003_37,22.0,,TRBV12-4,9.0,104.0,clone_009,clone_009_104,TRBJ1-2,4.0,285.0,clone_004,clone_004_285,3.0,34.0,clone_003,clone_003_34,,,,,,,,,,,,,,,,,,,,,,TGGGACGGGGGGAGTGA,2.0,27.0,clone_002,clone_002_27,18.0,,TRGV10,0.0,334.0,clone_000,clone_000_334,TRGJP2,3.0,135.0,clone_003,clone_003_135,2.0,27.0,clone_002,clone_002_27,GGACGAGAAAGTAG,4.0,6.0,clone_004,clone_004_6,4.0,,TRGV2,1.0,47.0,clone_001,clone_001_47,TRGJP2,3.0,27.0,clone_003,clone_003_27,4.0,6.0,clone_004,clone_004_6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate72_PB_P5,72,PB,P5,naive,11,sc6,81,negative,IMNM,M,1938-01-01,,,105.0,p15168,aaacaggaggtgacgcagattcctgcagctctgagtgtcccagaag...,,gatgctggaatcacccagagcccaagatacaagatcacagagacag...,aaggctggagtcactcaaactccaagatatctgatcaaaacgagag...,1,2,1,2,1,TRAV21*01,TRAJ10*01,TRAV21_TGGGGGTCTGGGCGGGA_TRAJ10,0,,,,1,TRBV10-2*01,"TRBD2*01,TRBD2*02",TRBJ2-2*01,TRBV10-2_CAGTGTACTAGGCACCG_TRBJ2-2,1,TRBV5-1*01,,TRBJ1-1*01,TRBV5-1_CCAGCTCCCAACCCGAGCGGAACA_TRBJ1-1,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,1.124302e+06,TGGGGGTCTGGGCGGGA,-1.0,1.0,unique,unique,,1.0,TRAV21,1.0,147.0,clone_001,clone_001_147,TRAJ10,44.0,27.0,clone_044,clone_044_27,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,CAGTGTACTAGGCACCG,-1.0,1.0,unique,unique,,1.0,TRBV10-2,34.0,24.0,clone_034,clone_034_24,TRBJ2-2,7.0,211.0,clone_007,clone_007_211,-1.0,1.0,unique,unique,CCAGCTCCCAACCCGAGCGGAACA,-1.0,1.0,unique,unique,,1.0,TRBV5-1,4.0,22.0,clone_004,clone_004_22,TRBJ1-1,4.0,41.0,clone_004,clone_004_41,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,1,0
Plate72_PB_P6,72,PB,P6,naive,11,sc6,81,negative,IMNM,M,1938-01-01,,,105.0,p15168,gcccagtcggtgacccagcttggcagccacgtctctgtctctgaag...,cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaag...,aatgctggtgtcactcagaccccaaaattccaggtcctgaagacag...,,1,1,2,1,1,TRAV8-4*01,TRAJ53*01,TRAV8-4_AGTGAAACAGGTA_TRAJ53,0,TRAV23_DV6*01,TRAJ41*01,TRAV23_DV6_CAAGCCCCGACAAAT_TRAJ41,1,TRBV6-5*01,TRBD2*01,TRBJ2-2*01,TRBV6-5_TACTCCGCCGGGGGGGCCGGG_TRBJ2-2,0,,,,,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,1.009140e+06,AGTGAAACAGGTA,-1.0,1.0,unique,unique,,1.0,TRAV8-4,11.0,87.0,clone_011,clone_011_87,TRAJ53,25.0,51.0,clone_025,clone_025_51,-1.0,1.0,unique,unique,CAAGCCCCGACAAAT,-1.0,1.0,unique,unique,,1.0,TRAV23_DV6,23.0,23.0,clone_023,clone_023_23,TRAJ41,16.0,26.0,clone_016,clone_016_26,-1.0,1.0,unique,unique,TACTCCGCCGGGGGGGCCGGG,-1.0,1.0,unique,unique,,1.0,TRBV6-5,7.0,117.0,clone_007,clone_007_117,TRBJ2-2,7.0,211.0,clone_007,clone_007_211,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,1,0
Plate72_PB_P7,72,PB,P7,naive,11,sc6,81,negative,IMNM,M,1938-01-01,,,105.0,p15168,gctcagtcagtggctcagccggaagatcaggtcaacgttgctgaag...,gcccagtcggtgacccagcttgacagccacgtctctgtctctgaag...,ggtgctgtcgtctctcaacatccgagctgggttatctgtaagagtg...,gatactggagtctcccagaaccccagacacaagatcacaaagaggg...,1,1,2,2,1,TRAV3*01,TRAJ4*01,TRAV3_GTGAGTTCCTTTTC_TRAJ4,0,TRAV8-2*01,TRAJ3*01,TRAV8-2_TTGTGGGGTTCCCCTCCCAGAAGGTAC_TRAJ3,1,TRBV20-1*01,TRBD2*01,TRBJ2-5*01,TRBV20-1_TAGAGTCCTAATAGCGGGGGACCAA_TRBJ2-5,0,"TRBV7-9*01,TRBV7-9*02,TRBV7-9*03","TRBD1*01,TRBD1*01",TRBJ2-3*01,TRBV7-9_GCAGCCACACAGGTCTTTAGGGGATATTT_TRBJ2-3,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,1.150923e+06,GTGAGTTCCTTTTC,-1.0,1.0,unique,unique,,1.0,TRAV3,8.0,98.0,clone_008,clone_008_98,TRAJ4,8.0,83.0,clone_008,clone_008_83,-1.0,1.0,unique,unique,TTGTGGGGTTCCCCTCCCAGAAGGTAC,-1.0,1.0,unique,unique,,1.0,TRAV8-2,34.0,12.0,clone_034,clone_034_12,TRAJ3,31.0,18.0,clone_031,clone_031_18,-1.0,1.0,unique,unique,TAGAGTCCTAATAGCGGGGGACCAA,-1.0,1.0,unique,unique,,1.0,TRBV20-1,1.0,222.0,clone_001,clone_001_222,TRBJ2-5,6.0,241.0,clone_006,clone_006_241,-1.0,1.0,unique,unique,GCAGCCACACAGGTCTTTAGGGGATATTT,-1.0,1.0,unique,unique,,1.0,TRBV7-9,3.0,25.0,clone_003,clone_003_25,TRBJ2-3,2.0,55.0,clone_002,clone_002_55,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,1,0
Plate72_PB_P8,72,PB,P8,naive,11,sc6,81,negative,IMNM,M,1938-01-01,,,105.0,p15168,acccagctgctggagcagagccctcagtttctaagcatccaagagg...,ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagg...,gatgtgaaagtaacccagagctcgagatatctagtcaaaaggacgg...,,2,1,2,1,1,TRAV27*01,TRAJ30*01,TRAV27_TGCAGCCCACGAAGCGAACA_TRAJ30,1,TRAV13-1*01,TRAJ22*01,TRAV13-1_AAGTACAATATCTGG_TRAJ22,1,TRBV28*01,,TRBJ2-7*01,TRBV28_AGTTTGACGGGCGAGC_TRBJ2-7,0,,,,,,,,,0,0,0,0,0,,,,0,,,,0,,,,,0,,,,,9.949375e+05,TGCAGCCCACGAAGCGAACA,-1.0,1.0,unique,unique,,1.0,TRAV27,28.0,49.0,clone_028,clone_028_49,TRAJ30,27.0,50.0,clone_027,clone_027_50,-1.0,1.0,unique,unique,AAGTACAATATCTGG,-1.0,1.0,unique,unique,,1.0,TRAV13-1,3.0,64.0,clone_003,clone_003_64,TRAJ22,2.0,49.0,clone_002,clone_002_49,-1.0,1.0,unique,unique,AGTTTGACGGGCGAGC,-1.0,1.0,unique,unique,,1.0,TRBV28,3.0,167.0,clone_003,clone_003_167,TRBJ2-7,2.0,402.0,clone_002,clone_002_402,-1.0,1.0,unique,unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,1,0


# Processing

In [5]:
# Keep only the rows whose 'treatment.status' equals 'naive'; the following
# filtering cells operate on this subset.
is_naive = DF['treatment.status'] == 'naive'
df = DF.loc[is_naive]
df

Unnamed: 0,plate,tissue,well,treatment.status,biopsy,patient,age,autoantibody,dx2020,sex,...,D_2_descr_clone_freq,flag_2A2B,flag_all_1,flag_2A_1BGD,flag_2B_1AGD,flag_2G_1ABD,flag_2D_1ABG,flag_many_productives,CD4,CD8
Plate404_MUSL_A10,404,MUSL,A10,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,1,0
Plate404_MUSL_A11,404,MUSL,A11,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,1,0
Plate404_MUSL_A12,404,MUSL,A12,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,0
Plate404_MUSL_A13,404,MUSL,A13,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,1,0
Plate404_MUSL_A14,404,MUSL,A14,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate72_PB_P5,72,PB,P5,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P6,72,PB,P6,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P7,72,PB,P7,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P8,72,PB,P8,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0


#### Filter by tissue

In [6]:
# Restrict the rows to the requested tissue; nothing happens when the tissue
# filter is switched off.
if filter_tissue:
    in_tissue = df['tissue'] == tissue
    df = df.loc[in_tissue, :]
df

Unnamed: 0,plate,tissue,well,treatment.status,biopsy,patient,age,autoantibody,dx2020,sex,...,D_2_descr_clone_freq,flag_2A2B,flag_all_1,flag_2A_1BGD,flag_2B_1AGD,flag_2G_1ABD,flag_2D_1ABG,flag_many_productives,CD4,CD8
Plate406_PB_A10,406,PB,A10,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,1
Plate406_PB_A11,406,PB,A11,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,1
Plate406_PB_A12,406,PB,A12,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,1
Plate406_PB_A13,406,PB,A13,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,1
Plate406_PB_A15,406,PB,A15,naive,15,sc7,61,C1N,IBM,M,...,,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate72_PB_P5,72,PB,P5,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P6,72,PB,P6,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P7,72,PB,P7,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0
Plate72_PB_P8,72,PB,P8,naive,11,sc6,81,negative,IMNM,M,...,,0,0,0,0,0,0,0,1,0


#### Filter by patient

In [7]:
# Restrict the rows to the requested patient label; nothing happens when the
# patient filter is switched off.
if filter_patient:
    is_patient = df['patient'] == pat
    df = df.loc[is_patient]
df

Unnamed: 0,plate,tissue,well,treatment.status,biopsy,patient,age,autoantibody,dx2020,sex,...,D_2_descr_clone_freq,flag_2A2B,flag_all_1,flag_2A_1BGD,flag_2B_1AGD,flag_2G_1ABD,flag_2D_1ABG,flag_many_productives,CD4,CD8
Plate70_PB_I10,70,PB,I10,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,0,1
Plate70_PB_I11,70,PB,I11,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0
Plate70_PB_I12,70,PB,I12,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,0,0
Plate70_PB_I13,70,PB,I13,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0
Plate70_PB_I14,70,PB,I14,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate70_PB_P5,70,PB,P5,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,0,1
Plate70_PB_P6,70,PB,P6,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0
Plate70_PB_P7,70,PB,P7,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0
Plate70_PB_P8,70,PB,P8,naive,10,sc5,54,na,,M,...,,0,0,0,0,0,0,0,1,0


#### Select chains and productive samples

In [8]:
# Allele 1: rows flagged productive (== 1) in the '_1_' columns of the chain
prod_col_1 = f'TR{chain}_1_productive'
vj_cols_1 = [f'{chain}_1_V', f'{chain}_1_J']
is_productive_1 = df[prod_col_1] == 1
# With two selected columns, thresh=2 keeps a row only if both V and J are set
DF_1 = df.loc[is_productive_1, vj_cols_1].dropna(thresh=2)
DF_1

Unnamed: 0,A_1_V,A_1_J
Plate70_PB_I10,TRAV3,TRAJ6
Plate70_PB_I11,TRAV38-1,TRAJ40
Plate70_PB_I13,TRAV17,TRAJ47
Plate70_PB_I14,TRAV27,TRAJ9
Plate70_PB_I17,TRAV9-2,TRAJ9
...,...,...
Plate70_PB_P3,TRAV3,TRAJ6
Plate70_PB_P5,TRAV3,TRAJ6
Plate70_PB_P6,TRAV21,TRAJ33
Plate70_PB_P7,TRAV38-1,TRAJ57


In [9]:
# Allele 2: rows flagged productive (== 1) in the '_2_' columns of the chain
prod_col_2 = f'TR{chain}_2_productive'
vj_cols_2 = [f'{chain}_2_V', f'{chain}_2_J']
is_productive_2 = df[prod_col_2] == 1
# With two selected columns, thresh=2 keeps a row only if both V and J are set
DF_2 = df.loc[is_productive_2, vj_cols_2].dropna(thresh=2)
DF_2

Unnamed: 0,A_2_V,A_2_J
Plate70_PB_I15,TRAV12-2,TRAJ3
Plate70_PB_I18,TRAV29_DV5,TRAJ43
Plate70_PB_I23,TRAV20,TRAJ11
Plate70_PB_I4,TRAV12-1,TRAJ31
Plate70_PB_I5,TRAV8-4,TRAJ11
Plate70_PB_J12,TRAV12-1,TRAJ10
Plate70_PB_J15,TRDV1,TRAJ37
Plate70_PB_J19,TRAV12-1,TRAJ10
Plate70_PB_J22,TRAV6,TRAJ26
Plate70_PB_J24,TRAV12-1,TRAJ42


#### Concatenate vertically both alleles

In [10]:
# Shared V / J column labels so that both allele tables line up when stacked
col_names = [f'{chain}_V', f'{chain}_J']
DF_1.columns = col_names
DF_2.columns = col_names
# Stack the allele 1 rows on top of the allele 2 rows; the row labels of the
# inputs are kept as they are.
DF_VJ = pd.concat([DF_1, DF_2], axis='index')
DF_VJ

Unnamed: 0,A_V,A_J
Plate70_PB_I10,TRAV3,TRAJ6
Plate70_PB_I11,TRAV38-1,TRAJ40
Plate70_PB_I13,TRAV17,TRAJ47
Plate70_PB_I14,TRAV27,TRAJ9
Plate70_PB_I17,TRAV9-2,TRAJ9
...,...,...
Plate70_PB_P11,TRAV23_DV6,TRAJ23
Plate70_PB_P13,TRAV6,TRAJ21
Plate70_PB_P17,TRAV39,TRAJ31
Plate70_PB_P18,TRAV21,TRAJ58


#### Frequency calculation and filtering by cell threshold

In [11]:
# Key identifying each V-J combination: the V and J labels joined end to end
DF_VJ['union'] = DF_VJ.iloc[:, 0] + DF_VJ.iloc[:, 1]
# Number of rows sharing the same V-J combination, attached to every row
pair_counts = DF_VJ['union'].value_counts()
DF_VJ['freq'] = DF_VJ['union'].map(pair_counts)
# Most frequent combinations first
DF_VJ = DF_VJ.sort_values(by=['freq'], ascending=False)
if filter_cell_number:
    # Keep only combinations seen strictly more than thresh_cells times
    # (a combination seen exactly thresh_cells times is dropped)
    frequent_enough = DF_VJ['freq'] > thresh_cells
    DF_VJ = DF_VJ.loc[frequent_enough]
DF_VJ

Unnamed: 0,A_V,A_J,union,freq
Plate70_PB_I10,TRAV3,TRAJ6,TRAV3TRAJ6,25
Plate70_PB_J11,TRAV3,TRAJ6,TRAV3TRAJ6,25
Plate70_PB_J14,TRAV3,TRAJ6,TRAV3TRAJ6,25
Plate70_PB_P10,TRAV3,TRAJ6,TRAV3TRAJ6,25
Plate70_PB_M6,TRAV3,TRAJ6,TRAV3TRAJ6,25
...,...,...,...,...
Plate70_PB_M11,TRAV35,TRAJ22,TRAV35TRAJ22,1
Plate70_PB_M13,TRAV8-1,TRAJ45,TRAV8-1TRAJ45,1
Plate70_PB_M18,TRAV24,TRAJ40,TRAV24TRAJ40,1
Plate70_PB_M1,TRAV38-1,TRAJ30,TRAV38-1TRAJ30,1


In [12]:
# Recompute the counts on the current contents of DF_VJ: per-V usage, per-J
# usage, and the V-J combination count (which replaces the existing 'freq').
v_usage = DF_VJ.iloc[:, 0]
j_usage = DF_VJ.iloc[:, 1]
DF_VJ.loc[:, 'freq_v'] = v_usage.map(v_usage.value_counts())
DF_VJ.loc[:, 'freq_j'] = j_usage.map(j_usage.value_counts())
vj_key = DF_VJ.loc[:, 'union']
DF_VJ.loc[:, 'freq'] = vj_key.map(vj_key.value_counts())
# Display only: the sorted view is not assigned back to DF_VJ
DF_VJ.sort_values(by=['freq', 'union'], ascending=False)

Unnamed: 0,A_V,A_J,union,freq,freq_v,freq_j
Plate70_PB_I10,TRAV3,TRAJ6,TRAV3TRAJ6,25,27,27
Plate70_PB_J11,TRAV3,TRAJ6,TRAV3TRAJ6,25,27,27
Plate70_PB_J14,TRAV3,TRAJ6,TRAV3TRAJ6,25,27,27
Plate70_PB_P10,TRAV3,TRAJ6,TRAV3TRAJ6,25,27,27
Plate70_PB_M6,TRAV3,TRAJ6,TRAV3TRAJ6,25,27,27
...,...,...,...,...,...,...
Plate70_PB_K11,TRAV12-1,TRAJ13,TRAV12-1TRAJ13,1,9,6
Plate70_PB_K3,TRAV10,TRAJ6,TRAV10TRAJ6,1,1,27
Plate70_PB_L2,TRAV1-2,TRAJ9,TRAV1-2TRAJ9,1,9,8
Plate70_PB_I3,TRAV1-2,TRAJ15,TRAV1-2TRAJ15,1,9,2


#### Adjacency matrix

In [13]:
# Distinct V and J labels, ordered from the most to the least used one
V_als = DF_VJ.sort_values('freq_v',ascending=False).iloc[:,0].unique()
J_als = DF_VJ.sort_values('freq_j',ascending=False).iloc[:,1].unique()
print("The dataset has {} V alleles and {} J alleles".format(len(V_als),len(J_als)))
# Label -> position along the matrix rows (V) and columns (J)
v2idx = {v:i for i,v in enumerate(V_als)}
j2idx = {j:i for i,j in enumerate(J_als)}
# Translate every row's V / J label into matrix coordinates in one pass,
# instead of looking rows up one by one with .iloc inside a Python loop.
row_pos = DF_VJ.iloc[:,0].map(v2idx).to_numpy(dtype=int)
col_pos = DF_VJ.iloc[:,1].map(j2idx).to_numpy(dtype=int)
# Count matrix: entry [r, c] is the number of rows pairing V label r with J label c
adj = np.zeros([len(V_als),len(J_als)],dtype=int)
# np.add.at accumulates repeated (row, col) pairs; plain fancy-index "+= 1"
# would count each distinct pair only once.
np.add.at(adj,(row_pos,col_pos),1)
df_adj = pd.DataFrame(adj,columns=j2idx.keys(),index=v2idx.keys(),dtype=int)
df_adj

The dataset has 40 V alleles and 48 J alleles


Unnamed: 0,TRAJ6,TRAJ52,TRAJ40,TRAJ9,TRAJ22,TRAJ42,TRAJ33,TRAJ37,TRAJ49,TRAJ45,...,TRAJ28,TRAJ34,TRAJ32,TRAJ50,TRAJ4,TRAJ8,TRAJ38,TRAJ56,TRAJ12,TRAJ41
TRAV3,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
TRAV29_DV5,0,1,0,0,5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAV1-2,0,0,0,1,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAV12-1,0,0,0,0,0,2,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
TRAV23_DV6,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAV6,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
TRAV14_DV4,0,0,1,1,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
TRAV35,0,0,1,0,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAV38-1,0,0,2,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
TRAV9-2,0,0,0,2,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


# Export data

In [14]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# destination directory exists before writing.
out_dir = os.path.dirname(outfile)
if out_dir:  # empty dirname: the file goes to the working directory
    os.makedirs(out_dir, exist_ok=True)
# Write the V x J count matrix as comma-separated text
df_adj.to_csv(outfile,sep=',')