In [5]:

import os
import numpy as np
import pandas as pd
import scipy
from itertools import combinations
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from importlib import reload
import networkx as nx
import Levenshtein 

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import cm

# local imports
import file_loader_funcs as _load
import distance_funcs as _dist
import plotting_funcs as _plot

%matplotlib inline

In [2]:
""" Define a sample """

# Re-import the local loader module so that edits made to it on disk are
# picked up without restarting the kernel.
reload(_load)

# Location of the dataset. The default is the original drive path; set the
# BCELL_DATA_ROOT environment variable to point the notebook at a copy of the
# data on another machine without editing this cell.
ROOT_DIR = os.environ.get(
    "BCELL_DATA_ROOT",
    "/Volumes/Cooper_TB_Drive/research/rajapakse/b_cell_1/public-bcell-dataset/",
)
DATA_NAME = 'D1-M'
# os.path.join yields the same path as plain concatenation for the default
# root, and also works when an overridden root has no trailing separator.
DIRPATH = os.path.join(ROOT_DIR, DATA_NAME)

# Value passed to the loader's `n_sequences` argument.
N_SEQUENCES = 1000

df = _load.get_samples(DIRPATH, n_sequences=N_SEQUENCES)
print(df.shape)
df.head()

(1000, 75)


Unnamed: 0,nucleotide,aminoAcid,copy,copyNormalized,count,frequency,frequencyNormalized,frequencyCount,cdr3Length,vFamilyName,...,jOrphon,vFunction,dFunction,jFunction,vAlignSubstitutionIndexes,dAlignSubstitutionIndexes,jAlignSubstitutionIndexes,vAlignSubstitutionGeneThreePrimeIndexes,dAlignSubstitutionGeneThreePrimeIndexes,jAlignSubstitutionGeneThreePrimeIndexes
0,AACCCAGGTGGTCCTTACCATGACCAACATGGACCCTGCGGACACA...,CARLNMVRGVISNYYYGLDVW,5,,,0.001214,,,63,IGHV02,...,,,,,39,,110.0,33,,21.0
1,AATAGACCAGTCCAAAAACCAACTCTCCCTGAAACTATACTCTCTG...,CARHLTTPGTRGFDLW,1,,,0.000243,,,48,IGHV04,...,,,,,8162334373839446772,,117.0,13184146474851626977,,14.0
2,CATCTCCAAGGACACCTCCAGAAATCAGGTGGTCCTTACAATGACC...,CARISRKASDLDYW,1,,,0.000243,,,42,IGHV02,...,,,,,212569,,110.0,246872,,21.0
3,TGACACGTCCAAGAGCCAGGTCTCCCTGAAATTGACCTCTGTGACC...,CARDHGSGTGGRPFESW,1,,,0.000243,,,51,IGHV04,...,,,,,1520313236566264,,126.0,1820264650516267,,5.0
4,GGACACCTCCAAAAACCAGGTGGTCCTCTCAATGACCGACATGGAC...,CTRRRSVGLEELPFDSW,1,,,0.000243,,,51,IGHV02,...,,,,,2829385256586567697478,,126.0,610151719262832465556,,5.0


In [3]:
""" filter cdr3 region. NOTE: this is done by slicing the sequence
from the start of the v region `n` nucleotides based on the reported cdr3
length column """

def _apply_crd3_slice(row):
    """An apply function to extract the cdr3 region from the 
    sequence""" 
    seq = row['nucleotide']
    start = row['vIndex']
    end = start + row['cdr3Length']
    cdr3 = seq[start:end]
    return cdr3

# Add a 'cdr3_sequence' column to the sampled dataframe by running the
# slicing helper over every row (axis=1 passes one row at a time).
df['cdr3_sequence'] = df.apply(_apply_crd3_slice, axis=1)

In [4]:
"""Compute distance matrix using real minimal edit distance
from the python package Levenshtein. Build a graph object """

# Pairwise distance matrix over the cdr3 sequences.
# NOTE(review): presumably a square, symmetric numpy array with one row/column
# per sequence -- confirm against distance_funcs.matrix_levenshtien.
A = _dist.matrix_levenshtien(df['cdr3_sequence'].tolist())

# `nx.from_numpy_matrix` was removed in networkx 3.0; `from_numpy_array` is
# the replacement and has been available since networkx 2.0. Entries of `A`
# become the 'weight' attribute of the corresponding edges.
g = nx.from_numpy_array(A)

# NOTE(review): spring_layout treats a larger 'weight' as a stronger
# attraction. If `A` holds distances, more dissimilar sequences are pulled
# closer together -- confirm this is intended. The layout is also unseeded,
# so positions differ between runs.
g.pos = nx.spring_layout(g, weight='weight')

In [6]:
# """ Try graph viz: WARNING: slow execution time """

# matplotlib.rcParams['figure.dpi'] = 300
# matplotlib.rcParams['figure.figsize'] = 18, 18
# plt.style.use('seaborn-deep')
# _plot.plot_network_centrality(g)

# plt.suptitle(f"{DATA_NAME} Sample Network", fontsize=30)
# outpath = f"figures/{DATA_NAME}_by_information_centrality_sample.png"
# plt.savefig(outpath, bbox_inches='tight')


In [7]:
"""
Repreated samples with replacement to see the distributions of node centrality
"""

# N_SAMPLES = 30
# SAMPLE_SIZE = 100

# matplotlib.rcParams['figure.dpi'] = 300
# matplotlib.rcParams['figure.figsize'] = 7, 7
# plt.style.use('seaborn-deep')

# for i in range(N_SAMPLES):
#     sample_inds = np.random.choice(list(range(A.shape[0])), SAMPLE_SIZE, replace=False)
#     sample_A = A[sample_inds][:,sample_inds]
#     sample_g = nx.from_numpy_matrix(sample_A)
#     centralities = nx.eigenvector_centrality(sample_g, weight='weight')
#     centralities = list(centralities.values())
#     sns.kdeplot(centralities, color=(0.1, 0.3, 0.5, 0.2))

# plt.suptitle(f"{DATA_NAME} Eigenvector Centrality Distributions")
# plt.xlabel("Eigenvector centrality")
# outpath = f"figures/{DATA_NAME}_centrality_distributions.png"
# plt.savefig(outpath, bbox_inches='tight')

'\nRepreated samples with replacement to see the distributions of node centrality\n'