In [32]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import networkx as nx
import sys
sys.path.append('/opt/miniconda3/lib/python3.8/site-packages')
from Levenshtein import distance
import math

# Notebook display settings: show every column, and up to 400 rows, when a
# DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = 400

In [33]:
def get_gephi_diameter(df):
    """Return the diameter of a circle whose area is ``df["gephi_area"]``.

    ``df`` only needs to support lookup by the key ``"gephi_area"``; in this
    notebook it is a single row handed over by ``DataFrame.apply(..., axis=1)``.
    """
    radius = math.sqrt(df["gephi_area"] / math.pi)
    return radius * 2

In [39]:
# Load the input table from the working directory
# (presumably BCR annotations joined with Seurat metadata, going by the file name).
bcrs = pd.read_csv(filepath_or_buffer="bcr_and_seurat_2.csv")

In [40]:
# Add three convenience columns:
#   cell_id         - copy of the CSV's unnamed first column
#   sample          - copy of "orig.ident"
#   duplicate_count - every row starts with a count of 1
bcrs = bcrs.assign(
    cell_id=bcrs["Unnamed: 0"],
    sample=bcrs["orig.ident"],
    duplicate_count=1,
)

In [36]:
# Keep only the rows whose `heavy_c_call` value is not missing.
bcrs = bcrs.dropna(subset=["heavy_c_call"])

In [37]:
# One independent copy of the table per sample label, so later edits to a
# sample frame never touch `bcrs`.
sample1 = bcrs.loc[bcrs["sample"] == "Sample1"].copy()
sample2 = bcrs.loc[bcrs["sample"] == "Sample2"].copy()
sample3 = bcrs.loc[bcrs["sample"] == "Sample3"].copy()

In [7]:
# Every row counts exactly once before any collapsing is applied.
sample1 = sample1.assign(duplicate_count=1)
sample2 = sample2.assign(duplicate_count=1)
sample3 = sample3.assign(duplicate_count=1)

# Collapse down to unique CDR3s

In [9]:
# Collapse Sample1 to one row per unique heavy-chain junction (CDR3) amino-acid
# sequence, size each node by the number of rows it absorbed, and build a
# similarity-weighted edge list between every pair of unique sequences.
unique_cdrs = sample1.copy()

# Sum the per-row counts inside each (sample, junction) group, then keep the
# first row of every group as its representative.
unique_cdrs["duplicate_count"] = (
    unique_cdrs.groupby(["sample", "heavy_junction_aa"])["duplicate_count"].transform("sum")
)
unique_cdrs = unique_cdrs.drop_duplicates(subset=["sample", "heavy_junction_aa"])

# Number of unique junctions per sample, attached to every row of that sample.
all_grpd = unique_cdrs.groupby(["sample"]).size().to_frame("clonotype_count").reset_index()
# NOTE: merging on a column yields a fresh 0..n-1 index; that index is what
# is used as the node id from here on.
unique_cdrs = pd.merge(unique_cdrs, all_grpd[["clonotype_count", "sample"]], on="sample")

# Area grows linearly with duplicate_count, scaled so the smallest count maps
# to an area of about 0.1; the diameter is derived from that area.
unique_cdrs["gephi_area"] = unique_cdrs["duplicate_count"] * (0.1 / unique_cdrs["duplicate_count"].min())
unique_cdrs["gephi_diameter"] = unique_cdrs.apply(get_gephi_diameter, axis=1)

cdr3_list = unique_cdrs.heavy_junction_aa.tolist()

# Full pairwise Levenshtein distance matrix (n x n Python-level calls).
dm = [[distance(a, b) for b in cdr3_list] for a in cdr3_list]
edges = pd.DataFrame(dm, index=unique_cdrs.index, columns=unique_cdrs.index).stack().reset_index()
edges.columns = ["source", "target", "weight"]

# Drop self-loops. The explicit copy makes `edges` an independent frame, so the
# weight assignment below never writes into a filtered slice (which can raise
# SettingWithCopyWarning, depending on pandas version and what still
# references the unfiltered frame).
edges = edges[edges["source"] != edges["target"]].copy()

# Turn distance into similarity: the smallest distance receives the largest
# weight (max - d + min), then shift by 1 so every weight is strictly positive.
edges["weight"] = edges["weight"].max() - edges["weight"] + edges["weight"].min() + 1
# edges = edges.sort_values(by = "weight").drop_duplicates(subset='source', keep="last") #only draw edge to most closely related neighbour for each node

In [10]:
# Lookup table exposing each node id (the row index of `unique_cdrs`) and its
# heavy_clone_id under both a "source" and a "target" spelling, so it can be
# joined onto either end of an edge.
clone_info = (
    unique_cdrs
    .reset_index()
    .rename(columns={"index": "source"})
    .assign(
        target=lambda frame: frame["source"],
        heavy_clone_id_source=lambda frame: frame["heavy_clone_id"],
        heavy_clone_id_target=lambda frame: frame["heavy_clone_id"],
    )
)


# clone_info[["heavy_clone_id", "source", "target"]]


In [11]:
# Attach the clone id of both endpoints to every edge ...
edges_test = edges.merge(clone_info[["source", "heavy_clone_id_source"]], on="source")
edges_test = edges_test.merge(clone_info[["target", "heavy_clone_id_target"]], on="target")
# ... and keep only the edges whose two endpoints carry the same clone id.
same_clone = edges_test["heavy_clone_id_source"] == edges_test["heavy_clone_id_target"]
edges = edges_test.loc[same_clone]


In [12]:
# Display the current `edges` frame for inspection (rendered as the cell output).
edges

Unnamed: 0,source,target,weight,heavy_clone_id_source,heavy_clone_id_target
259,260,1,31,972_2,972_2
1481,1482,1,31,972_2,972_2
3659,218,3,31,427_4,427_4
3775,334,3,30,427_4,427_4
6811,1649,4,28,898_5,898_5
...,...,...,...,...,...
2960522,402,1721,30,1074_413,1074_413
2960552,432,1721,30,1074_413,1074_413
2960927,807,1721,29,1074_413,1074_413
2963151,1311,0,31,91_1,91_1


In [13]:
# Build an undirected graph from the edge list (edge attribute: "weight") and
# add every row index as a node so rows without any edge are still present.
G = nx.from_pandas_edgelist(edges, 'source', 'target', 'weight')
G.add_nodes_from(unique_cdrs.index.tolist())

# Node attributes, keyed by row index.
nx.set_node_attributes(G, unique_cdrs["gephi_diameter"].to_dict(), 'weight')
nx.set_node_attributes(G, unique_cdrs["heavy_junction_aa"].to_dict(), 'label')
nx.set_node_attributes(G, unique_cdrs["heavy_v_call"].to_dict(), 'v_call')
nx.set_node_attributes(G, unique_cdrs["heavy_clone_id"].to_dict(), 'heavy_clone_id')

# Not exported: a 'sample' node attribute. If it is ever enabled, read the
# column as unique_cdrs["sample"] -- attribute access (`unique_cdrs.sample`)
# resolves to the DataFrame.sample method, not the column.

# Write the graph as a GEXF file in the working directory.
nx.write_gexf(G, "synovial_all_connected.gexf")

# Don't collapse down to unique CDR3s - keep all as individual points

In [14]:
# Same pipeline as the collapsed version, but every row of Sample1 stays its
# own node: pairwise junction distances -> similarity weights -> keep only
# same-clone edges -> GEXF export.
sample_1_graph = sample1.copy()

# Area grows linearly with duplicate_count, scaled so the smallest count maps
# to an area of about 0.1; the diameter is derived from that area.
sample_1_graph["gephi_area"] = sample_1_graph["duplicate_count"] * (0.1 / sample_1_graph["duplicate_count"].min())
sample_1_graph["gephi_diameter"] = sample_1_graph.apply(get_gephi_diameter, axis=1)

cdr3_list = sample_1_graph.heavy_junction_aa.tolist()

# Full pairwise Levenshtein distance matrix (n x n Python-level calls).
dm = [[distance(a, b) for b in cdr3_list] for a in cdr3_list]
edges = pd.DataFrame(dm, index=sample_1_graph.index, columns=sample_1_graph.index).stack().reset_index()
edges.columns = ["source", "target", "weight"]

# Drop self-loops. The explicit copy makes `edges` an independent frame, so the
# weight assignment below never writes into a filtered slice (which can raise
# SettingWithCopyWarning, depending on pandas version and what still
# references the unfiltered frame).
edges = edges[edges["source"] != edges["target"]].copy()

# Turn distance into similarity: the smallest distance receives the largest
# weight (max - d + min), then shift by 1.
edges["weight"] = edges["weight"].max() - edges["weight"] + edges["weight"].min() + 1

# Lookup table exposing each node id (row index) and its heavy_clone_id under
# both a "source" and a "target" spelling, for joining onto either edge end.
clone_info = sample_1_graph.reset_index().rename(columns={"index": "source"})
clone_info["target"] = clone_info["source"]
clone_info["heavy_clone_id_source"] = clone_info["heavy_clone_id"]
clone_info["heavy_clone_id_target"] = clone_info["heavy_clone_id"]

# Keep only the edges whose two endpoints carry the same clone id.
edges_test = pd.merge(edges, clone_info[["source", "heavy_clone_id_source"]], on="source")
edges_test = pd.merge(edges_test, clone_info[["target", "heavy_clone_id_target"]], on="target")
edges = edges_test.query("heavy_clone_id_source == heavy_clone_id_target")

# Undirected graph from the edge list; add every row index as a node so rows
# without any edge are still present.
G = nx.from_pandas_edgelist(edges, "source", "target", "weight")
G.add_nodes_from(sample_1_graph.index.tolist())
nx.set_node_attributes(G, sample_1_graph["gephi_diameter"].to_dict(), "weight")
nx.set_node_attributes(G, sample_1_graph["heavy_junction_aa"].to_dict(), "label")
nx.set_node_attributes(G, sample_1_graph["heavy_v_call"].to_dict(), "v_call")
nx.set_node_attributes(G, sample_1_graph["heavy_clone_id"].to_dict(), "heavy_clone_id")

# Write the graph as a GEXF file in the working directory.
nx.write_gexf(G, "synovial_all_connected_don't_collapse.gexf")

# Only keep the 3 closest edges per node

In [8]:
# Start again from an untouched copy of Sample1 (one node per row).
sample_1_graph = sample1.copy()

# Node size: area proportional to duplicate_count (smallest count -> about 0.1),
# diameter derived from that area.
area_per_count = 0.1 / sample_1_graph["duplicate_count"].min()
sample_1_graph["gephi_area"] = sample_1_graph["duplicate_count"] * area_per_count
sample_1_graph["gephi_diameter"] = sample_1_graph.apply(get_gephi_diameter, axis=1)

cdr3_list = sample_1_graph["heavy_junction_aa"].tolist()

# Pairwise Levenshtein distances between all junction sequences, reshaped to a
# long (source, target, weight) table without self-loops.
dm = [[distance(a, b) for b in cdr3_list] for a in cdr3_list]
edges = pd.DataFrame(dm, index=sample_1_graph.index, columns=sample_1_graph.index)
edges = edges.stack().reset_index()
edges.columns = ['source', 'target', 'weight']
edges = edges[edges["source"] != edges["target"]]
# Optional stricter filter, currently disabled:
# edges = edges.query("weight == 1")

In [60]:
# NOTE: this cell rewrites `edges` from its own previous value, so it must be
# run exactly once after the cell that builds the raw distance edges;
# re-running it on its own would invert and trim the weights a second time.

# Turn distance into similarity: the smallest distance receives the largest
# weight (max - d + min), then shift by 1. `assign` returns a new frame instead
# of writing into the boolean-filtered slice produced by the previous cell.
edges = edges.assign(
    weight=edges["weight"].max() - edges["weight"] + edges["weight"].min() + 1
)

# Ascending sort + tail(3) keeps the three highest-weight (closest) targets of
# each source. Distances are small integers, so ties are common; a stable sort
# makes the tie-break reproducible (original row order) instead of depending on
# the default unstable quicksort.
edges = edges.sort_values(by="weight", kind="mergesort")
edges = edges.groupby("source").tail(3)

In [10]:
# Lookup table exposing each node id (row index of `sample_1_graph`) and its
# heavy_clone_id under both a "source" and a "target" spelling.
clone_info = (
    sample_1_graph
    .reset_index()
    .rename(columns={"index": "source"})
    .assign(
        target=lambda frame: frame["source"],
        heavy_clone_id_source=lambda frame: frame["heavy_clone_id"],
        heavy_clone_id_target=lambda frame: frame["heavy_clone_id"],
    )
)

# Attach the clone id of both endpoints, then keep only same-clone edges.
edges_test = edges.merge(clone_info[["source", "heavy_clone_id_source"]], on="source")
edges_test = edges_test.merge(clone_info[["target", "heavy_clone_id_target"]], on="target")
same_clone = edges_test["heavy_clone_id_source"] == edges_test["heavy_clone_id_target"]
edges = edges_test.loc[same_clone]

In [62]:
# Build an undirected graph from the edge list (edge attribute: "weight") and
# add every row index as a node so rows without any edge are still present.
G = nx.from_pandas_edgelist(edges, 'source', 'target', 'weight')
G.add_nodes_from(sample_1_graph.index.tolist())

# Node attributes, keyed by row index.
nx.set_node_attributes(G, sample_1_graph["gephi_diameter"].to_dict(), 'weight')
nx.set_node_attributes(G, sample_1_graph["heavy_junction_aa"].to_dict(), 'label')
nx.set_node_attributes(G, sample_1_graph["heavy_c_call"].to_dict(), 'c_call')
nx.set_node_attributes(G, sample_1_graph["heavy_clone_id"].to_dict(), 'heavy_clone_id')

# Not exported: a 'sample' node attribute. If it is ever enabled, read the
# column as sample_1_graph["sample"] -- attribute access resolves to the
# DataFrame.sample method, not the column.

# Write the graph as a GEXF file in the working directory.
nx.write_gexf(G, "synovial_all_connected_don't_collapse_closest_3_1.gexf")

In [11]:
# Display the current `edges` frame for inspection (rendered as the cell output).
edges

Unnamed: 0,source,target,weight,heavy_clone_id_source,heavy_clone_id_target
299,302,1,1,972_2,972_2
675,686,1,0,972_2,972_2
1057,1073,1,0,972_2,972_2
2145,2177,1,1,972_2,972_2
2246,2280,1,0,972_2,972_2
...,...,...,...,...,...
7959931,1919,0,0,91_1,91_1
7960042,2032,0,0,91_1,91_1
7960047,2037,0,1,91_1,91_1
7960077,2068,0,0,91_1,91_1


## Collapse identical heavy sequences

In [26]:
# Collapse Sample1 to one row per identical `heavy_sequence`, carrying the
# summed duplicate_count on the surviving row.
sample_1_graph = sample1.copy()

group_keys = ["sample", "heavy_sequence"]
sample_1_graph['duplicate_count'] = (
    sample_1_graph.groupby(group_keys)['duplicate_count'].transform('sum')
)
sample_1_graph = sample_1_graph.drop_duplicates(subset=group_keys)

# Number of collapsed rows per sample, attached to every row of that sample.
# Merging on a column yields a fresh 0..n-1 index, used as node id below.
all_grpd = sample_1_graph.groupby(['sample']).size().to_frame('clonotype_count').reset_index()
sample_1_graph = sample_1_graph.merge(all_grpd[["clonotype_count", "sample"]], on="sample")

# Node size: area proportional to duplicate_count (smallest count -> about 0.1),
# diameter derived from that area.
area_per_count = 0.1 / sample_1_graph["duplicate_count"].min()
sample_1_graph["gephi_area"] = sample_1_graph["duplicate_count"] * area_per_count
sample_1_graph["gephi_diameter"] = sample_1_graph.apply(get_gephi_diameter, axis=1)

cdr3_list = sample_1_graph["heavy_junction_aa"].tolist()

# Pairwise Levenshtein distances between all junction sequences, reshaped to a
# long (source, target, weight) table without self-loops.
dm = [[distance(a, b) for b in cdr3_list] for a in cdr3_list]
edges = pd.DataFrame(dm, index=sample_1_graph.index, columns=sample_1_graph.index)
edges = edges.stack().reset_index()
edges.columns = ['source', 'target', 'weight']
edges = edges[edges["source"] != edges["target"]]

In [27]:
# NOTE: this cell rewrites `edges` from its own previous value, so it must be
# run exactly once after the cell that builds the raw distance edges;
# re-running it on its own would invert and trim the weights a second time.

# Turn distance into similarity: the smallest distance receives the largest
# weight (max - d + min), then shift by 1. `assign` returns a new frame instead
# of writing into the boolean-filtered slice produced by the previous cell.
edges = edges.assign(
    weight=edges["weight"].max() - edges["weight"] + edges["weight"].min() + 1
)

# Ascending sort + tail(3) keeps the three highest-weight (closest) targets of
# each source. Distances are small integers, so ties are common; a stable sort
# makes the tie-break reproducible (original row order) instead of depending on
# the default unstable quicksort.
edges = edges.sort_values(by="weight", kind="mergesort")
edges = edges.groupby("source").tail(3)

In [28]:
# Lookup table exposing each node id (row index of `sample_1_graph`) and its
# heavy_clone_id under both a "source" and a "target" spelling.
clone_info = (
    sample_1_graph
    .reset_index()
    .rename(columns={"index": "source"})
    .assign(
        target=lambda frame: frame["source"],
        heavy_clone_id_source=lambda frame: frame["heavy_clone_id"],
        heavy_clone_id_target=lambda frame: frame["heavy_clone_id"],
    )
)

# Attach the clone id of both endpoints, then keep only same-clone edges.
edges_test = edges.merge(clone_info[["source", "heavy_clone_id_source"]], on="source")
edges_test = edges_test.merge(clone_info[["target", "heavy_clone_id_target"]], on="target")
same_clone = edges_test["heavy_clone_id_source"] == edges_test["heavy_clone_id_target"]
edges = edges_test.loc[same_clone]

# Build an undirected graph from the edge list (edge attribute: "weight") and
# add every row index as a node so rows without any edge are still present.
G = nx.from_pandas_edgelist(edges, 'source', 'target', 'weight')
G.add_nodes_from(sample_1_graph.index.tolist())

# Node attributes, keyed by row index.
nx.set_node_attributes(G, sample_1_graph["gephi_diameter"].to_dict(), 'weight')
nx.set_node_attributes(G, sample_1_graph["heavy_junction_aa"].to_dict(), 'label')
nx.set_node_attributes(G, sample_1_graph["heavy_c_call"].to_dict(), 'c_call')
nx.set_node_attributes(G, sample_1_graph["heavy_clone_id"].to_dict(), 'heavy_clone_id')

# Not exported: a 'sample' node attribute. If it is ever enabled, read the
# column as sample_1_graph["sample"] -- attribute access resolves to the
# DataFrame.sample method, not the column.

# Write the graph as a GEXF file in the working directory.
nx.write_gexf(G, "synovial_all_connected_collapse_closest_3.gexf")