In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from itertools import combinations
import networkx as nx
import community
from utils.cluster import cluster
from utils.recombinations import recombinations
from matplotlib.colors import LogNorm, LinearSegmentedColormap
from pandas.io.formats.style import Styler
from utils.preprocessing import preprocessing
import os
import seaborn as sns
import igraph as ig
# Select matplotlib as igraph's plotting backend.
ig.config["plotting.backend"] = "matplotlib"
# NOTE(review): save() presumably persists this setting to the user's igraph
# configuration file, so it would outlive this kernel - confirm that is intended.
ig.config.save()

In [None]:
# Lookup from the patient codes used in the CSV to the subject labels (S1..S20)
# used throughout the rest of the notebook.
subject_id = {
    'PSO': 'S1',
    'YJH': 'S2',
    'WKM': 'S3',
    'KYB': 'S4',
    'HHJ': 'S5',
    'KJH': 'S6',
    'LCR': 'S7',
    'PSB': 'S8',
    'CKJ': 'S9',
    'JHO': 'S10',
    'YYS': 'S11',
    'KJY': 'S12',
    'KMJ': 'S13',
    'LYS': 'S14',
    'CJR': 'S15',
    'HKN': 'S16',
    'PSJ': 'S17',
    'KSK': 'S18',
    'LHS': 'S19',
    'KMO': 'S20'
}

conv = pd.read_csv('../Data/221121_sjogren_exclude_all_healthy_convergent.csv')

# 'patients' arrives as a '|'-joined string of patient codes; turn each entry
# into a list of subject labels. An unknown code raises KeyError.
conv['patients'] = [
    [subject_id[code] for code in joined_codes.split('|')]
    for joined_codes in conv['patients']
]
conv

In [None]:
def unique_shared_clonotype(shared_clonotype):
    """Collect one clonotype's rows from every patient that carries it.

    Parameters
    ----------
    shared_clonotype : mapping-like (e.g. a DataFrame row)
        Must provide 'patients' (an iterable of patient labels) and the
        'v_call', 'j_call' and 'cdr3_aa' values identifying the clonotype.

    Returns
    -------
    pandas.DataFrame
        The per-patient groups concatenated with the patient label as the
        outer index level, plus a boolean 'isUnique' column that is True for
        rows whose 'sequence' value occurs exactly once in the result.

    Notes
    -----
    Reads '../Data/new_sjogren_file/<patient>_add_d_gene.tsv' for each patient.
    NOTE(review): calling the `preprocessing` instance presumably returns a
    groupby keyed on (v_call, j_call, cdr3_aa) - confirm in utils.preprocessing.
    """
    patient_labels = shared_clonotype['patients']
    clonotype_key = (
        shared_clonotype['v_call'],
        shared_clonotype['j_call'],
        shared_clonotype['cdr3_aa'],
    )

    per_patient_rows = []
    for patient in patient_labels:
        repertoire = pd.read_csv(f'../Data/new_sjogren_file/{patient}_add_d_gene.tsv', sep='\t')
        grouped = preprocessing(repertoire)()
        per_patient_rows.append(grouped.get_group(clonotype_key))

    combined = pd.concat(per_patient_rows, keys=patient_labels)
    # keep=False flags every copy of a repeated sequence, so negating it leaves
    # True only on sequences that appear once across all patients.
    combined['isUnique'] = ~combined['sequence'].duplicated(keep=False)
    return combined

In [None]:

# Number of leading rows of `conv` to expand into per-patient sequence tables.
# NOTE(review): why 91 was chosen is not visible in this notebook - confirm.
n_clonotypes = 91

# head() selects the rows once (instead of repeating the literal in four
# places) and tolerates `conv` having fewer rows than requested.
selected_clonotypes = conv.head(n_clonotypes)

datas = [unique_shared_clonotype(row) for _, row in selected_clonotypes.iterrows()]

# Outer index level is the clonotype label "v_call|j_call|cdr3_aa".
concat_clone = pd.concat(
    datas,
    keys=selected_clonotypes['v_call'] + '|' + selected_clonotypes['j_call'] + '|' + selected_clonotypes['cdr3_aa'],
)
concat_clone

In [None]:

marked_csv_path = '../Data/sjogren_unique_sequences_marked.csv'

# Write with the index and read back without index_col: the index levels come
# back as ordinary columns.
concat_clone.to_csv(marked_csv_path)
concat_clone = pd.read_csv(marked_csv_path)
concat_clone

In [None]:

# Give the first two auto-named columns meaningful names.
index_column_names = {"Unnamed: 0": "clonotype", "Unnamed: 1": "patient_id"}
concat_clone = concat_clone.rename(columns=index_column_names)
concat_clone

In [None]:

# Number of distinct patients in which each nucleotide sequence was observed.
sequence_group = concat_clone.groupby('sequence')['patient_id'].nunique()

# A sequence is "shared" when more than one patient carries it. Mapping the
# Series directly is vectorized and does not raise on a missing sequence.
concat_clone['isShared'] = concat_clone['sequence'].map(sequence_group) > 1

# Persist the annotated table so later reads of this CSV see the 'clonotype',
# 'patient_id' and 'isShared' columns. Previously the write was commented out
# and the frame was re-read from disk, discarding the columns computed above.
concat_clone.to_csv('../Data/sjogren_unique_sequences_marked.csv', index=False)
concat_clone

In [None]:

clone_marked = pd.read_csv('../Data/sjogren_unique_sequences_marked.csv')

# Pull out the rows of a single clonotype for inspection; index 1 is the
# second distinct clonotype label in order of appearance.
clonotype_labels = clone_marked['clonotype'].unique()
rows_by_clonotype = clone_marked.groupby('clonotype')
clone1 = rows_by_clonotype.get_group(clonotype_labels[1])
clone1


In [None]:

# shared['sequence'].value_counts()
# Create a DataFrame
def show_single_clonotype(df, full_df):
    """Draw and save a patient-by-sequence frequency heatmap for one clonotype.

    The left panel is a heatmap of log10(frequency) with one row per patient
    and one column per distinct sequence in `df`; cells with no (or zero)
    frequency are drawn white. The right panel is a table stating, per patient,
    whether `full_df` contains a sequence that is not shared with another
    patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot. Reads the 'sequence', 'patient_id', 'frequency' and
        'clonotype' columns. Must be non-empty; the first distinct 'clonotype'
        value (formatted "v_call|j_call|cdr3_aa") is used for the title and
        the output file name.
    full_df : pandas.DataFrame
        All rows of the clonotype. Reads 'patient_id', 'isUnique' and
        'isShared'.

    Returns
    -------
    None
        The figure is written under
        '../Data/new_sjogren_contamination_analysis/' and left open; callers
        producing many figures may want to close them with `plt.close`.
    """
    sequences = df['sequence'].unique()
    patients = df['patient_id'].unique()
    clonotype = df['clonotype'].unique()[0]
    trimmed_sequences = [sequence[:10] for sequence in sequences]

    # Position lookups replace a linear search of `sequences` for every row.
    row_of_patient = {patient: row for row, patient in enumerate(patients)}
    col_of_sequence = {sequence: col for col, sequence in enumerate(sequences)}

    frequency_matrix = np.zeros((len(patients), len(sequences)), dtype=float)
    # If a patient lists the same sequence more than once, the last row wins.
    for patient, sequence, frequency in zip(df['patient_id'], df['sequence'], df['frequency']):
        frequency_matrix[row_of_patient[patient], col_of_sequence[sequence]] = frequency

    # Mask empty cells before taking the log so that no divide-by-zero warning
    # is raised and the "bad" colour below is applied to them explicitly.
    log_frequencies = np.ma.log10(np.ma.masked_less_equal(frequency_matrix, 0))

    cmap_colors = [(0.9, 0.9, 0.9), (0.6, 0.2, 0.2), (0.1, 0.1, 0.1)]
    custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", cmap_colors, N=256)
    custom_cmap.set_bad(color='white')

    fig = plt.figure(figsize=(12, 6))
    ax = plt.subplot2grid((1, 3), (0, 0), colspan=2)
    ax.grid(False)
    heatmap = ax.imshow(log_frequencies, cmap=custom_cmap)

    ax.set_xticks(np.arange(len(sequences)))
    ax.set_yticks(np.arange(len(patients)))
    ax.set_xticklabels(trimmed_sequences)
    ax.set_yticklabels(patients)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    cbar = fig.colorbar(heatmap, ax=ax, shrink=0.6)
    cbar.set_label("log10(frequency)")

    ax.set_title(f"{clonotype}")
    ax.set_xlabel("Shared Sequences (first 10 nt shown)")
    ax.set_ylabel("Patients")

    # "Yes" when the patient has a row flagged isUnique, or a non-unique row
    # that is not shared with another patient.
    table_data = []
    for patient in patients:
        patient_rows = full_df[full_df['patient_id'] == patient]
        non_unique_rows = patient_rows[patient_rows['isUnique'] == False]
        has_unique = bool(
            patient_rows['isUnique'].eq(True).any()
            or non_unique_rows['isShared'].eq(False).any()
        )
        table_data.append([patient, 'Yes' if has_unique else 'No'])

    ax1 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
    ax1.axis('off')  # the table panel needs no axes
    table = ax1.table(cellText=table_data, colLabels=['Patient ID', 'Has Unique Sequences'], loc='center',
                      colLoc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    plt.tight_layout()

    # File name is "<v_call>_<j_call>_<cdr3_aa>"; no extension is given, so
    # matplotlib picks its default output format.
    output_dir = '../Data/new_sjogren_contamination_analysis/'
    os.makedirs(output_dir, exist_ok=True)
    file_stem = '_'.join(clonotype.split('|')[:3])
    fig.savefig(os.path.join(output_dir, file_stem), dpi=500)

In [None]:

# Draw one heatmap per clonotype that has at least one sequence shared between
# patients. Iterating over the groups visits every clonotype exactly once; the
# previous loop ignored its counter and re-plotted the clonotype at index 1 on
# every pass. sort=False keeps the clonotypes in order of first appearance.
for _, temp in clone_marked.groupby('clonotype', sort=False):
    shared_temp = temp[(temp['isShared'] == True)]
    if not shared_temp.empty:
        show_single_clonotype(shared_temp, temp)
