In [None]:
#import the library
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler
import seaborn as sns
import rpy2.rinterface_lib.callbacks
import logging
import tensorflow as tf
import scipy.sparse
import os

from rpy2.robjects import pandas2ri
import anndata2ri

import importlib
import warnings
warnings.filterwarnings("ignore")
import pickle as pkl
from matplotlib.colors import LinearSegmentedColormap

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap used for expression plots: light grey at 0, red at 1.
# The RGB list is deliberately not called `colors`, so it does not overwrite the
# `matplotlib.colors` module that the first cell imports under that name.
values = [0,1]
cmap_rgb = [(227, 227, 227), (255, 42, 18)]  # one 0-255 RGB triple per stop
norm = plt.Normalize(min(values), max(values))
# from_list takes (position, colour) pairs; the RGB triples are rescaled to 0-1.
my_cmap = LinearSegmentedColormap.from_list(
    '', [(norm(value), tuple(np.array(rgb) / 255)) for value, rgb in zip(values, cmap_rgb)])

In [None]:
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()

In [None]:
%%R
.libPaths(.libPaths('win-library\\4.3'))

library(scran)
library(Seurat)
library(RColorBrewer)
library(slingshot)
library(monocle)
library(gam)
library(ggplot2)
library(plyr)
library(MAST)
library(clusterExperiment)
library(monocle3)
library(SeuratWrappers)
library(magrittr)
library(dplyr)

In [None]:
import h5py
from scipy.sparse import csr_matrix

# Open the HDF5 file and read the sparse-matrix buffers and gene names into memory.
file_path = 'GSE178341/GSE178341_crc10x_full_c295v4_submit.h5'
with h5py.File(file_path, 'r') as f:
    data = f['matrix/data'][()]
    indices = f['matrix/indices'][()]
    indptr = f['matrix/indptr'][()]
    shape = f['matrix/shape'][()]
    genes = f['matrix/features/name'][()]

# Gene names come back as byte strings. Decode them directly instead of slicing the
# "b'...'" repr, which returns the wrong text for any name containing a quote character.
gene_ID = [g.decode('utf-8') if isinstance(g, bytes) else str(g) for g in genes]

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap used for expression plots: light grey at 0, red at 1.
# The RGB list is deliberately not called `colors`, so it does not overwrite the
# `matplotlib.colors` module that the first cell imports under that name.
values = [0,1]
cmap_rgb = [(227, 227, 227), (255, 42, 18)]  # one 0-255 RGB triple per stop
norm = plt.Normalize(min(values), max(values))
# from_list takes (position, colour) pairs; the RGB triples are rescaled to 0-1.
my_cmap = LinearSegmentedColormap.from_list(
    '', [(norm(value), tuple(np.array(rgb) / 255)) for value, rgb in zip(values, cmap_rgb)])

In [None]:
my_palette = ['#0351A8','#8CB0E0','#D56D11','#FFBB78','#234E08','#53CB8B','#D30083','#CB788D','#4E195A','#C58CCF','#AA290F','#B03FD1','#E8BCCF','#64605F','#B2AD9A','#D2D30B','#D1BD4F','#06DCF2','#9EDAE5','#517219','#5B43CF','#D92F24','#FFD900','#002F33','#B8A3A3']

In [None]:
# Build the expression matrix from the raw compressed-sparse buffers read from the HDF5 file.
# The buffers are interpreted as CSR rows while the stored shape is swapped to
# (shape[1], shape[0]).
# NOTE(review): this presumably reinterprets a column-compressed genes x cells matrix as its
# cells x genes transpose without copying — confirm against the file layout.
matrix = csr_matrix((data, indices, indptr), shape=[shape[1],shape[0]])

In [None]:
meta_data = pd.read_csv('GSE178341/GSE178341_crc10x_full_c295v4_submit_metatables.csv',index_col = 0)

In [None]:
celltype= pd.read_csv('GSE178341/crc10x_full_c295v4_submit_cluster.csv',index_col = 0)

In [None]:
import anndata
# Wrap the sparse matrix in an AnnData object and label both axes.
adata =anndata.AnnData(X = matrix)
adata.var_names = gene_ID
# NOTE(review): obs names are taken positionally from the metadata table, which assumes the
# CSV rows are in the same order as the matrix rows — confirm against the barcodes in the
# HDF5 file.
adata.obs_names = meta_data.index

In [None]:
adata.obs = meta_data.loc[adata.obs_names,:]

In [None]:
for i in celltype.columns:
    adata.obs[i] = celltype.loc[adata.obs_names,i]

In [None]:
adata.write('GSE178341.h5ad')

In [None]:
adata = sc.read_h5ad('GSE178341_processed.h5ad')

In [None]:
adata.var_names_make_unique()

In [None]:
adata_17 = sc.read_h5ad('GSE178341_T17_refined.h5ad')

In [None]:
adata_17_raw = adata[adata_17.obs_names,:]

In [None]:
sc.pl.tsne(adata, color = ['PDCD1','CTLA4','TRDC','IFNG'], cmap = my_cmap, size = 20)

In [None]:
# Count cells in the refined T17 subset that express both TFF3 and CD3D (value > 0).
# `.toarray()` is used instead of the `.A` shorthand, which recent SciPy releases no longer
# provide on sparse containers.
tff3_pos = adata_17[:,'TFF3'].X.toarray() > 0
cd3d_pos = adata_17[:,'CD3D'].X.toarray() > 0
dp_test = tff3_pos & cd3d_pos
# NumPy reduction instead of the builtin sum(), which iterates element by element in Python.
int(dp_test.sum())

In [None]:
colnames = adata_17_raw.obs_names
rownames = adata_17_raw.var_names
counts = adata_17_raw.X.T

In [None]:
%%R -i colnames -i rownames -i counts
colnames(counts) = colnames
rownames(counts) = rownames

In [None]:
%R srat <- CreateSeuratObject(counts = counts)

In [None]:
%R saveRDS(srat, 'GSE178341_T17.rds')

In [None]:
# Per-cell QC metrics: total counts, log counts, number of detected genes, and the
# fraction of counts from mitochondrial ('MT-') and ribosomal ('RPL'/'RPS') genes.
# Row sums go through np.asarray(...).ravel() so each is a flat 1-D vector whether X is a
# SciPy sparse matrix (sum returns an (n, 1) np.matrix) or a dense array (sum returns 1-D).
# The previous `np.array(...ravel())[0]` form only worked for the np.matrix case.
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Boolean gene masks as NumPy arrays (valid indexers for both X and var_names).
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)
adata.obs['mt_frac'] = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()/adata.obs['n_counts']
ribo_gene_mask = np.array([gene.startswith('RPL') or gene.startswith('RPS') for gene in adata.var_names], dtype=bool)
adata.obs['ribo_frac'] = np.asarray(adata.X[:, ribo_gene_mask].sum(axis=1)).ravel()/adata.obs['n_counts']

In [None]:
adata.var_names[mt_gene_mask]

In [None]:
# Apply the QC thresholds one after another, printing how many cells remain after each.
print(f'Total number of cells: {adata.n_obs:d}')

# Step 1: keep cells with at least 500 total counts.
sc.pp.filter_cells(adata, min_counts=500)
print(f'Number of cells after min count filter: {adata.n_obs:d}')

# Step 2: keep cells whose mitochondrial fraction is below 0.2.
low_mt_cells = adata.obs['mt_frac'] < 0.2
adata = adata[low_mt_cells]
print(f'Number of cells after MT filter: {adata.n_obs:d}')

# Step 3: keep cells with at least 200 detected genes.
sc.pp.filter_cells(adata, min_genes=200)
print(f'Number of cells after gene filter: {adata.n_obs:d}')


In [None]:
sc.pp.normalize_total(adata, key_added = 'normalization_factors')
sc.pp.log1p(adata)

In [None]:
sc.pp.neighbors(adata, n_pcs = 50)

In [None]:
sc.pl.pca(adata, color=['CD4'], legend_loc = 'on data')

In [None]:
%matplotlib inline
sc.tl.tsne(adata)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(adata, color=['cl295v11SubFull','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'], cmap = my_cmap)

In [None]:
sc.tl.leiden(adata, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
sc.pl.tsne(adata, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'], size =30, cmap = my_cmap, legend_loc = 'on data')

In [None]:
export_df = adata.obs

In [None]:
# T component per cell: the text following 'pt' in the lower-cased TumorStage value;
# missing values (which stringify to 'nan') are recorded as 'N/A'.
TNM_T = [
    str(stage).lower().split('pt')[1] if str(stage) != 'nan' else 'N/A'
    for stage in export_df['TumorStage']
]

In [None]:
# N component per cell: the text following the first 'N' in NodeStatus_detailed;
# missing values (which stringify to 'nan') are recorded as 'N/A'.
TNM_N = [
    str(node).split('N')[1] if str(node) != 'nan' else 'N/A'
    for node in export_df['NodeStatus_detailed']
]

In [None]:
# M component per cell: the character after the first 'M' in MetastasisStatus, plus the
# following character when it is a letter (e.g. '1a'). Missing values become 'N/A'.
TNM_M = list()
for status in export_df['MetastasisStatus']:
    status = str(status)
    if status != 'nan':
        m_pos = status.index('M')
        # Slices instead of single-character indexing: a value that ends right after the
        # digit (e.g. 'M0') has no character at m_pos+2 and used to raise IndexError.
        if status[m_pos+2:m_pos+3].isalpha():
            rec = status[m_pos+1:m_pos+3]
        else:
            rec = status[m_pos+1:m_pos+2]
        # Nothing after the 'M' at all: treat as unknown rather than storing ''.
        rec = rec or 'N/A'
    else:
        rec = 'N/A'
    TNM_M.append(rec)

In [None]:
# Copy selected clinical annotations into shorter column names on adata.obs.
for new_col, source_col in [
    ('patient', 'PID'),
    ('gender', 'Sex'),
    ('age', 'Age'),
    ('tissue', 'HistologicTypeSimple'),
    ('site', 'TissueSite_detailed'),
]:
    adata.obs[new_col] = export_df[source_col]

# Attach the parsed T / N / M components (one entry per cell, in obs order).
for new_col, parsed in [('TNM_T', TNM_T), ('TNM_N', TNM_N), ('TNM_M', TNM_M)]:
    adata.obs[new_col] = parsed

In [None]:
set(adata.obs['site'])

In [None]:
def convert_tnm_to_stage(t, n, m):
    """
    Convert TNM components to a simplified overall cancer stage label.

    Only the first character of each component is inspected, so the inputs are
    expected to be the text that follows the T / N / M letter (e.g. '3', '4a',
    '1b') or a placeholder such as 'N/A'.

    Parameters:
    t (str): Tumor component, e.g. '1', '2', '3', '4a' or 'N/A'.
    n (str): Node component, e.g. '0', '1a', '2b' or 'N/A'.
    m (str): Metastasis component, e.g. '0', '1', '1a' or 'N/A'.

    Returns:
    str: '0', 'I', 'II', 'III' or 'IV'; 'N/A' when M is not 1 and the first
    character of T or N is not numeric (including empty or missing input).
    """
    # str() plus slicing instead of [0]: empty strings and non-string values such as
    # NaN become "unknown" instead of raising IndexError / TypeError.
    t = str(t)[:1].lower()
    n = str(n)[:1].lower()
    m = str(m)[:1].lower()
    # Handling metastasis first because it overrides other categories
    if m == '1':
        return 'IV'
    if not (t.isnumeric() and n.isnumeric()):
        return 'N/A'

    # Mapping T and N to stages (m is known not to be '1' from here on)
    if t == '0' and n == '0':
        return '0'
    if t == '1' and n == '0':
        return 'I'
    if (t == '2' and n == '0') or (t == '1' and n == '1'):
        return 'II'
    if (t == '2' and n == '1') or (t == '3' and n in ('0', '1')):
        return 'III'

    # Every remaining numeric combination (e.g. T4 with any N, or N2 and above) falls
    # through to the highest stage.
    # NOTE(review): this labels non-metastatic T4 / N2 cases as 'IV'; confirm that this
    # simplified mapping is intended rather than a site-specific staging table.
    return 'IV'

In [None]:
# Derive one stage label per cell from its T / N / M components.
# The column is built as a list and assigned once. The previous per-row form
# `adata.obs['stage'][i] = ...` is chained assignment with positional integer keys on a
# label-indexed Series, which pandas does not guarantee writes back to adata.obs.
adata.obs['stage'] = [
    convert_tnm_to_stage(t, n, m)
    for t, n, m in zip(adata.obs['TNM_T'], adata.obs['TNM_N'], adata.obs['TNM_M'])
]

In [None]:
adata.obs['gender'] = adata.obs['gender'].replace({'F':'Female','M':'Male'})

In [None]:
sc.pl.tsne(adata,color = ['patient','tissue','gender','age','site','TNM_T','TNM_N','TNM_M','stage'])

In [None]:
sc.pl.tsne(adata, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'], size =30, cmap = my_cmap, legend_loc = 'on data')

In [None]:
# Give each patient's 'Normal colon' cells the highest known T value recorded for that
# patient, so normal tissue carries the stage of the matching tumour.
# Fixes over the previous version: `else if` was a SyntaxError; the loop inspected the
# integer index instead of the per-patient set; and only 'nan' was treated as missing
# although the TNM_T parsing cell stores missing values as 'N/A'.
MISSING_T = {'nan', 'N/A'}
is_normal = np.array(adata.obs['tissue'] == 'Normal colon')
t_by_patient = adata.obs.groupby('patient')['TNM_T'].apply(set)  # computed once, not per iteration
for pid, tnmt in t_by_patient.items():
    known = {str(v) for v in tnmt} - MISSING_T
    if not known:
        # No usable T value for this patient: leave the normal cells unchanged.
        continue
    is_patient = np.array(adata.obs['patient'] == pid)
    # Single .loc assignment instead of chained indexing, so the write reaches adata.obs.
    adata.obs.loc[is_patient & is_normal, 'TNM_T'] = max(known)

In [None]:
adata.write('GSE178341_processed.h5ad')

In [None]:
adata = sc.read_h5ad('GSE178341_processed.h5ad')

In [None]:
'0','1','28','36','3','5','15','25','11','21','8','16'

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['0']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['5']), resolution = 0.6, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['10']), resolution = 0.4, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden2', ['7']), resolution = 0.5, key_added= 'leiden3')
sc.pl.tsne(potential_17, color=['leiden3','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_1 = potential_17[potential_17.obs['leiden2'].isin(['6,1','6,4']),:]

In [None]:
'0','1','28','36','3','5','15','25','11','21','8','16'

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['1']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['1']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 20, cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['6']), resolution = 0.5, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['1']), resolution = 0.5, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden2', ['7']), resolution = 0.5, key_added= 'leiden3')
sc.pl.tsne(potential_17, color=['leiden3','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_2 = potential_17[potential_17.obs['leiden'].isin(['6','10']),:]

In [None]:
'28','36','5','15','25','11','21','8','16'

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['3']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 20, cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['13']), resolution = 0.3, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['6']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_3 = potential_17[potential_17.obs['leiden2'].isin(['5','13,1','16','17']),:]

In [None]:
'5','15','25','11','21','8','16'

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['28','36']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 50, cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['1']), resolution = 0.5, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['6']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_4 = potential_17[potential_17.obs['leiden1'].isin(['1,2']),:]

In [None]:
'5','15','25','11','21','16'

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['8']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['8']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 40, cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['4']), resolution = 0.3, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['6']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_5 = potential_17[potential_17.obs['leiden1'].isin(['4,0','6']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['27']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['27']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 40, cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden' , swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_10 = potential_17[potential_17.obs['leiden'].isin(['1','2','3','4','5','6','8']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['38']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 80, cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_8 = adata[adata.obs['leiden'].isin(['38']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['43']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 70, cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden' , swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_9 = potential_17[potential_17.obs['leiden'].isin(['0','1','3','4','5','6']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['40']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 40, cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden' , swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_7 = potential_17[potential_17.obs['leiden'].isin(['0','1','2','3','4','5']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['5','15','25','11','16']),:]

In [None]:
sc.pp.neighbors(potential_17, n_pcs = 50)

In [None]:
sc.pl.pca(potential_17, color=['CD4'], legend_loc = 'on data')

In [None]:
sc.tl.leiden(potential_17, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
%matplotlib inline
sc.tl.tsne(potential_17)
#plt.close()
plt.rcParams['axes.linewidth'] = 2

In [None]:
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'], size = 30, cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['9']), resolution = 0.3, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['4']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden2', ['6']), resolution = 0.6, key_added= 'leiden3')
sc.pl.tsne(potential_17, color=['leiden3','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_6 = potential_17[potential_17.obs['leiden3'].isin(['1','7','9,1','14','18']),:]

In [None]:
# Concatenate the ten per-cluster T17 subsets, tagging each cell with its cluster of origin.
# The slice has to be [1:]: with [1:9] only nine objects in total were passed against ten
# batch_categories, which is the likely reason this call stopped working.
# T17_list is built in the next cell and must exist before this cell is run.
adata_17 = T17_list[0].concatenate(T17_list[1:], batch_key = 'original_cluster', batch_categories=['0','1','3','28-36','8','5-15-25-11-16','40','38','43','27'],join = 'outer',fill_value=0)

In [None]:
# Reload the ten saved per-cluster T17 subsets (T17_0.h5ad ... T17_9.h5ad) from disk.
# In-memory alternative:
#T17_list = [T17_1, T17_2,T17_3,T17_4,T17_5,T17_6,T17_7,T17_8,T17_9,T17_10]
T17_list = [sc.read_h5ad('T17_'+str(idx)+'.h5ad') for idx in range(10)]

# Cell names of every subset, in the same order as the subsets appear in T17_list.
obs_names = np.array(
    [name for subset in T17_list for name in subset.obs_names]
).flatten()

In [None]:
from scipy.sparse import vstack
# Stack the expression matrices of all subsets row-wise, in T17_list order.
# Iterating over the list instead of spelling out indices 0..9 keeps this correct if the
# number of subsets changes.
X_stack = vstack([subset.X for subset in T17_list])

In [None]:
import anndata
adata_17 = anndata.AnnData(X = X_stack)
adata_17.obs_names = obs_names
adata_17.var_names = T17_list[0].var_names

In [None]:
adata_17.obs = adata.obs.loc[adata_17.obs_names,:]

In [None]:
sc.pp.highly_variable_genes(adata_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(adata_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(adata_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(adata_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(adata_17)

In [None]:
%matplotlib inline
sc.pl.tsne(adata_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(adata_17, restrict_to = ('leiden', ['3']), resolution = 0.5, key_added= 'leiden1')
sc.pl.tsne(adata_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(adata_17, restrict_to = ('leiden1', ['1']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(adata_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(adata_17, restrict_to = ('leiden3', ['10']), resolution = 0.5, key_added= 'leiden4')
sc.pl.tsne(adata_17, color=['leiden4','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(adata_17, restrict_to = ('leiden2', ['1,1']), resolution = 0.4, key_added= 'leiden3')
sc.pl.tsne(adata_17, color=['leiden3','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in adata_17.var_names]
IL17_genes = adata_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(adata_17,IL17_exp_set,groupby = 'leiden4' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
adata_17_refined = adata_17[adata_17.obs['leiden4'].isin(['1,0','1,1,0','1,4','1,5','1,6','3,0','3,1','3,2','3,3','3,4','10,3','11','14','15','16']),:]

In [None]:
sc.pl.tsne(adata_17_refined, color=['leiden4','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
adata_17_refined.write('GSE178341_T17_refined.h5ad')

In [None]:
adata_17_refined= sc.read_h5ad('GSE178341_T17_refined.h5ad')

In [None]:
T_adata = sc.read_h5ad('GSE178341_T.h5ad')

In [None]:
# Library-size normalisation followed by log1p on the full `adata` object.
# NOTE(review): `adata` was last reloaded from 'GSE178341_processed.h5ad', which an earlier
# cell writes after this same normalize_total + log1p pair has already been applied, so
# running this cell appears to normalise and log-transform a second time — confirm it is
# still needed (the cells that follow operate on T_adata, not adata).
sc.pp.normalize_total(adata, key_added = 'normalization_factors')
sc.pp.log1p(adata)

In [None]:
T_adata = T_adata[T_adata.obs['cl295v11SubFull'].isin([
 'cTNI17 (gd-like T)',
 'cTNI18 (gd-like T PDCD1+)',
 'cTNI19 (gd-like T prolif)',
 'cTNI20 (PLZF+ T)',
 'cTNI21 (PLZF+ T prolif)',
 'cTNI23 (NK CD16A+)',
 'cTNI24 (NK GZMK+)',
 'cTNI25 (NK XCL1+)']),:]

In [None]:
del T_adata.uns['log1p']

In [None]:
sc.pp.highly_variable_genes(T_adata, flavor='seurat', n_top_genes=3000)
sc.pp.pca(T_adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(T_adata, n_neighbors = 15, n_pcs = 50)
#sc.tl.leiden(T_adata, resolution = 0.8, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(T_adata)

In [None]:
%matplotlib inline
plt.rcParams['axes.linewidth'] = 2
fig = sc.pl.tsne(T_adata, color = ['cl295v11SubFull','ZBTB16','IKZF2','TRGV4','TRGC1','CD3D','CD3G','CD3E',
                                   'CD247','IFNG','IL17A','TRDV1','TRDV2','TRDV3','CD4','CD8A','CD8B',
                                   'PDCD1','KLRK1','RORC','TRAV1-2','NCAM1','KLRB1','KLRF1','TRDC'],size = 20, legend_loc = 'on data', palette  = my_palette, ncols = 4, cmap = my_cmap, return_fig = True, legend_fontsize = 'large')
ax = fig.get_axes()
for i in range(0,len(ax)):
    ax[i].xaxis.label.set_fontsize(22)
    ax[i].xaxis.label.set_fontweight('bold')
    ax[i].yaxis.label.set_fontsize(22)
    ax[i].title.set_fontsize(30)
    ax[i].yaxis.label.set_fontweight('bold')
    ax[i].title.set_fontweight('bold')
#plt.savefig('overall_map_withunmatched.png')

In [None]:
%matplotlib inline
plt.rcParams['axes.linewidth'] = 2
fig = sc.pl.tsne(T_adata, color = ['cl295v11SubFull'],size = 20, palette  = my_palette, ncols = 4, cmap = my_cmap, return_fig = True, legend_fontsize = 'large')
ax = fig.get_axes()
for i in range(0,len(ax)):
    ax[i].xaxis.label.set_fontsize(22)
    ax[i].xaxis.label.set_fontweight('bold')
    ax[i].yaxis.label.set_fontsize(22)
    ax[i].title.set_fontsize(30)
    ax[i].yaxis.label.set_fontweight('bold')
    ax[i].title.set_fontweight('bold')
#plt.savefig('overall_map_withunmatched.png')

In [None]:
gamma_genesmask = [gene.startswith("TRG") for gene in T_adata.var_names]
gamma_genes = T_adata.var_names[gamma_genesmask]
delta_genesmask = [gene.startswith("TRD") for gene in T_adata.var_names]
delta_genes = T_adata.var_names[delta_genesmask]

In [None]:
TCR_exp_set ={
"gamma": gamma_genes, "delta": delta_genes, "CD3S": ['CD3E','CD3D','CD3G']
}



sc.pl.dotplot(T_adata,TCR_exp_set,groupby = 'cl295v11SubFull' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(T_adata, restrict_to = ('cl295v11SubFull', ['cTNI18 (gd-like T PDCD1+)']), resolution = 0.9, key_added= 'leiden1')
sc.pl.tsne(T_adata, color=['leiden1','CD3E','CD3G','CD3D','TRDC'], size = 30, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(T_adata, restrict_to = ('leiden1', ['cTNI17 (gd-like T)']), resolution = 0.9, key_added= 'leiden2')
sc.pl.tsne(T_adata, color=['leiden2','CD3E','CD3G','CD3D','TRDC'], size = 30, legend_loc = 'on data', cmap = my_cmap)

In [None]:
TCR_exp_set ={
"gamma": gamma_genes, "delta": delta_genes, "CD3S": ['CD3E','CD3D','CD3G']
}



sc.pl.dotplot(T_adata,TCR_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
keep_list = ['cTNI17 (gd-like T),'+str(i) for i in range(0,12)] + ['cTNI18 (gd-like T PDCD1+),18'] + ['cTNI18 (gd-like T PDCD1+),'+str(i) for i in [0,1,5,6]] +[ 'cTNI19 (gd-like T prolif)']

In [None]:
gd = T_adata[T_adata.obs['leiden2'].isin(keep_list),:]

In [None]:
sc.pl.tsne(gd, color=['leiden2','CD3E','CD3G','CD3D','TRDC'], size = 30, legend_loc = 'on data', cmap = my_cmap)

In [None]:
gd.write('GSE178341_gd.h5ad')

## Do they show any sign of IL-17 secretion?

In [None]:
def DE_to_df(_adata, rank_key, _padj_thresh=0.05, _logfc_thresh=1):
    """Tabulate the significantly up- and down-regulated genes of every group.

    Reads ``_adata.uns[rank_key]``, which must provide the structured arrays
    'names', 'logfoldchanges' and 'pvals_adj' with one field per group.

    Parameters:
    _adata: object exposing the ranking result through ``.uns[rank_key]``.
    rank_key (str): key of the ranking result inside ``_adata.uns``.
    _padj_thresh (float): genes need an adjusted p-value <= this value.
    _logfc_thresh (float): genes need a log fold change >= this value ("up")
        or <= minus this value ("down").

    Returns:
    tuple: ``(up, down)`` DataFrames with one column per group (in the field
    order of the ranking result). Each column lists the passing gene names
    sorted by decreasing absolute log fold change, with RPL*/RPS*
    and MT-* genes removed; shorter columns are padded with NaN.
    """
    _result = _adata.uns[rank_key]

    def process_genes(i, direction):
        _lfc = _result['logfoldchanges'][i].astype('double')
        _log2foldmask = (_lfc >= _logfc_thresh) if direction == "up" else (_lfc <= -_logfc_thresh)
        _pvalmask = _result['pvals_adj'][i].astype('double') <= _padj_thresh
        _keep = _log2foldmask & _pvalmask
        _additional = pd.DataFrame({
            i: _result['names'][i].astype('str')[_keep],
            'logfoldchanges_' + i: np.abs(_lfc[_keep])
        })
        # Sort BEFORE building the drop mask: the mask is applied by position,
        # so it has to describe the rows in their final (sorted) order.
        _additional = _additional.sort_values(by='logfoldchanges_' + i, ascending=False)
        _drop_id = np.array(
            [gene.startswith(('RPL', 'RPS', 'MT-')) for gene in _additional[i]],
            dtype=bool,
        )
        return _additional.iloc[~_drop_id, :].reset_index(drop=True)

    # Field order of the structured array gives a reproducible column order.
    _groups = _result['pvals_adj'].dtype.names
    _up_columns = [process_genes(i, "up")[i] for i in _groups]
    _down_columns = [process_genes(i, "down")[i] for i in _groups]

    # Concatenate once instead of growing a frame inside the loop.
    _pass_genes_up = pd.concat(_up_columns, axis=1) if _up_columns else pd.DataFrame()
    _pass_genes_down = pd.concat(_down_columns, axis=1) if _down_columns else pd.DataFrame()
    return _pass_genes_up, _pass_genes_down

In [None]:
# Wilcoxon ranking of genes for each published cluster; the result is stored under uns['rank'].
sc.tl.rank_genes_groups(
    T_adata,
    groupby='cl295v11SubFull',
    method='wilcoxon',
    key_added='rank',
)

In [None]:
# Turn the stored ranking into per-cluster tables of up- and down-regulated genes.
pass_genes_up, pass_genes_down = DE_to_df(
    T_adata,
    rank_key='rank',
    _padj_thresh=0.05,
    _logfc_thresh=1,
)

In [None]:
# Export the up-regulated gene table.
pass_genes_up.to_csv(path_or_buf='pass_genes_up.csv')

In [None]:
# Make the gene names of T_adata unique (modifies T_adata in place).
T_adata.var_names_make_unique()

In [None]:
def _genes_with_prefix(prefix):
    """Return (boolean mask, matching names) for the adata.var_names entries starting with prefix."""
    mask = [gene.startswith(prefix) for gene in adata.var_names]
    return mask, adata.var_names[mask]


gamma_genesmask, gamma_genes = _genes_with_prefix("TRGV")
gammac_genesmask, gammac_genes = _genes_with_prefix("TRGC")
delta_genesmask, delta_genes = _genes_with_prefix("TRDV")
deltac_genesmask, deltac_genes = _genes_with_prefix("TRDC")
# NOTE(review): the variable is called alpha but the prefix is TRBV; it is not plotted in this cell.
alpha_genesmask, alpha_genes = _genes_with_prefix("TRBV")

# Gamma V genes next to a hand-picked list of lineage markers.
TCR_exp_set = {
    "gamma_genes": gamma_genes,
    "others": ['IKZF2', 'ZBTB16', 'TCF7', 'CD28', 'TRDC', 'CD3E', 'CD4', 'CD8A', 'CD8B',
               'TBX21', 'TRAV1-2', 'NCAM1', 'TRAV24', 'TRAJ18'],
}

sc.pl.dotplot(
    T_adata,
    TCR_exp_set,
    groupby='subclustered4',
    standard_scale='var',
    vmax=1,
    dot_min=0.1,
    dot_max=1,
    swap_axes=False,
)

In [None]:
def _genes_with_prefix(prefix):
    """Return (boolean mask, matching names) for the adata.var_names entries starting with prefix."""
    mask = [gene.startswith(prefix) for gene in adata.var_names]
    return mask, adata.var_names[mask]


gamma_genesmask, gamma_genes = _genes_with_prefix("TRGV")
gammac_genesmask, gammac_genes = _genes_with_prefix("TRGC")
delta_genesmask, delta_genes = _genes_with_prefix("TRDV")
deltac_genesmask, deltac_genes = _genes_with_prefix("TRDC")
# Fix: the panel labelled "alpha" was built from the TRBV prefix (beta-chain
# V genes). Alpha-chain V genes carry the TRAV prefix.
alpha_genesmask, alpha_genes = _genes_with_prefix("TRAV")

# One dot-plot panel per TCR chain segment family.
TCR_exp_set = {
    "gamma V": gamma_genes,
    "gamma C": gammac_genes,
    "delta V": delta_genes,
    "delta C": deltac_genes,
    "alpha": alpha_genes,
}

sc.pl.dotplot(
    T_adata,
    TCR_exp_set,
    groupby='cl295v11SubFull',
    standard_scale='var',
    vmax=1,
    dot_min=0.05,
    dot_max=1,
    swap_axes=False,
    save='TCR_exp.png',
)

In [None]:
plt.rcParams.update({'figure.figsize': [8, 8]})

# Split cTNI13 at low resolution and show only the cells of that cluster.
parent_cluster = 'cTNI13 (CD8+ T IL17+)'
sc.tl.leiden(
    T_adata,
    resolution=0.1,
    restrict_to=('cl295v11SubFull', [parent_cluster]),
    key_added='subclustered',
)
in_parent = T_adata.obs['cl295v11SubFull'] == parent_cluster
sc.pl.tsne(T_adata[in_parent, :], color=['subclustered'], size=30)

In [None]:
plt.rcParams.update({'figure.figsize': [8, 8]})

# Split cTNI20 on top of the 'subclustered' labels; the result goes to 'subclustered2'.
parent_cluster = 'cTNI20 (PLZF+ T)'
sc.tl.leiden(
    T_adata,
    resolution=0.1,
    restrict_to=('subclustered', [parent_cluster]),
    key_added='subclustered2',
)
in_parent = T_adata.obs['subclustered'] == parent_cluster
sc.pl.tsne(T_adata[in_parent, :], color=['subclustered2'], size=30)

In [None]:
plt.rcParams.update({'figure.figsize': [8, 8]})

# Split cTNI18 on top of the 'subclustered2' labels; the result goes to 'subclustered3'.
parent_cluster = 'cTNI18 (gd-like T PDCD1+)'
sc.tl.leiden(
    T_adata,
    resolution=0.1,
    restrict_to=('subclustered2', [parent_cluster]),
    key_added='subclustered3',
)
# The plotted cells are selected through the 'subclustered' column, as in the original cell.
in_parent = T_adata.obs['subclustered'] == parent_cluster
sc.pl.tsne(T_adata[in_parent, :], color=['subclustered3'], size=30)

In [None]:
plt.rcParams.update({'figure.figsize': [8, 8]})

# Split cTNI21 on top of the 'subclustered3' labels; the result goes to 'subclustered4'.
parent_cluster = 'cTNI21 (PLZF+ T prolif)'
sc.tl.leiden(
    T_adata,
    resolution=0.1,
    restrict_to=('subclustered3', [parent_cluster]),
    key_added='subclustered4',
)
# The plotted cells are selected through the 'subclustered' column, as in the original cell.
in_parent = T_adata.obs['subclustered'] == parent_cluster
sc.pl.tsne(T_adata[in_parent, :], color=['subclustered4'], size=30)

In [None]:
# Display the distinct labels present in the 'subclustered4' column.
set(T_adata.obs['subclustered4'])

In [None]:
# 'subclustered4' labels to inspect more closely.
gd_candidate_labels = [
    'cTNI12 (CD8+ IL7R+)',
    'cTNI13 (CD8+ T IL17+),0',
    'cTNI18 (gd-like T PDCD1+),1',
    'cTNI20 (PLZF+ T),1',
    'cTNI21 (PLZF+ T prolif),1',
]
is_gd_candidate = T_adata.obs['subclustered4'].isin(gd_candidate_labels)
gd_T_temp = T_adata[is_gd_candidate, :]

In [None]:
# Colour the candidate cells by sub-cluster label and by the 'GD_enrichment' obs column
# (that column is written by add_gd_enrichment, so it must have been run on T_adata first).
sc.pl.tsne(gd_T_temp, color = ['subclustered4','GD_enrichment'])

In [None]:
def _genes_with_prefix(prefix):
    """Return (boolean mask, matching names) for the adata.var_names entries starting with prefix."""
    mask = [gene.startswith(prefix) for gene in adata.var_names]
    return mask, adata.var_names[mask]


gamma_genesmask, gamma_genes = _genes_with_prefix("TRGV")
gammac_genesmask, gammac_genes = _genes_with_prefix("TRGC")
delta_genesmask, delta_genes = _genes_with_prefix("TRDV")
deltac_genesmask, deltac_genes = _genes_with_prefix("TRDC")
alpha_genesmask, alpha_genes = _genes_with_prefix("TRAV")

# One dot-plot panel per TCR chain segment family.
TCR_exp_set = {
    "gamma V": gamma_genes,
    "gamma C": gammac_genes,
    "delta V": delta_genes,
    "delta C": deltac_genes,
    "alpha": alpha_genes,
}

sc.pl.dotplot(
    gd_T_temp,
    TCR_exp_set,
    groupby='subclustered4',
    standard_scale='var',
    vmax=1,
    dot_min=0.05,
    dot_max=1,
    swap_axes=False,
)

In [None]:
# Remove cTNI13 sub-clusters 2-14 from T_adata.
del_list = ['cTNI13 (CD8+ T IL17+),{}'.format(k) for k in range(2, 15)]
T_adata = T_adata[~T_adata.obs['subclustered'].isin(del_list), :]

In [None]:
# Save the filtered T-cell object.
T_adata.write(filename='GSE178341_T.h5ad')

In [None]:
def _genes_with_prefix(prefix):
    """Return (boolean mask, matching names) for the adata.var_names entries starting with prefix."""
    mask = [gene.startswith(prefix) for gene in adata.var_names]
    return mask, adata.var_names[mask]


gamma_genesmask, gamma_genes = _genes_with_prefix("TRGV")
gammac_genesmask, gammac_genes = _genes_with_prefix("TRGC")
delta_genesmask, delta_genes = _genes_with_prefix("TRDV")
deltac_genesmask, deltac_genes = _genes_with_prefix("TRDC")
# NOTE(review): the variable is called alpha but the prefix is TRBV; it is not plotted in this cell.
alpha_genesmask, alpha_genes = _genes_with_prefix("TRBV")

# Gamma V genes next to a hand-picked list of lineage markers.
TCR_exp_set = {
    "gamma_genes": gamma_genes,
    "others": ['IKZF2', 'ZBTB16', 'TCF7', 'CD28', 'CD3E', 'CD4', 'CD8A', 'CD8B',
               'TBX21', 'TRAV1-2', 'NCAM1', 'TRAV24', 'TRAJ18', 'HES4'],
}

sc.pl.dotplot(
    T_adata,
    TCR_exp_set,
    groupby='subclustered',
    standard_scale='var',
    vmax=1,
    dot_min=0,
    dot_max=1,
    swap_axes=False,
)

In [None]:
def _genes_with_prefix(prefix):
    """Return (boolean mask, matching names) for the adata.var_names entries starting with prefix."""
    mask = [gene.startswith(prefix) for gene in adata.var_names]
    return mask, adata.var_names[mask]


gamma_genesmask, gamma_genes = _genes_with_prefix("TRGV")
gammac_genesmask, gammac_genes = _genes_with_prefix("TRGC")
delta_genesmask, delta_genes = _genes_with_prefix("TRDV")
deltac_genesmask, deltac_genes = _genes_with_prefix("TRDC")
# Fix: the panel labelled "alpha" was built from the TRBV prefix (beta-chain
# V genes). Alpha-chain V genes carry the TRAV prefix.
alpha_genesmask, alpha_genes = _genes_with_prefix("TRAV")

# One dot-plot panel per TCR chain segment family.
TCR_exp_set = {
    "gamma V": gamma_genes,
    "gamma C": gammac_genes,
    "delta V": delta_genes,
    "delta C": deltac_genes,
    "alpha": alpha_genes,
}

# NOTE(review): another cell in this notebook also saves to 'TCR_exp.png';
# whichever runs last overwrites the other figure.
sc.pl.dotplot(
    T_adata,
    TCR_exp_set,
    groupby='subclustered',
    standard_scale='var',
    vmax=1,
    dot_min=0,
    dot_max=1,
    swap_axes=False,
    save='TCR_exp.png',
)

In [None]:
plt.rcParams.update({'font.size': 5, 'font.weight': 'heavy','axes.linewidth':5})
# The reset below restores every rcParam to its default, including the three set just above.
plt.rcParams.update(plt.rcParamsDefault)

# Marker panel for the candidate sub-clusters. 'S1PR1' and 'CCL4' each appear twice, as in the original list.
inference_markers = [
    'CD4', 'CD8A', 'CD8B', 'TRGV4', 'CCR7', 'SELL', 'CD27', 'CD28', 'IL7R', 'CD44',
    'CD38', 'HLA-DRB1', 'KLRG1', 'IL2', 'IL2RA', 'IL2RB', 'CD69', 'S1PR1', 'KLF2', 'ITGAE',
    'ITGA1', 'ITGB2', 'S1PR1', 'CCL4', 'GZMK', 'CD101', 'CX3CR1', 'TCF7', 'LEF1', 'PRDM1',
    'TBX21', 'EOMES', 'BACH2', 'GZMB', 'PRF1', 'FAS', 'FASLG', 'TNF', 'IFNG', 'NKG7',
    'CCL4', 'XCL1', 'XCL2', 'STAT3', 'CD40LG', 'TRAV1-2', 'PDCD1', 'HAVCR2', 'LAG3', 'MKI67',
    'TFRC', 'RORC', 'RORA', 'STAT1', 'STAT4', 'STAT5A', 'STAT6', 'RUNX1', 'RUNX3', 'CCR3',
    'CCR4', 'CCR5', 'CCL5', 'CCR6', 'CCR8', 'CCR10', 'CXCR3', 'CXCR4', 'CXCR5', 'CXCR6',
    'IL4', 'IL5', 'IL10', 'IL13', 'IL17A', 'IL21', 'IL22', 'IL6R', 'IL12RB2', 'IL15RA',
    'IL17RB', 'IL18R1', 'IL21R', 'IL23R', 'IL27RA', 'KLRD1', 'KLRK1', 'TNFRSF8', 'GZMA', 'LTA',
    'IFNGR2', 'HLA-DRA', 'SLC3A2', 'CTLA4', 'FOXP3', 'SMAD3', 'AHR', 'ENTPD1', 'NT5E', 'TGFB1',
    'ITGA2', 'BCL6', 'MAF', 'BTLA', 'ICOS', 'DPP4', 'GATA3', 'CCL20', 'IRF4', 'BATF',
    'TACR1', 'ZBTB16', 'TNFSF8', 'IKZF2',
]

sc.pl.dotplot(
    gd_T_temp,
    inference_markers,
    groupby='subclustered4',
    standard_scale='var',
    vmax=1,
    dot_min=0.01,
    swap_axes=False,
    save='Inference.png',
)
# Checked IL9, nothing

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def calculate_enrichment_scores(sparse_expression_data_csr, gene_scores):
    """Score every row of an expression matrix against per-gene weights.

    For each row, the expression values are multiplied element-wise by the
    gene weights, accumulated left to right, and the largest value reached by
    that running sum is kept as the row's score.

    Parameters:
    sparse_expression_data_csr: sparse matrix with one row per cell and one
        column per gene, in the same order as ``gene_scores``.
    gene_scores: object with ``to_numpy()`` yielding one weight per column.

    Returns:
    numpy.ndarray: one score per row.
    """
    weights = gene_scores.to_numpy()
    n_cells = sparse_expression_data_csr.shape[0]
    enrichment_scores = np.zeros(n_cells)

    for row in range(n_cells):
        # Densify one row at a time so the whole matrix never has to be held densely.
        dense_row = sparse_expression_data_csr[row, :].toarray().flatten()
        running_sum = np.cumsum(dense_row * weights)
        enrichment_scores[row] = running_sum.max()

    return enrichment_scores


def add_gd_enrichment(adata, reference_path):
    """Compute a per-cell score from a reference gene table and store it in ``adata.obs['GD_enrichment']``.

    The CSV at ``reference_path`` must contain the columns 'q_value',
    'normalized_effect' and 'gene_short_name'. Rows with -log10(q_value) > 0
    are kept, 'normalized_effect' is multiplied by -1, and genes are ordered by
    that value from highest to lowest. Expression of those genes is gathered
    from ``adata`` (genes absent from ``adata.var_names`` stay zero) and passed
    to ``calculate_enrichment_scores``.

    Parameters:
    adata: annotated data object; modified in place.
    reference_path: path of the reference CSV.

    Returns:
    None.
    """
    reference = pd.read_csv(reference_path, index_col=0)
    reference['-logQ'] = -np.log10(reference['q_value'])
    reference = reference[reference['-logQ'] > 0]

    # Flip the sign of the effect and rank the genes by it, strongest first.
    gene_list = pd.DataFrame(reference['normalized_effect']) * -1
    gene_list.index = reference['gene_short_name']
    gene_list = gene_list.sort_values(by='normalized_effect', ascending=False)
    print(gene_list)

    ranked_genes = gene_list.index
    expression_by_rank = scipy.sparse.lil_matrix((adata.shape[0], len(ranked_genes)))
    # Column position of each gene; for a repeated name the last position wins.
    column_of = dict(zip(ranked_genes, range(len(ranked_genes))))

    for gene in ranked_genes:
        if gene not in adata.var_names:
            continue
        # Copy this gene's expression column into its ranked position.
        expression_by_rank[:, column_of[gene]] = adata[:, gene].X

    # CSR gives cheap row access for the per-cell scoring loop.
    adata.obs['GD_enrichment'] = calculate_enrichment_scores(
        expression_by_rank.tocsr(), gene_list['normalized_effect']
    )

In [None]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
add_gd_enrichment(T_adata, reference_path='F:/CRC/refined_gd.csv')

In [None]:
# Published cluster labels and IKZF2 expression, stacked in a single column of panels.
sc.pl.tsne(
    T_adata,
    color=['cl295v11SubFull', 'IKZF2'],
    cmap=my_cmap,
    ncols=1,
)

In [None]:
# Published clusters cTNI07 through cTNI21 to keep in T_adata.
t_cell_clusters = [
    'cTNI07 (CD4+ CXCL13+)',
    'cTNI08 (CD4+ Treg)',
    'cTNI09 (CD4+ Treg prolif)',
    'cTNI10 (CD8+ IL7R+)',
    'cTNI11 (CD8+GZMK+)',
    'cTNI12 (CD8+ IL7R+)',
    'cTNI13 (CD8+ T IL17+)',
    'cTNI14 (CD8+ CXCL13+)',
    'cTNI15 (CD8+ CXCL13+ HSP+)',
    'cTNI16 (CD8+ CXCL13+ prolif)',
    'cTNI17 (gd-like T)',
    'cTNI18 (gd-like T PDCD1+)',
    'cTNI19 (gd-like T prolif)',
    'cTNI20 (PLZF+ T)',
    'cTNI21 (PLZF+ T prolif)',
]
in_t_cell_cluster = T_adata.obs['cl295v11SubFull'].isin(t_cell_clusters)
T_adata = T_adata[in_t_cell_cluster, :]

In [None]:
# Reload the saved subset from disk.
adata_gd = sc.read_h5ad(filename='GSE178341_gd.h5ad')

In [None]:
# Expression of RORC, IL17A and TRDC across the reloaded subset.
sc.pl.tsne(
    adata_gd,
    color=['RORC', 'IL17A', 'TRDC'],
    cmap=my_cmap,
)

In [None]:
# Alias of adata_gd.obs (same object, not a copy); the cells below read clinical columns from it.
export_df = adata_gd.obs

In [None]:
# T category per cell: the text following 'pt' in the lower-cased TumorStage, or 'N/A' when the value is missing.
TNM_T = [
    str(stage).lower().split('pt')[1] if str(stage) != 'nan' else 'N/A'
    for stage in export_df['TumorStage']
]

In [None]:
# N category per cell: the text following the first 'N' of NodeStatus_detailed, or 'N/A' when the value is missing.
TNM_N = [
    str(status).split('N')[1] if str(status) != 'nan' else 'N/A'
    for status in export_df['NodeStatus_detailed']
]

In [None]:
def _metastasis_category(status):
    """Return the M category found in one MetastasisStatus value.

    The category is the character after the first 'M', plus the next
    character when that one is a letter (e.g. '1a'). Missing values (whose
    string form is 'nan') give 'N/A'. A value without any 'M' raises
    ValueError, as before.
    """
    text = str(status)
    if text == 'nan':
        return 'N/A'
    m_pos = text.index('M')
    # Slices instead of single-index lookups: a string that ends right after
    # the category no longer raises IndexError.
    if text[m_pos + 2:m_pos + 3].isalpha():
        return text[m_pos + 1:m_pos + 3]
    return text[m_pos + 1:m_pos + 2] or 'N/A'


# Built without rebinding the builtin `id` at notebook scope.
TNM_M = [_metastasis_category(status) for status in export_df['MetastasisStatus']]

In [None]:
# Copy clinical columns under shorter names (new name -> source column).
renamed_columns = {
    'patient': 'PID',
    'gender': 'Sex',
    'age': 'Age',
    'tissue': 'HistologicTypeSimple',
    'site': 'TissueSite_detailed',
}
for new_name, source_name in renamed_columns.items():
    adata_gd.obs[new_name] = export_df[source_name]

# Attach the parsed TNM categories (one list entry per cell).
for tnm_name, tnm_values in (('TNM_T', TNM_T), ('TNM_N', TNM_N), ('TNM_M', TNM_M)):
    adata_gd.obs[tnm_name] = tnm_values

In [None]:
# Display the distinct values of the 'site' column.
set(adata_gd.obs['site'])

In [None]:
def convert_tnm_to_stage(t, n, m):
    """
    Convert TNM components to an overall stage label.

    Only the FIRST character of each argument is inspected, so the arguments
    are the bare category strings without the 'T'/'N'/'M' letter.

    Parameters:
    t (str): T category, e.g. '1', '3', '4a' or 'N/A'.
    n (str): N category, e.g. '0', '1b' or 'N/A'.
    m (str): M category, e.g. '0', '1', '1a' or 'N/A'.

    Returns:
    str: 'IV' when the M category starts with '1'; otherwise 'N/A' when the
    T or N category does not start with a digit (this includes empty
    strings); otherwise '0', 'I', 'II', 'III' or 'IV' from the table below.
    """
    # str(...)[:1] tolerates empty strings and non-string values such as NaN.
    t, n, m = (str(part)[:1].lower() for part in (t, n, m))

    # Metastasis overrides every other category.
    if m == '1':
        return 'IV'
    if not (t.isnumeric() and n.isnumeric()):
        return 'N/A'

    stage_by_tn = {
        ('0', '0'): '0',
        ('1', '0'): 'I',
        ('2', '0'): 'II',
        ('1', '1'): 'II',
        ('2', '1'): 'III',
        ('3', '0'): 'III',
        ('3', '1'): 'III',
    }
    # NOTE(review): any numeric T/N pair not listed above (e.g. T4 N0 with no
    # metastasis) falls back to 'IV'. Confirm this simplified table matches
    # the staging system intended for this dataset.
    return stage_by_tn.get((t, n), 'IV')

In [None]:
# Build the whole column in one assignment. The previous per-row write
# (obs['stage'][i] = ...) was a chained assignment, which pandas does not
# guarantee to write through, and it looked rows up by integer position on a
# label-indexed Series.
adata_gd.obs['stage'] = [
    convert_tnm_to_stage(t, n, m)
    for t, n, m in zip(adata_gd.obs['TNM_T'], adata_gd.obs['TNM_N'], adata_gd.obs['TNM_M'])
]

In [None]:
# Spell out the single-letter sex codes.
gender_labels = {'F': 'Female', 'M': 'Male'}
adata_gd.obs['gender'] = adata_gd.obs['gender'].replace(gender_labels)

In [None]:
# One t-SNE panel per clinical annotation.
clinical_columns = ['patient', 'tissue', 'gender', 'age', 'site', 'TNM_T', 'TNM_N', 'TNM_M', 'stage']
sc.pl.tsne(adata_gd, color=clinical_columns)

In [None]:
# For every patient, give the 'Normal colon' cells the highest known T category
# recorded for that patient.
#
# Fixes relative to the previous version of this cell:
#   * `else if` is not valid Python (SyntaxError); the two branches are merged.
#   * the membership tests used the enumerate counter instead of the set of
#     T categories.
#   * values are written with .loc instead of chained indexing.
#   * both 'nan' and 'N/A' (the marker TNM_T uses for missing values) are
#     treated as missing, so a missing marker can never win the max().
missing_markers = {'nan', 'N/A'}
tnm_t_by_patient = adata_gd.obs.groupby('patient', observed=True)['TNM_T'].apply(set)
is_normal_colon = np.array(adata_gd.obs['tissue'] == 'Normal colon')

for pid, tnmt in tnm_t_by_patient.items():
    known_categories = {str(value) for value in tnmt} - missing_markers
    if not known_categories:
        # Nothing to propagate for this patient.
        continue
    target_rows = np.array(adata_gd.obs['patient'] == pid) & is_normal_colon
    adata_gd.obs.loc[target_rows, 'TNM_T'] = max(known_categories)

In [None]:
# Export the clinical annotations; the obs index is written as the first CSV column.
exported_columns = ['patient', 'tissue', 'gender', 'age', 'site', 'TNM_T', 'TNM_N', 'TNM_M', 'stage']
adata_gd.obs[exported_columns].to_csv('GSE178341_gd.csv')

In [None]:
# Overwrite the saved subset with the annotated version.
adata_gd.write(filename='GSE178341_gd.h5ad')

In [None]:
# Reload the exported annotations; no index column is set, so the cell IDs stay an ordinary column.
gd_info = pd.read_csv(filepath_or_buffer='GSE178341_gd.csv')

In [None]:
# Make the gene names of adata unique (modifies adata in place); they are used as R row names below.
adata.var_names_make_unique()

In [None]:
# Inputs for the R cell: cell IDs, gene names, and the matching matrix transposed to genes x cells.
colnames = gd_info['cellID']
rownames = adata.var_names
counts = adata[colnames, :].X.T

In [None]:
%%R -i colnames -i rownames -i counts #-i gd_info

# Label the imported matrix: columns get the cell IDs, rows get the gene names.
colnames(counts) = colnames
rownames(counts) = rownames

# Build the Seurat object with no filtering (min.cells = 0, min.features = 0).
srat <- CreateSeuratObject(counts = counts, project = "GSE178341", min.cells = 0, min.features = 0, assay = "RNA")

In [None]:
%%R -i gd_info
# gd_info is imported here because no earlier cell passes it into R
# (the `-i gd_info` on the previous %%R cell is commented out).
# AddMetaData matches a data.frame to cells by row name, so the rows are keyed
# by cell ID first.
rownames(gd_info) <- as.character(gd_info$cellID)
srat = AddMetaData(srat, metadata = gd_info)
saveRDS(srat, 'GSE178341_gd.rds')

In [None]:
# Reload the full dataset and the two selected subsets from disk.
adata, adata_17, adata_gd = (
    sc.read_h5ad(path)
    for path in (
        'GSE178341_processed.h5ad',
        'GSE178341_T17_refined.h5ad',
        'GSE178341_gd.h5ad',
    )
)

In [None]:
# '1' for cells that are also in adata_17, '0' otherwise. The column is built in
# one assignment: the previous obs[col][mask] = '1' form was a chained
# assignment, which pandas does not guarantee to write through.
adata.obs['IL17 secreting selected'] = np.where(
    adata.obs_names.isin(adata_17.obs_names), '1', '0'
)

In [None]:
# '1' for cells that are also in adata_gd, '0' otherwise. The column is built in
# one assignment: the previous obs[col][mask] = '1' form was a chained
# assignment, which pandas does not guarantee to write through.
adata.obs['gdT selected'] = np.where(
    adata.obs_names.isin(adata_gd.obs_names), '1', '0'
)

In [None]:
from matplotlib.colors import LinearSegmentedColormap
# Two-stop colour ramp: light grey at 0, red at 1 (RGB given on a 0-255 scale).
values = [0, 1]
colors = [(227, 227, 227), (255, 42, 18)]
norm = plt.Normalize(min(values), max(values))

color_stops = []
for value, color in zip(values, colors):
    # Position in [0, 1] paired with the RGB triple rescaled to [0, 1].
    color_stops.append((norm(value), tuple(np.array(color) / 255)))
my_cmap = LinearSegmentedColormap.from_list('', color_stops)

In [None]:
plt.close()
plt.rcParams.update({'axes.linewidth': 2, 'figure.figsize': [8, 8]})

# Gene panels use my_cmap; the '0'/'1' selection flag uses the two-colour palette.
fig = sc.pl.tsne(
    adata,
    color=['RORC', 'IL17A', 'IL17F', 'IL17 secreting selected'],
    cmap=my_cmap,
    palette=['#E3E3E3', '#FF2A12'],
    size=1,
    ncols=2,
    legend_fontsize='large',
    return_fig=True,
)

# Enlarge and embolden the axis labels and title of every axes in the figure.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('17_selected.png', bbox_inches='tight', dpi=300)

In [None]:
plt.close()
plt.rcParams.update({'axes.linewidth': 2, 'figure.figsize': [8, 8]})

# Gene panels use my_cmap; the '0'/'1' selection flag uses the two-colour palette.
fig = sc.pl.tsne(
    adata,
    color=['CD3E', 'CD3D', 'CD3G', 'CD247', 'TRDC', 'gdT selected'],
    cmap=my_cmap,
    palette=['#E3E3E3', '#FF2A12'],
    size=1,
    ncols=2,
    legend_fontsize='large',
    return_fig=True,
)

# Enlarge and embolden the axis labels and title of every axes in the figure.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('gd_selected.png', bbox_inches='tight', dpi=300)