In [None]:
%%R 
# Load the integrated object from its RDS file (presumably a Seurat object: the
# accessors used on it in this notebook are Seurat functions -- TODO confirm)
# and record its variable features in `HVG`.
srat_combined = readRDS('GSE245552/COLON/integrated_data.rds')
HVG = VariableFeatures(srat_combined)

In [None]:
# Copy the `scale.data` slot of the `integrated` assay into the R variable `mat`.
%R mat <- srat_combined@assays$integrated@scale.data

In [None]:
%%R
# The "pca" reduction is read from the loaded object as-is; the RunPCA call
# below is commented out, so this cell relies on that reduction already existing.
#srat_combined <- RunPCA(srat_combined)
pca <- srat_combined[["pca"]]

# Get the total variance: sum of the per-row variances of `mat`.
total_variance <- sum(matrixStats::rowVars(mat))

# Squared per-component standard deviations, then each component's share of
# `total_variance`.
eigValues = (pca@stdev)^2  ## EigenValues
varExplained = eigValues / total_variance

# Loadings of the "pca" reduction.
PCs = Loadings(srat_combined, reduction = "pca")

In [None]:
%%R -o logcounts -o counts
# Join the layers of the RNA assay, then pull out its `counts` and `data`
# layers. The `-o` flags export `logcounts` and `counts` to the Python side.
merged = JoinLayers(srat_combined@assays$RNA)
counts = merged@layers$counts
logcounts = merged@layers$data
# Alternative source that is not used: counts slot of the `integrated` assay.
#counts = srat_combined@assays$integrated@counts

In [None]:
%%R -o features -o HVG -o varExplained -o PCs -o mat -o obs_names -o PC_embeddings -o sample_origin
# Row and column names of the joined RNA assay.
features = rownames(merged)
obs_names = colnames(merged)
# NOTE: this replaces the `HVG` value taken from VariableFeatures() when the
# object was loaded; what is exported is the row names of the `integrated` assay.
HVG = rownames(srat_combined@assays$integrated)
# Cell embeddings of the "pca" reduction and the `orig.ident` metadata column.
PC_embeddings = srat_combined@reductions$pca@cell.embeddings
sample_origin = srat_combined@meta.data$orig.ident

In [None]:
import anndata

# Assemble the AnnData object from the transposed matrices exported by the R cells.
adata = anndata.AnnData(X=logcounts.T)
adata.obs_names = obs_names
adata.var_names = features

# Expression layers (same orientation as X).
adata.layers['counts'] = counts.T
adata.layers['logcounts'] = logcounts.T

# Per-observation annotations: PCA embedding and sample of origin.
adata.obsm['X_pca'] = PC_embeddings
adata.obs['sample_origin'] = sample_origin

# Unstructured extras: the transposed `mat` matrix and the exported HVG names.
adata.uns['scaled'] = mat.T
adata.uns['residuals_genes'] = list(HVG)

In [None]:
adata.obs['n_counts'] = adata.layers['counts'].sum(1)
adata.obs['n_genes'] = (adata.layers['counts'] > 0).sum(1)
adata.var['highly_variable'] = adata.var_names.isin(HVG)
adata.uns['pca'] = dict({'variance_ratio': varExplained})

In [None]:
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
adata.obs['mt_frac'] = np.array(adata.layers['counts'][:, mt_gene_mask].sum(1).ravel())[0]/adata.obs['n_counts']

In [None]:
ribo_gene_mask = [gene.startswith('RPL') or gene.startswith('RPS') for gene in adata.var_names]
adata.obs['ribo_frac'] = np.array(adata.layers['counts'][:, ribo_gene_mask].sum(1).ravel())[0]/adata.obs['n_counts']

In [None]:
sc.pp.neighbors(adata, n_pcs = 50)

In [None]:
sc.tl.leiden(adata, resolution = 0.8, key_added= 'leiden')

In [None]:
sc.pl.pca(adata, color=['leiden','CD4'], legend_loc = 'on data')

In [None]:
%matplotlib inline
sc.tl.tsne(adata)
#plt.close()
plt.rcParams['axes.linewidth'] = 2
sc.pl.tsne(adata, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG'],legend_loc = 'on data',cmap = my_cmap)

In [None]:
adata.write('GSE245552/COLON/integrated.h5ad')

In [None]:
adata = sc.read_h5ad('GSE245552/COLON/integrated.h5ad')

In [None]:
bioinfo = pd.read_csv('GSE245552/bioinfo.csv')

In [None]:
adata.obs['patient'] = [i.split('_')[1] for i in adata.obs['sample_origin']]

In [None]:
adata.obs['tissue'] = 'CRC'

In [None]:
# Copy per-patient columns of `bioinfo` onto every cell of that patient.
# Keys are the new `adata.obs` columns, values the source columns in `bioinfo`.
clinical_columns = {
    'gender': 'Gender',
    'age': 'Age',
    'site': 'Location',
    'TNM_T': 'pTNM: T',
    'TNM_N': 'pTNM: N',
    'TNM_M': 'pTNM: M',
    'stage': 'Stage',
}
for obs_column, bioinfo_column in clinical_columns.items():
    per_patient = dict(zip(bioinfo['Patient number'], bioinfo[bioinfo_column]))
    # Plain indexing on purpose: a patient id missing from `bioinfo` raises
    # KeyError instead of silently leaving the annotation empty.
    adata.obs[obs_column] = [per_patient[patient] for patient in adata.obs['patient']]

In [None]:
sc.pl.tsne(adata,color = ['patient','tissue','gender','age','site','TNM_T','TNM_N','TNM_M'])

In [None]:
%matplotlib inline
plt.rcParams['axes.linewidth'] = 2
sc.pl.tsne(adata, color=['leiden','CD3E','CD3G','CD3D','TRDC','RORC','IL17A','CD19'],legend_loc = 'on data',cmap = my_cmap)

In [None]:
def DE_to_df(_adata, rank_key, _padj_thresh=0.05, _logfc_thresh=1):
    """Tabulate significant up- and down-regulated genes per group.

    Reads the structured arrays ``names``, ``logfoldchanges`` and ``pvals_adj``
    from ``_adata.uns[rank_key]`` (one field per group).

    Parameters
    ----------
    _adata : object exposing ``.uns[rank_key]`` as described above.
    rank_key : key of the ranking result inside ``_adata.uns``.
    _padj_thresh : keep genes with adjusted p-value <= this value.
    _logfc_thresh : keep genes with log fold change >= this value ("up") or
        <= its negative ("down").

    Returns
    -------
    (up, down) : two DataFrames with one column per group, in the field order
        of the structured arrays. Each column lists the passing gene names
        sorted by decreasing absolute log fold change, with names starting
        with 'RPL', 'RPS' or 'MT-' removed. Shorter columns are NaN-padded.
    """
    _result = _adata.uns[rank_key]
    _excluded_prefixes = ('RPL', 'RPS', 'MT-')

    def process_genes(i, direction):
        # One tidy frame (gene name, |logFC|) for a single group and direction.
        _lfc = _result['logfoldchanges'][i].astype('double')
        _padj = _result['pvals_adj'][i].astype('double')
        _names = _result['names'][i].astype('str')
        if direction == "up":
            _log2foldmask = _lfc >= _logfc_thresh
        else:
            _log2foldmask = _lfc <= -_logfc_thresh
        _keep = _log2foldmask & (_padj <= _padj_thresh)
        _additional = pd.DataFrame({
            i: _names[_keep],
            'logfoldchanges_' + i: np.abs(_lfc[_keep]),
        })
        _additional = _additional.sort_values(by='logfoldchanges_' + i, ascending=False)
        # Build the drop mask AFTER sorting so that it lines up row-for-row with
        # the frame it filters (a mask taken before the sort would remove the
        # wrong rows).
        _drop = np.array(
            [gene.startswith(_excluded_prefixes) for gene in _additional[i]],
            dtype=bool,
        )
        return _additional.loc[~_drop].reset_index(drop=True)

    _up_columns = []
    _down_columns = []
    # Iterate the field names directly (not via set()) to keep a stable column order.
    for i in _result['pvals_adj'].dtype.names:
        _up_columns.append(process_genes(i, "up")[i])
        _down_columns.append(process_genes(i, "down")[i])

    if not _up_columns:
        # No groups at all: pd.concat would raise on an empty list.
        return pd.DataFrame(), pd.DataFrame()

    _pass_genes_up = pd.concat(_up_columns, axis=1)
    _pass_genes_down = pd.concat(_down_columns, axis=1)
    return _pass_genes_up, _pass_genes_down

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden', key_added='rank',method = 'wilcoxon')

In [None]:
[pass_genes_up, pass_genes_down] = DE_to_df(adata, rank_key='rank', _padj_thresh = 0.05, _logfc_thresh = 5)

In [None]:
pass_genes_up['0'].dropna()

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['8']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 50,cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['1']), resolution = 0.6, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
# Keep every cell of this subset except sub-clusters '1,2' and '1,3'.
# Those comma-separated labels only exist in 'leiden1' (written by the
# restrict_to call); filtering on 'leiden' never matched them, so nothing was excluded.
T17_1 = potential_17[~potential_17.obs['leiden1'].isin(['1,2', '1,3']), :]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['6']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['1']), resolution = 0.5, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden1', ['5']), resolution = 0.4, key_added= 'leiden2')
sc.pl.tsne(potential_17, color=['leiden2','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_2 = potential_17[potential_17.obs['leiden2'].isin(['1,1','1,0','5,2']),:]

In [None]:
potential_17 = adata[adata.obs['leiden'].isin(['1']),:]

In [None]:
del potential_17.uns

In [None]:
sc.pp.highly_variable_genes(potential_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_17)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_17, restrict_to = ('leiden', ['1']), resolution = 0.6, key_added= 'leiden1')
sc.pl.tsne(potential_17, color=['leiden1','RORC','IL17A','IL17F'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
IL17_genesmask = [gene.startswith("IL17") and gene.startswith("IL17R")==0 for gene in potential_17.var_names]
IL17_genes = potential_17.var_names[IL17_genesmask]
IL17_exp_set ={
    'cytokine': IL17_genes, 'TF': 'RORC'
}
sc.pl.dotplot(potential_17,IL17_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0.1, dot_max =1,standard_scale = 'var')

In [None]:
T17_3 = potential_17[potential_17.obs['leiden1'].isin(['1,3']),:]

In [None]:
adata_17 = T17_1.concatenate(T17_2,T17_3, batch_key = 'original_cluster', batch_categories=['8','6','1'],join = 'outer',fill_value=0)

In [None]:
sc.pp.highly_variable_genes(adata_17, flavor='seurat', n_top_genes=3000)
sc.pp.pca(adata_17, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(adata_17, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(adata_17, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(adata_17)

In [None]:
%matplotlib inline
sc.pl.tsne(adata_17, color=['leiden','RORC','IL17A','IL17F'],size = 40,cmap = my_cmap)

In [None]:
added = adata[adata.obs['leiden']=='24',:]

In [None]:
adata_17 = adata_17.concatenate(added,join = 'outer',fill_value=0, index_unique = None)

In [None]:
adata_17.write('GSE245552_T17.h5ad')

In [None]:
potential_gd = adata[adata.obs['leiden'].isin(['11']),:]

In [None]:
sc.pp.highly_variable_genes(potential_gd, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_gd, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_gd, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_gd, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_gd)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_gd, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3'],cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden', ['8']), resolution = 0.8, key_added= 'leiden1')
sc.pl.tsne(potential_gd, color=['leiden1','CD3E','CD3G','CD3D','TRDC'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden1', ['6']), resolution = 0.6, key_added= 'leiden2')
sc.pl.tsne(potential_gd, color=['leiden2','CD3E','CD3G','CD3D','TRDC'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden2', ['10']), resolution = 0.3, key_added= 'leiden3')
sc.pl.tsne(potential_gd, color=['leiden3','CD3E','CD3G','CD3D','TRDC'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden3', ['3']), resolution = 0.5, key_added= 'leiden4')
sc.pl.tsne(potential_gd, color=['leiden4','CD3E','CD3G','CD3D','TRDC'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden3', ['6']), resolution = 0.5, key_added= 'leiden4')
sc.pl.tsne(potential_gd, color=['leiden4','CD3E','CD3G','CD3D','TRDC'], size = 20, legend_loc = 'on data', cmap = my_cmap)

In [None]:
gamma_genesmask = [gene.startswith("TRG") for gene in potential_gd.var_names]
gamma_genes = potential_gd.var_names[gamma_genesmask]
delta_genesmask = [gene.startswith("TRD") for gene in potential_gd.var_names]
delta_genes = potential_gd.var_names[delta_genesmask]

In [None]:
TCR_exp_set ={
"gamma": gamma_genes, "delta": delta_genes, "CD3S": ['CD3E','CD3D','CD3G']
}
sc.pl.dotplot(potential_gd,TCR_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
gd = potential_gd[potential_gd.obs['leiden1'].isin(['0','1','2','3','6','7','8,1','11']),:]

In [None]:
sc.pl.dotplot(gd,TCR_exp_set,groupby = 'leiden1' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
gd.write('GSE245552/COLON/GSE245552_gd.h5ad')

In [None]:
sc.pl.tsne(gd,color = ['RORC','IL17A'],cmap = my_cmap)

In [None]:
adata_gd = sc.read_h5ad('GSE245552/COLON/GSE245552_gd.h5ad')

In [None]:
bioinfo = pd.read_csv('GSE245552/bioinfo.csv')

In [None]:
bioinfo

In [None]:
adata_gd.obs['patient'] = [i.split('_')[1] for i in adata_gd.obs['sample_origin']]

In [None]:
adata_gd.obs['tissue'] = 'CRC'

In [None]:
gender_dict = dict(zip(bioinfo['Patient number'], bioinfo['Gender']))
adata_gd.obs['gender'] = [gender_dict[i] for i in adata_gd.obs['patient']]

In [None]:
age_dict = dict(zip(bioinfo['Patient number'], bioinfo['Age']))
adata_gd.obs['age'] = [age_dict[i] for i in adata_gd.obs['patient']]

In [None]:
site_dict = dict(zip(bioinfo['Patient number'], bioinfo['Location']))
adata_gd.obs['site'] = [site_dict[i] for i in adata_gd.obs['patient']]

In [None]:
T_dict = dict(zip(bioinfo['Patient number'], bioinfo['pTNM: T']))
adata_gd.obs['TNM_T'] = [T_dict[i] for i in adata_gd.obs['patient']]

In [None]:
N_dict = dict(zip(bioinfo['Patient number'], bioinfo['pTNM: N']))
adata_gd.obs['TNM_N'] = [N_dict[i] for i in adata_gd.obs['patient']]

In [None]:
M_dict = dict(zip(bioinfo['Patient number'], bioinfo['pTNM: M']))
adata_gd.obs['TNM_M'] = [M_dict[i] for i in adata_gd.obs['patient']]

In [None]:
stage_dict = dict(zip(bioinfo['Patient number'], bioinfo['Stage']))
adata_gd.obs['stage'] = [stage_dict[i] for i in adata_gd.obs['patient']]

In [None]:
sc.pl.tsne(adata_gd,color = ['patient','tissue','gender','age','site','TNM_T','TNM_N','TNM_M'])

In [None]:
adata_gd.obs[['patient','tissue','gender','age','site','TNM_T','TNM_N','TNM_M','stage']].to_csv('GSE245552_gd.csv')

In [None]:
adata_gd.write('GSE245552/GSE245552_gd.h5ad')

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap: RGB (227, 227, 227) at the low end, RGB (255, 42, 18) at the high end.
values = [0, 1]
colors = [(227, 227, 227), (255, 42, 18)]
norm = plt.Normalize(min(values), max(values))

# (position, rgb) pairs with positions normalised by `norm` and 0-255 channels scaled to 0-1.
_cmap_stops = []
for _value, _color in zip(values, colors):
    _cmap_stops.append((norm(_value), tuple(np.array(_color) / 255)))

my_cmap = LinearSegmentedColormap.from_list('', _cmap_stops)

In [None]:
# Reload the full object and the two selections from the working directory.
# NOTE(review): earlier cells wrote the integrated object to
# 'GSE245552/COLON/integrated.h5ad' and the gd object to
# 'GSE245552/COLON/GSE245552_gd.h5ad' / 'GSE245552/GSE245552_gd.h5ad'; the bare
# file names used here only resolve if those files were copied next to the
# notebook -- confirm which paths are intended.
adata = sc.read_h5ad('integrated.h5ad')
adata_17 = sc.read_h5ad('GSE245552_T17.h5ad')
adata_gd = sc.read_h5ad('GSE245552_gd.h5ad')

In [None]:
# NOTE(review): repeats the read of the same file done in the previous cell.
adata_17 = sc.read_h5ad('GSE245552_T17.h5ad')

In [None]:
# NOTE(review): writes the object back to the file it was just read from; no
# modification is visible in between.
adata_17.write('GSE245552_T17.h5ad')

In [None]:
# Per-cell clinical annotations of the selection as CSV.
adata_17.obs[['patient','tissue','gender','age','site','stage','TNM_T','TNM_N','TNM_M']].to_csv('GSE245552_T17.csv')

In [None]:
adata.obs['IL17 secreting selected'] = '0'
adata.obs['IL17 secreting selected'][adata.obs_names.isin(adata_17.obs_names)] = '1'

In [None]:
adata.obs['gdT selected'] = '0'
adata.obs['gdT selected'][adata.obs_names.isin(adata_gd.obs_names)] = '1'

In [None]:
plt.close()
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = [8, 8]

# Gene panels use `my_cmap`; the '0'/'1' selection flag uses the two-colour palette.
fig = sc.pl.tsne(
    adata,
    color=['RORC', 'IL17A', 'IL17F', 'IL17 secreting selected'],
    size=10,
    ncols=2,
    palette=['#E3E3E3', '#FF2A12'],
    cmap=my_cmap,
    return_fig=True,
    legend_fontsize='large',
    vmax=4,
)

# Same typography on every axes of the figure: bold 22 pt axis labels, bold 30 pt titles.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('17_selected.png', dpi=300, bbox_inches='tight')

In [None]:
plt.close()
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = [8, 8]

# Gene panels use `my_cmap`; the '0'/'1' selection flag uses the two-colour palette.
fig = sc.pl.tsne(
    adata,
    color=['CD3E', 'CD3D', 'CD3G', 'CD247', 'TRDC', 'gdT selected'],
    size=20,
    ncols=2,
    palette=['#E3E3E3', '#FF2A12'],
    cmap=my_cmap,
    return_fig=True,
    legend_fontsize='large',
)

# Same typography on every axes of the figure: bold 22 pt axis labels, bold 30 pt titles.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('gd_selected.png', dpi=300, bbox_inches='tight')