In [None]:
#import the library
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler
import seaborn as sns
import rpy2.rinterface_lib.callbacks
import logging
import tensorflow as tf
import os

from rpy2.robjects import pandas2ri
import anndata2ri

import importlib
import warnings
warnings.filterwarnings("ignore")
import pickle as pkl
from matplotlib.colors import LinearSegmentedColormap

In [None]:
def print_full(x):
    """Print ``x`` with all of its rows shown.

    Parameters
    ----------
    x : sized, printable object (typically a pandas DataFrame or Series)
        ``len(x)`` is used as the temporary ``display.max_rows`` value.

    Notes
    -----
    ``pd.option_context`` restores the *previous* ``display.max_rows`` value
    on exit, even if ``print`` raises, instead of resetting the option to the
    pandas default.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [None]:
# Categorical palette (25 hex colours) intended to be colour-blind friendly.
my_palette = [
    '#0351A8', '#8CB0E0', '#D56D11', '#FFBB78', '#234E08',
    '#53CB8B', '#D30083', '#CB788D', '#4E195A', '#C58CCF',
    '#AA290F', '#B03FD1', '#E8BCCF', '#64605F', '#B2AD9A',
    '#D2D30B', '#D1BD4F', '#06DCF2', '#9EDAE5', '#517219',
    '#5B43CF', '#D92F24', '#FFD900', '#002F33', '#B8A3A3',
]

In [None]:
# Shortcut: load the annotated object saved by adata.write('integrated.h5ad') further down.
# NOTE(review): on a fresh run this file must already exist from an earlier session;
# `adata` is also rebuilt from the Seurat object in a later cell, which replaces this one.
adata = sc.read_h5ad('integrated.h5ad')

In [None]:
# Two-stop continuous colormap for expression plots: light grey at 0, red at 1.
# LinearSegmentedColormap comes from the import cell at the top of the notebook.
# The RGB stops are kept in `cmap_rgb_stops` so that the `colors` module imported
# from matplotlib at the top is not overwritten by a plain list.
values = [0, 1]
cmap_rgb_stops = [(227, 227, 227), (255, 42, 18)]
norm = plt.Normalize(min(values), max(values))
my_cmap = LinearSegmentedColormap.from_list(
    '',
    # from_list expects (position in [0, 1], RGB in [0, 1]) pairs
    [(norm(value), tuple(np.array(rgb) / 255)) for value, rgb in zip(values, cmap_rgb_stops)],
)

In [None]:
# Ignore R warning messages: only records at ERROR level are emitted by the rpy2 callback logger.
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# Activate the anndata2ri converter as well
# (presumably needed for the matrices exported with `%%R -o` below - TODO confirm).
anndata2ri.activate()
# Load the rpy2 IPython extension; the %R / %%R magics in the following cells rely on it.
%load_ext rpy2.ipython

In [None]:
# Global plotting defaults and scanpy logging level.
rcParams['figure.figsize'] = (8, 8)  # default figure size (same rcParams object as plt.rcParams)
sc.settings.verbosity = 3
# Print the versions of the loaded packages for reproducibility.
sc.logging.print_versions()

In [None]:
%%R
# Machine-specific R package library; adjust the path for your own environment.
# A single .libPaths() call is enough: it prepends the directory to the search path.
.libPaths('C:\\Users\\16220\\AppData\\Local\\R\\win-library\\4.3')
library(Seurat)

In [None]:
%%R
# Machine-specific R package library; adjust the path for your own environment.
.libPaths('C:\\Users\\16220\\AppData\\Local\\R\\win-library\\4.3')

# Attach every R package used in the notebook, each exactly once
# (attach order kept as before, since it determines function masking).
library(Seurat)
library(scran)
library(RColorBrewer)
library(slingshot)
library(monocle)
library(gam)
library(ggplot2)
library(plyr)
library(MAST)
library(clusterExperiment)
library(monocle3)
library(SeuratWrappers)
library(magrittr)  # provides %>%; must be attached in every new R session
library(dplyr)     # also provides %>%

In [None]:
%%R 
# Load the saved integrated object from disk (absolute, machine-specific path).
srat_combined = readRDS('F:/CRC/GSE161277_Adenoma/integrated_data.rds')
# Variable features of the object. Note: HVG is reassigned in a later %%R cell
# (to the row names of the integrated assay) before it is exported to Python.
HVG = VariableFeatures(srat_combined)

In [None]:
# Scaled-data matrix of the 'integrated' assay; used below for the total variance
# and later stored (transposed) in adata.uns['scaled'].
%R mat <- srat_combined@assays$integrated@scale.data

In [None]:
%%R
# Fraction of variance explained by each principal component.
# (If the object has no "pca" reduction yet, run RunPCA(srat_combined) first.)
pca <- srat_combined[["pca"]]

# Eigenvalues are the squared standard deviations of the components.
eigValues <- pca@stdev^2

# Total variance: sum of the per-row variances of the scaled matrix `mat`.
total_variance <- sum(matrixStats::rowVars(mat))
varExplained <- eigValues / total_variance

# Feature loadings of the PCA reduction.
PCs <- Loadings(srat_combined, reduction = "pca")

In [None]:
%%R -o logcounts -o counts
# Join the layers of the RNA assay, then export its 'data' and 'counts'
# layers to Python as `logcounts` and `counts`.
merged <- JoinLayers(srat_combined@assays$RNA)
logcounts <- merged@layers$data
counts <- merged@layers$counts

In [None]:
%%R -o features -o HVG -o varExplained -o PCs -o mat -o obs_names -o PC_embeddings -o sample_origin
# Row / column names of the joined RNA assay (become var_names / obs_names in Python).
obs_names <- colnames(merged)
features <- rownames(merged)
# Features of the integrated assay; this replaces the earlier VariableFeatures() result.
HVG <- rownames(srat_combined@assays$integrated)
# Per-cell sample label and PCA coordinates.
sample_origin <- srat_combined@meta.data$orig.ident
PC_embeddings <- srat_combined@reductions$pca@cell.embeddings

In [None]:
import anndata

# Assemble the AnnData object from the objects exported from R.
# The R matrices have features in rows (their row names become var_names),
# so each matrix is transposed to cells x genes.
adata = anndata.AnnData(X=logcounts.T)
adata.obs_names, adata.var_names = obs_names, features

# Raw and log-normalised expression as named layers.
for layer_name, r_matrix in (('counts', counts), ('logcounts', logcounts)):
    adata.layers[layer_name] = r_matrix.T

# Embedding and per-cell sample label computed on the R side.
adata.obsm['X_pca'] = PC_embeddings
adata.obs['sample_origin'] = sample_origin

# Unstructured extras: scaled matrix and the integrated-assay feature names.
adata.uns['scaled'] = mat.T
adata.uns['residuals_genes'] = list(HVG)

In [None]:
# Per-cell QC metrics computed from the raw counts layer.
# np.asarray(...).ravel() yields a flat 1-D vector whether the layer is a scipy
# sparse matrix (whose row sum is a 2-D (n, 1) matrix) or a dense array, so the
# obs columns always hold plain 1-D numeric data.
raw_counts = adata.layers['counts']
adata.obs['n_counts'] = np.asarray(raw_counts.sum(axis=1)).ravel()
adata.obs['n_genes'] = np.asarray((raw_counts > 0).sum(axis=1)).ravel()
# Flag the genes that are in the HVG list exported from R.
adata.var['highly_variable'] = adata.var_names.isin(HVG)
# Variance ratios computed in R, stored under the key scanpy uses for PCA results.
adata.uns['pca'] = {'variance_ratio': varExplained}

In [None]:
# Fraction of each cell's counts that comes from genes whose name starts with 'MT-'.
mt_gene_mask = adata.var_names.str.startswith('MT-')
# Flatten explicitly: the previous `np.array(...ravel())[0]` only produced a per-cell
# vector for sparse layers and collapsed to a single scalar for dense arrays.
mt_counts = np.asarray(adata.layers['counts'][:, mt_gene_mask].sum(axis=1)).ravel()
adata.obs['mt_frac'] = mt_counts / adata.obs['n_counts']

In [None]:
# Fraction of each cell's counts that comes from genes whose name starts with 'RPL' or 'RPS'.
ribo_gene_mask = adata.var_names.str.startswith('RPL') | adata.var_names.str.startswith('RPS')
# Flatten explicitly: the previous `np.array(...ravel())[0]` only produced a per-cell
# vector for sparse layers and collapsed to a single scalar for dense arrays.
ribo_counts = np.asarray(adata.layers['counts'][:, ribo_gene_mask].sum(axis=1)).ravel()
adata.obs['ribo_frac'] = ribo_counts / adata.obs['n_counts']

In [None]:
# Build the neighbourhood graph using 50 principal components
# (presumably the Seurat PCA stored in adata.obsm['X_pca'] above - TODO confirm).
sc.pp.neighbors(adata, n_pcs = 50)

In [None]:
# Leiden clustering at resolution 0.8; labels are stored in adata.obs['leiden'].
sc.tl.leiden(adata, resolution = 0.8, key_added= 'leiden')

In [None]:
# PCA scatter coloured by cluster label and by CD4 expression.
sc.pl.pca(adata, color=['leiden','CD4'], legend_loc = 'on data')

In [None]:
%matplotlib inline
# Compute the t-SNE embedding used by all sc.pl.tsne plots below.
sc.tl.tsne(adata)

In [None]:
plt.rcParams['axes.linewidth'] = 2
# t-SNE coloured by cluster and by a panel of marker genes, using the grey-to-red colormap.
sc.pl.tsne(adata, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'],legend_loc = 'on data',cmap = my_cmap)

In [None]:
# Split each sample_origin label on '_' once: field 1 is used as the patient ID
# and field 2 as the tissue type.
origin_fields = [origin.split('_') for origin in adata.obs['sample_origin']]
patient = [fields[1] for fields in origin_fields]
tissue = [fields[2] for fields in origin_fields]

In [None]:
# Clinical metadata table. The file is read without a header, its first column is used
# as the index, and the table is transposed so those index labels ('Patient ID',
# 'Gender', ...) become the column names used in the lookups below.
bioinfo = pd.read_csv('Bioinfo.csv', header = None, index_col = 0).T
# Display the table as the cell output.
bioinfo

In [None]:
gender_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Gender']))
gender = [gender_dict[i] for i in patient]

In [None]:
age_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Age']))
age = [age_dict[i] for i in patient]

In [None]:
site_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Site of tumor']))
site = [site_dict[i] for i in patient]

In [None]:
TNM_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM']))
TNM = [TNM_dict[i] for i in patient]

In [None]:
stage_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Stage']))
stage = [stage_dict[i] for i in patient]

In [None]:
T_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: T']))
T = [T_dict[i] for i in patient]

In [None]:
N_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: N']))
N = [N_dict[i] for i in patient]

In [None]:
M_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: M']))
M = [M_dict[i] for i in patient]

In [None]:
# Attach the per-cell clinical annotations to adata.obs (columns added in the listed order).
clinical_columns = {
    'patient': patient,
    'tissue': tissue,
    'gender': gender,
    'age': age,
    'site': site,
    'TNM': TNM,
    'TNM_T': T,
    'TNM_N': N,
    'TNM_M': M,
    'stage': stage,
}
for column_name, per_cell_values in clinical_columns.items():
    adata.obs[column_name] = per_cell_values

In [None]:
sc.pl.tsne(adata,color = ['patient','tissue','gender','age','site','TNM','stage','TNM_T','TNM_N','TNM_M'])

In [None]:
adata.obs['site']= adata.obs['site'].astype('str')
adata.obs['tissue']= adata.obs['tissue'].astype('str')

In [None]:
adata.obs['TNM_T'] = adata.obs['TNM_T'].astype(str)
adata.obs['TNM_N'] = adata.obs['TNM_N'].astype(str)
adata.obs['TNM_M'] = adata.obs['TNM_M'].astype(str)
adata.obs['TNM'] = adata.obs['TNM'].astype(str)
adata.obs['stage'] = adata.obs['stage'].astype(str)
adata.obs['stage'][adata.obs['tissue'].isin(['normal'])] = 'N/A'
adata.obs['TNM'][adata.obs['tissue'].isin(['normal'])] = 'N/A'
adata.obs['TNM_T'][adata.obs['tissue'].isin(['normal'])] = 'N/A'
adata.obs['TNM_N'][adata.obs['tissue'].isin(['normal'])] = 'N/A'
adata.obs['TNM_M'][adata.obs['tissue'].isin(['normal'])] = 'N/A'

In [None]:
# Export the per-cell clinical annotations to CSV (absolute, machine-specific path).
# NOTE(review): 'TNM' is not in this column list, although the gd export further
# below includes it - confirm whether that is intentional.
adata.obs[['patient','tissue','gender','age','site','stage','TNM_T','TNM_N','TNM_M']].to_csv('F:/CRC/AA_Done/GSE161277.csv')

In [None]:
# Save the annotated object; this is the file loaded by sc.read_h5ad('integrated.h5ad') near the top.
adata.write('integrated.h5ad')

In [None]:
# Keep only the Leiden clusters selected for closer inspection.
# .copy() creates an independent AnnData rather than a view of `adata`: the
# following cells modify the subset in place (HVG flags, PCA, neighbours, clustering).
potential_gd = adata[adata.obs['leiden'].isin(['0','5','25','11','1','7']),:].copy()

In [None]:
sc.pl.tsne(potential_gd, color=['leiden','CD3E','CD3G','CD3D','TRDC','ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'],legend_loc = 'on data',cmap = my_cmap)

In [None]:
sc.pp.highly_variable_genes(potential_gd, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_gd, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_gd, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_gd, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_gd)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_gd, color=['leiden','CD3E','CD3G','CD3D','TRDC',
                                'ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'],
           legend_loc = 'on data',cmap = my_cmap)

In [None]:
# Second narrowing step: keep only the selected clusters of the re-clustered subset.
# .copy() creates an independent AnnData rather than a view, because the following
# cells modify the subset in place again (HVG flags, PCA, neighbours, clustering).
potential_gd = potential_gd[potential_gd.obs['leiden'].isin(['3','11','23','9']),:].copy()

In [None]:
sc.pp.highly_variable_genes(potential_gd, flavor='seurat', n_top_genes=3000)
sc.pp.pca(potential_gd, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
sc.pp.neighbors(potential_gd, n_neighbors = 15, n_pcs = 50)
sc.tl.leiden(potential_gd, resolution = 1, key_added= 'leiden')                                                 

In [None]:
sc.tl.tsne(potential_gd)

In [None]:
%matplotlib inline
sc.pl.tsne(potential_gd, color=['leiden','CD3E','CD3G','CD3D','TRDC',
                                'ICOS','CD4','CD8A','CD8B','FOXP3','IFNG','RORC','IL17A','IL17F'],
           legend_loc = 'on data',cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden', ['1']), resolution = 0.5, key_added= 'leiden1')
sc.pl.tsne(potential_gd, color=['leiden1','CD3E','CD3G','CD3D','TRDC','RORC'], size = 50, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden1', ['4']), resolution = 0.5, key_added= 'leiden2')
sc.pl.tsne(potential_gd, color=['leiden2','CD3E','CD3G','CD3D','TRDC','RORC'], size = 40, legend_loc = 'on data', cmap = my_cmap)

In [None]:
plt.rcParams['figure.figsize'] = [8,8]
sc.tl.leiden(potential_gd, restrict_to = ('leiden2', ['3']), resolution = 0.6, key_added= 'leiden3')
sc.pl.tsne(potential_gd, color=['leiden3','CD3E','CD3G','CD3D','TRDC','RORC'], size = 40, legend_loc = 'on data', cmap = my_cmap)

In [None]:
gamma_genesmask = [gene.startswith("TRG") for gene in potential_gd.var_names]
gamma_genes = potential_gd.var_names[gamma_genesmask]
delta_genesmask = [gene.startswith("TRD") for gene in potential_gd.var_names]
delta_genes = potential_gd.var_names[delta_genesmask]

In [None]:
TCR_exp_set ={
"gamma": gamma_genes, "delta": delta_genes, "CD3S": ['CD3E','CD3D','CD3G']
}
sc.pl.dotplot(potential_gd,TCR_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
# Select the leiden2 clusters taken to be gamma-delta T cells.
# NOTE(review): leiden2 was produced by sub-clustering leiden cluster '1' and then
# leiden1 cluster '4' (restrict_to calls above). Confirm that the labels '1', '5,1'
# and '6,1' actually exist in obs['leiden2'] - isin() silently ignores labels that
# are not present, so a stale label list would just drop those cells.
gd = potential_gd[potential_gd.obs['leiden2'].isin(['1','3','5,1','6,1']),:]

In [None]:
# Same TCR / CD3 dot plot, restricted to the selected cells.
sc.pl.dotplot(gd,TCR_exp_set,groupby = 'leiden2' , vmax = 1, swap_axes = False, dot_min =0, dot_max =1,standard_scale = 'var')

In [None]:
# Save the selected subset (absolute, machine-specific path).
gd.write('F:/CRC/GSE161277_Adenoma/GSE161277_Adenoma_gd.h5ad')

In [None]:
# Genes whose name starts with "IL17", excluding those starting with "IL17R".
IL17_genesmask = (
    potential_gd.var_names.str.startswith("IL17")
    & ~potential_gd.var_names.str.startswith("IL17R")
)
IL17_genes = potential_gd.var_names[IL17_genesmask]

# Gene groups for the dot plot: the IL17 genes and the transcription factor RORC.
IL17_exp_set = {'cytokine': IL17_genes, 'TF': 'RORC'}

# Per-cluster (leiden2) dot plot; each gene is scaled to [0, 1] across clusters.
sc.pl.dotplot(
    potential_gd,
    IL17_exp_set,
    groupby='leiden2',
    vmax=1,
    swap_axes=False,
    dot_min=0.1,
    dot_max=1,
    standard_scale='var',
)

In [None]:
# Select the leiden2 clusters chosen as IL17-expressing (labels picked by hand
# from the dot plot above; isin() ignores labels that are not present).
IL17pos = potential_gd[potential_gd.obs['leiden2'].isin(['0','1,2','4,0','4,1','4,2','8','9']),:]

In [None]:
# Save the IL17 subset to the working directory.
IL17pos.write('GSE161277_Adenoma_T17.h5ad')

In [None]:
# Reload the subset that was just written.
adata_T17 = sc.read_h5ad('GSE161277_Adenoma_T17.h5ad')

In [None]:
# Export the clinical annotations of the IL17 subset (absolute, machine-specific path).
adata_T17.obs[['patient','tissue','gender','age','site','stage','TNM_T','TNM_N','TNM_M']].to_csv('F:/CRC/AA_Done/GSE161277_T17.csv')

In [None]:
# RORC and IL17A expression on the t-SNE of the gamma-delta subset `gd`.
sc.pl.tsne(gd,color = ['RORC','IL17A'],cmap = my_cmap)

In [None]:
adata_gd = sc.read_h5ad('GSE161277_Adenoma_gd.h5ad')

In [None]:
patient = [i.split('_')[1] for i in adata_gd.obs['sample_origin']]

In [None]:
bioinfo = pd.read_csv('Bioinfo.csv', header = None, index_col = 0).T
bioinfo

In [None]:
gender_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Gender']))
gender = [gender_dict[i] for i in patient]

In [None]:
age_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Age']))
age = [age_dict[i] for i in patient]

In [None]:
site_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Site of tumor']))
site = [site_dict[i] for i in patient]

In [None]:
TNM_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM']))
TNM = [TNM_dict[i] for i in patient]

In [None]:
stage_dict = dict(zip(bioinfo['Patient ID'], bioinfo['Stage']))
stage = [stage_dict[i] for i in patient]

In [None]:
T_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: T']))
T = [T_dict[i] for i in patient]

In [None]:
N_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: N']))
N = [N_dict[i] for i in patient]

In [None]:
M_dict = dict(zip(bioinfo['Patient ID'], bioinfo['pTNM: M']))
M = [M_dict[i] for i in patient]

In [None]:
adata_gd.obs['patient'] = patient
adata_gd.obs['tissue'] = tissue
adata_gd.obs['gender'] = gender
adata_gd.obs['age'] = age
adata_gd.obs['site'] = site
adata_gd.obs['TNM'] = TNM
adata_gd.obs['TNM_T'] = T
adata_gd.obs['TNM_N'] = N
adata_gd.obs['TNM_M'] = M
adata_gd.obs['stage'] = stage

In [None]:
sc.pl.tsne(adata_gd,color = ['patient','tissue','gender','age','site','TNM','stage','TNM_T','TNM_N','TNM_M'])

In [None]:
adata_gd.obs['site']= adata_gd.obs['site'].astype('str')
adata_gd.obs['tissue']= adata_gd.obs['tissue'].astype('str')

In [None]:
adata_gd.obs['TNM_T'] = adata_gd.obs['TNM_T'].astype(str)
adata_gd.obs['TNM_N'] = adata_gd.obs['TNM_N'].astype(str)
adata_gd.obs['TNM_M'] = adata_gd.obs['TNM_M'].astype(str)
adata_gd.obs['TNM'] = adata_gd.obs['TNM'].astype(str)
adata_gd.obs['stage'] = adata_gd.obs['stage'].astype(str)
adata_gd.obs['stage'][adata_gd.obs['tissue'].isin(['normal'])] = 'N/A'
adata_gd.obs['TNM'][adata_gd.obs['tissue'].isin(['normal'])] = 'N/A'
adata_gd.obs['TNM_T'][adata_gd.obs['tissue'].isin(['normal'])] = 'N/A'
adata_gd.obs['TNM_N'][adata_gd.obs['tissue'].isin(['normal'])] = 'N/A'
adata_gd.obs['TNM_M'][adata_gd.obs['tissue'].isin(['normal'])] = 'N/A'

In [None]:
# Export the clinical annotations of the gamma-delta subset (absolute, machine-specific path).
adata_gd.obs[['patient','tissue','gender','age','site','TNM','stage','TNM_T','TNM_N','TNM_M']].to_csv('F:/CRC/AA_Done/GSE161277_gd.csv')

In [None]:
# Save the annotated gamma-delta subset to the working directory.
adata_gd.write('GSE161277_gd.h5ad')

In [None]:
# Two-stop continuous colormap for expression plots: light grey at 0, red at 1.
# LinearSegmentedColormap comes from the import cell at the top of the notebook.
# The RGB stops are kept in `cmap_rgb_stops` so that the `colors` module imported
# from matplotlib at the top is not overwritten by a plain list.
values = [0, 1]
cmap_rgb_stops = [(227, 227, 227), (255, 42, 18)]
norm = plt.Normalize(min(values), max(values))
my_cmap = LinearSegmentedColormap.from_list(
    '',
    # from_list expects (position in [0, 1], RGB in [0, 1]) pairs
    [(norm(value), tuple(np.array(rgb) / 255)) for value, rgb in zip(values, cmap_rgb_stops)],
)

In [None]:
#adata = sc.read_h5ad('integrated.h5ad')
#adata_17 = sc.read_h5ad('GSE161277_Adenoma_T17.h5ad')
adata_gd = sc.read_h5ad('GSE161277_gd.h5ad')

In [None]:
adata.obs['IL17 secreting selected'] = '0'
adata.obs['IL17 secreting selected'][adata.obs_names.isin(adata_17.obs_names)] = '1'

In [None]:
adata.obs['gdT selected'] = '0'
adata.obs['gdT selected'][adata.obs_names.isin(adata_gd.obs_names)] = '1'

In [None]:
# Figure: IL17-related genes and the IL17 selection flag on the full t-SNE.
plt.close()
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = [8,8]
fig = sc.pl.tsne(
    adata,
    color=['RORC', 'IL17A', 'IL17F', 'IL17 secreting selected'],
    size=20,
    ncols=2,
    palette=['#E3E3E3', '#FF2A12'],
    cmap=my_cmap,
    return_fig=True,
    legend_fontsize='large',
)

# Enlarge and embolden the axis labels and titles of every axes in the figure.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('17_selected.png', dpi=300, bbox_inches='tight')

In [None]:
# Figure: CD3 / TRDC marker genes and the gamma-delta selection flag on the full t-SNE.
plt.close()
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = [8,8]
fig = sc.pl.tsne(
    adata,
    color=['CD3E', 'CD3D', 'CD3G', 'CD247', 'TRDC', 'gdT selected'],
    size=20,
    ncols=2,
    palette=['#E3E3E3', '#FF2A12'],
    cmap=my_cmap,
    return_fig=True,
    legend_fontsize='large',
)

# Enlarge and embolden the axis labels and titles of every axes in the figure.
ax = fig.get_axes()
for panel in ax:
    for axis_label in (panel.xaxis.label, panel.yaxis.label):
        axis_label.set_fontsize(22)
        axis_label.set_fontweight('bold')
    panel.title.set_fontsize(30)
    panel.title.set_fontweight('bold')

fig.savefig('gd_selected.png', dpi=300, bbox_inches='tight')