In [None]:
#This file contains the code for sorting the single cell RNA-seq data into LowIP and HighIP
#The data is first normalized and then the genes for constitative proteasome and immunoproteasome are filtered out
#the data is then merged with annotation data, followed by filtering out of tumour and then epithelial cells
#The average immunoproteasome score is calculated. Subsequently, either lowIP and highIP groups are made or the cells are annotated with Low and HighIP epithelial cells. 
#other cells (except epithelial cells) are filtered out and then concatenated with low and highIP groups or low and highIP epithelial cells annotated group. 

In [2]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc
import pandas as pd

In [3]:
def filt(adata,st):
    ## selected dataset:
    gns=['PSMB5',  'PSMB6',  'PSMB7',  'PSMB8',  'PSMB9',  'PSMB10',  'PTPRC']
    
    ## filtering cells with less genes and genes withh less cells
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    rw=adata[:, gns]

    ##raw counts
    if st =='csv':
        y=pd.DataFrame(data=rw.X, index=rw.obs_names, columns=rw.var_names)
    if st== 'sparse':
        y=pd.DataFrame.sparse.from_spmatrix(data=rw.X, index=rw.obs_names, columns=rw.var_names)
    ## Total-count normalize (library-size correct) the data matrix X to 10,000 reads per cell, so that counts become comparable among cells.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata=adata[:, gns]
    ## storing sparse matrix to pandas
    if st =='csv':
        x=pd.DataFrame(data=adata.X, index=adata.obs_names, columns=adata.var_names)
    if st== 'sparse':
        x=pd.DataFrame.sparse.from_spmatrix(data=adata.X, index=adata.obs_names, columns=adata.var_names)
    #return raw and

    return (x,y)

In [4]:
def avg_score(dt1):
    ## for proteasomes
    dt1['Immunoproteasome']= (1/3)*((dt1["PSMB8"]+1)+(dt1["PSMB9"]+1)+(dt1["PSMB10"]+1))
    dt1['Const_proteasome']=(1/3)*((dt1["PSMB5"]+1)+(dt1["PSMB6"]+1)+(dt1["PSMB7"]+1))
    return dt1

In [5]:
colon=sc.read_csv('GSE132465_GEO_processed_CRC_10X_raw_UMI_count_matrix-Copy1.txt.gz',delimiter="\t")
colon1=colon.transpose()

In [6]:
## creating raw and normalized data for plotting
colon1_pd,colon1_raw=filt(colon1, 'csv')

In [7]:
colon1_pd

Unnamed: 0,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC
SMC01-T_AAACCTGCATACGCCG,0.946544,0.838974,1.423970,1.543122,1.745780,0.718422,0.0
SMC01-T_AAACCTGGTCGCATAT,1.376992,1.021651,1.516348,1.215023,1.021651,0.465363,0.0
SMC01-T_AAACCTGTCCCTTGCA,1.313420,0.000000,0.000000,0.858422,0.000000,0.858422,0.0
SMC01-T_AAACGGGAGGGAAACA,1.298895,0.000000,1.298895,0.000000,0.000000,0.000000,0.0
SMC01-T_AAACGGGGTATAGGTA,0.981064,0.811139,1.126265,1.558441,0.981064,0.606306,0.0
...,...,...,...,...,...,...,...
SMC10-N_TCAGCTCGTAGCGTCC,0.000000,0.000000,0.000000,0.000000,0.000000,1.410707,0.0
SMC10-N_TGACTAGCAGACGCAA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
SMC10-N_TGCTACCGTCTCCATC,0.000000,0.000000,1.884089,0.000000,0.000000,0.000000,0.0
SMC10-N_TTTATGCAGTGTCTCA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [8]:
meta=pd.read_csv("GSE132465_GEO_processed_CRC_10X_cell_annotation-Copy1.txt.gz",sep='\t')
meta.head()

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype
0,SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
1,SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
2,SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
3,SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
4,SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2


In [9]:
colon1_pd['Index']=colon1_pd.index
norm=meta.merge(colon1_pd,on='Index')

In [10]:
norm

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC
0,SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.946544,0.838974,1.423970,1.543122,1.745780,0.718422,0.0
1,SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.376992,1.021651,1.516348,1.215023,1.021651,0.465363,0.0
2,SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.313420,0.000000,0.000000,0.858422,0.000000,0.858422,0.0
3,SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.298895,0.000000,1.298895,0.000000,0.000000,0.000000,0.0
4,SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.981064,0.811139,1.126265,1.558441,0.981064,0.606306,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63684,SMC10-N_TCAGCTCGTAGCGTCC,SMC10,Normal,SMC10-N,Mast cells,Mast cells,0.000000,0.000000,0.000000,0.000000,0.000000,1.410707,0.0
63685,SMC10-N_TGACTAGCAGACGCAA,SMC10,Normal,SMC10-N,Mast cells,Mast cells,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
63686,SMC10-N_TGCTACCGTCTCCATC,SMC10,Normal,SMC10-N,Mast cells,Mast cells,0.000000,0.000000,1.884089,0.000000,0.000000,0.000000,0.0
63687,SMC10-N_TTTATGCAGTGTCTCA,SMC10,Normal,SMC10-N,Mast cells,Mast cells,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [11]:
tumor_samples = norm[norm['Class'] == 'Tumor']
tumor_samples

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC
0,SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.946544,0.838974,1.423970,1.543122,1.745780,0.718422,0.000000
1,SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.376992,1.021651,1.516348,1.215023,1.021651,0.465363,0.000000
2,SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.313420,0.000000,0.000000,0.858422,0.000000,0.858422,0.000000
3,SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.298895,0.000000,1.298895,0.000000,0.000000,0.000000,0.000000
4,SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.981064,0.811139,1.126265,1.558441,0.981064,0.606306,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58292,SMC25-T_TTTGGTTCAACGCACC,SMC25,Tumor,SMC25-T,B cells,IgG+ Plasma,0.443246,0.443246,0.443246,0.000000,0.000000,0.749296,0.443246
58293,SMC25-T_TTTGTCAAGCGCTCCA,SMC25,Tumor,SMC25-T,B cells,CD19+CD20+ B,0.000000,0.000000,0.000000,1.791343,1.791343,1.791343,1.791343
63502,SMC06-T_TCTTCGGCAAACAACA,SMC06,Tumor,SMC06-T,Mast cells,Mast cells,0.000000,0.000000,0.000000,1.246724,0.000000,0.000000,0.000000
63503,SMC07-T_TGAGAGGGTTTAGGAA,SMC07,Tumor,SMC07-T,Mast cells,Mast cells,0.000000,2.046507,0.000000,1.474856,1.474856,0.000000,0.000000


In [12]:
Epithelial = tumor_samples[tumor_samples['Cell_type'] == 'Epithelial cells']
Epithelial

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC
0,SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.946544,0.838974,1.423970,1.543122,1.745780,0.718422,0.000000
1,SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.376992,1.021651,1.516348,1.215023,1.021651,0.465363,0.000000
2,SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.313420,0.000000,0.000000,0.858422,0.000000,0.858422,0.000000
3,SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.298895,0.000000,1.298895,0.000000,0.000000,0.000000,0.000000
4,SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.981064,0.811139,1.126265,1.558441,0.981064,0.606306,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17464,SMC25-T_TTGTAGGGTCATATGC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,1.008543,1.361662,1.433402,0.254849,0.457763,1.200609,0.000000
17465,SMC25-T_TTTGCGCAGACACGAC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.445333,0.445333,0.610588,1.176819,1.259751,0.247254,0.247254
17466,SMC25-T_TTTGCGCCATGGAATA,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.775991,1.014911,0.775991,0.000000,1.014911,0.775991,0.000000
17467,SMC25-T_TTTGGTTGTAGGGTAC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.000000,0.673635,0.673635,1.578033,0.000000,0.000000,0.000000


In [19]:
Epithelial=avg_score(Epithelial)
Epithelial.set_index('Index', inplace=True)
Epithelial

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt1['Immunoproteasome']= (1/3)*((dt1["PSMB8"]+1)+(dt1["PSMB9"]+1)+(dt1["PSMB10"]+1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt1['Const_proteasome']=(1/3)*((dt1["PSMB5"]+1)+(dt1["PSMB6"]+1)+(dt1["PSMB7"]+1))


Unnamed: 0_level_0,Patient,Class,Sample,Cell_type,Cell_subtype,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC,Immunoproteasome,Const_proteasome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.946544,0.838974,1.423970,1.543122,1.745780,0.718422,0.000000,2.335775,2.069830
SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.376992,1.021651,1.516348,1.215023,1.021651,0.465363,0.000000,1.900679,2.304997
SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.313420,0.000000,0.000000,0.858422,0.000000,0.858422,0.000000,1.572281,1.437806
SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,1.298895,0.000000,1.298895,0.000000,0.000000,0.000000,0.000000,1.000000,1.865930
SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,0.981064,0.811139,1.126265,1.558441,0.981064,0.606306,0.000000,2.048604,1.972822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SMC25-T_TTGTAGGGTCATATGC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,1.008543,1.361662,1.433402,0.254849,0.457763,1.200609,0.000000,1.637740,2.267869
SMC25-T_TTTGCGCAGACACGAC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.445333,0.445333,0.610588,1.176819,1.259751,0.247254,0.247254,1.894608,1.500418
SMC25-T_TTTGCGCCATGGAATA,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.775991,1.014911,0.775991,0.000000,1.014911,0.775991,0.000000,1.596968,1.855631
SMC25-T_TTTGGTTGTAGGGTAC,SMC25,Tumor,SMC25-T,Epithelial cells,CMS2,0.000000,0.673635,0.673635,1.578033,0.000000,0.000000,0.000000,1.526011,1.449090


In [29]:
low_immuno=Epithelial[Epithelial['Immunoproteasome']<=np.percentile(Epithelial['Immunoproteasome'],[25, 50, 75])[0]]
low_immuno=low_immuno[['Immunoproteasome', 'Cell_type']]
#low_immuno['Cell_type'] = low_immuno['Cell_type'].replace('Epithelial cells', 'LowIP epithelial cells')
low_immuno

Unnamed: 0_level_0,Immunoproteasome,Cell_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
SMC01-T_AAACGGGAGGGAAACA,1.000000,Epithelial cells
SMC01-T_AAAGATGTCCGTTGCT,1.000000,Epithelial cells
SMC01-T_AAATGCCTCATTTGGG,1.000000,Epithelial cells
SMC01-T_AACCATGTCCTTGACC,1.000000,Epithelial cells
SMC01-T_AACTCTTGTTCGTCTC,1.168621,Epithelial cells
...,...,...
SMC25-T_TTAACTCTCAAACCGT,1.000000,Epithelial cells
SMC25-T_TTAGGCAGTTTGTGTG,1.153026,Epithelial cells
SMC25-T_TTCTCAACATAGACTC,1.000000,Epithelial cells
SMC25-T_TTGAACGAGGCGACAT,1.000000,Epithelial cells


In [30]:
high_immuno=Epithelial[Epithelial['Immunoproteasome']>=np.percentile(Epithelial['Immunoproteasome'],[25, 50, 75])[2]]
high_immuno=high_immuno[['Immunoproteasome', 'Cell_type']]
#high_immuno['Cell_type'] = high_immuno['Cell_type'].replace('Epithelial cells', 'HighIP epithelial cells')
high_immuno

Unnamed: 0_level_0,Immunoproteasome,Cell_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
SMC01-T_AAACCTGCATACGCCG,2.335775,Epithelial cells
SMC01-T_AAACCTGGTCGCATAT,1.900679,Epithelial cells
SMC01-T_AAACGGGGTATAGGTA,2.048604,Epithelial cells
SMC01-T_AAAGATGAGGCCGAAT,2.124462,Epithelial cells
SMC01-T_AAAGATGTCACGACTA,2.289014,Epithelial cells
...,...,...
SMC25-T_TGTTCCGTCCTTGCCA,1.884682,Epithelial cells
SMC25-T_TTCTACATCTTAGAGC,1.819336,Epithelial cells
SMC25-T_TTCTCAAAGGTTCCTA,1.699779,Epithelial cells
SMC25-T_TTTGCGCAGACACGAC,1.894608,Epithelial cells


In [31]:
#obtaining all the cell types except epithelial (tumor) cells
other_cells = tumor_samples[tumor_samples['Cell_type'] != 'Epithelial cells']
other_cells.set_index('Index', inplace=True)
other_cells=other_cells[['Cell_type']]
other_cells

Unnamed: 0_level_0,Cell_type
Index,Unnamed: 1_level_1
SMC01-T_AAAGTAGAGTGGTAGC,Stromal cells
SMC01-T_ACACCCTGTTGGTAAA,Stromal cells
SMC01-T_ACAGCCGGTCTCTCGT,Stromal cells
SMC01-T_ACATCAGTCGCCTGAG,Stromal cells
SMC01-T_ACATCAGTCTCCCTGA,Stromal cells
...,...
SMC25-T_TTTGGTTCAACGCACC,B cells
SMC25-T_TTTGTCAAGCGCTCCA,B cells
SMC06-T_TCTTCGGCAAACAACA,Mast cells
SMC07-T_TGAGAGGGTTTAGGAA,Mast cells


In [23]:
combined_lowIP_HighIP=pd.concat([low_immuno, high_immuno, other_cells])
combined_lowIP_HighIP

Unnamed: 0_level_0,Immunoproteasome,Cell_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
SMC01-T_AAACGGGAGGGAAACA,1.000000,LowIP epithelial cells
SMC01-T_AAAGATGTCCGTTGCT,1.000000,LowIP epithelial cells
SMC01-T_AAATGCCTCATTTGGG,1.000000,LowIP epithelial cells
SMC01-T_AACCATGTCCTTGACC,1.000000,LowIP epithelial cells
SMC01-T_AACTCTTGTTCGTCTC,1.168621,LowIP epithelial cells
...,...,...
SMC25-T_TTTGGTTCAACGCACC,,B cells
SMC25-T_TTTGTCAAGCGCTCCA,,B cells
SMC06-T_TCTTCGGCAAACAACA,,Mast cells
SMC07-T_TGAGAGGGTTTAGGAA,,Mast cells


In [32]:
combined_low_immuno=pd.concat([low_immuno, other_cells])
combined_low_immuno

Unnamed: 0_level_0,Immunoproteasome,Cell_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
SMC01-T_AAACGGGAGGGAAACA,1.000000,Epithelial cells
SMC01-T_AAAGATGTCCGTTGCT,1.000000,Epithelial cells
SMC01-T_AAATGCCTCATTTGGG,1.000000,Epithelial cells
SMC01-T_AACCATGTCCTTGACC,1.000000,Epithelial cells
SMC01-T_AACTCTTGTTCGTCTC,1.168621,Epithelial cells
...,...,...
SMC25-T_TTTGGTTCAACGCACC,,B cells
SMC25-T_TTTGTCAAGCGCTCCA,,B cells
SMC06-T_TCTTCGGCAAACAACA,,Mast cells
SMC07-T_TGAGAGGGTTTAGGAA,,Mast cells


In [33]:
combined_high_immuno=pd.concat([high_immuno, other_cells])
combined_high_immuno

Unnamed: 0_level_0,Immunoproteasome,Cell_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
SMC01-T_AAACCTGCATACGCCG,2.335775,Epithelial cells
SMC01-T_AAACCTGGTCGCATAT,1.900679,Epithelial cells
SMC01-T_AAACGGGGTATAGGTA,2.048604,Epithelial cells
SMC01-T_AAAGATGAGGCCGAAT,2.124462,Epithelial cells
SMC01-T_AAAGATGTCACGACTA,2.289014,Epithelial cells
...,...,...
SMC25-T_TTTGGTTCAACGCACC,,B cells
SMC25-T_TTTGTCAAGCGCTCCA,,B cells
SMC06-T_TCTTCGGCAAACAACA,,Mast cells
SMC07-T_TGAGAGGGTTTAGGAA,,Mast cells


In [34]:
combined_low_immuno.to_csv('Combined_low_immuno.csv')
combined_high_immuno.to_csv('Combined_high_immuno.csv')

In [24]:
combined_lowIP_HighIP.to_csv('Combined_LowIP_HighIP.csv')