In [1]:
# Libraries
# scanpy reads the DISCO .h5ad atlas; pandas handles every tabular step below.
import scanpy as sc
import pandas as pd
# NOTE(review): numpy, matplotlib and deconomix are not referenced in the cells
# visible here - presumably used elsewhere or left over; confirm before pruning.
import numpy as np
import matplotlib.pyplot as plt
import deconomix as dcx
import warnings
# Silence FutureWarning messages for the whole session (other warning
# categories are still shown).
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# DISCO breast atlas (`disco_breast_v01.h5ad`), obtained from
# https://zenodo.org/records/7396984.
# Opened in backed read-only mode ('r') rather than loaded eagerly.
disco_h5ad_path = '../Data/DISCO/disco_breast_v01.h5ad'
data = sc.read_h5ad(disco_h5ad_path, backed='r')

In [4]:
# Consensus gene table produced by B01_Consensus_Gene_Selection.ipynb.
# The "gene" column is pulled out as a flat 1-D array; its order is the
# reference gene order used for every matrix built later on.
genes_and_conv = pd.read_csv('../Data/Preprocessed/final_genes_and_conv.csv')
gene_column = genes_and_conv.loc[:, "gene"]
consensus_genes = gene_column.values.flatten()

In [6]:
# Lookup tables translating a DISCO cell-type label into the coarser
# "major" and "minor" categories listed in cell_type_aggregation.csv.
ct_aggregation = pd.read_csv('../Data/DISCO/cell_type_aggregation.csv')
disco_ct_names = ct_aggregation['Cell type (DISCO)']
ct_major, ct_minor = (
    dict(zip(disco_ct_names, ct_aggregation[level_column]))
    for level_column in ('Cell type (major)', 'Cell type (minor)')
)

In [8]:
# Split the DISCO atlas into the two cohorts processed below:
# solid breast cancer samples ...
data_cancer = data[(data.obs['disease'] == 'breast cancer') & (data.obs['sample_type'] == 'solid tumor')]
# ... and normal samples without a disease annotation ('NA').
data_healthy = data[(data.obs['disease'] == 'NA') & (data.obs['sample_type'] == 'normal')]
# Original DISCO cell-type label of every cell; used as column labels below.
ct_labels_cancer = data_cancer.obs['ct']
ct_labels_healthy = data_healthy.obs['ct']
# Boolean mask over data.var selecting the consensus genes.
gene_mask = data.var.index.isin(consensus_genes)


def _genes_by_cells(adata_subset, ct_labels, gene_mask):
    """Build a dense genes x cells count table from an AnnData subset.

    Parameters
    ----------
    adata_subset : AnnData (or view) whose ``X`` is sparse; columns of ``X``
        are selected with ``gene_mask`` and the result is transposed.
    ct_labels : cell-type label per cell, used as (non-unique) column labels.
    gene_mask : boolean mask over the genes of ``adata_subset``.

    Returns
    -------
    pandas.DataFrame indexed by ``var['gene']`` of the selected genes, with
    one column per cell.
    """
    return pd.DataFrame(adata_subset.X[:, gene_mask].toarray().T,
                        index=adata_subset.var['gene'][gene_mask].values,
                        columns=ct_labels)


def _normalize_reorder_save(counts, gene_order, counts_path, norm_path):
    """Normalize, reorder and pickle one genes x cells count table.

    Each column is rescaled so that it sums to 10000, both the raw and the
    normalized table are reordered to ``gene_order`` (raises ``KeyError`` if
    a gene is missing), and both are written with ``DataFrame.to_pickle``.

    Returns
    -------
    (counts, counts_norm) : the reordered raw and normalized tables.
    """
    # Normalize to 10k counts per column. The scale factors are computed on
    # the selected genes only. A column whose total is 0 is not special-cased
    # (division by zero) - NOTE(review): confirm no such cells exist.
    counts_norm = counts * (10000 / counts.sum())

    # Reorder index to match order of consensus genes
    counts = counts.loc[gene_order, :]
    counts_norm = counts_norm.loc[gene_order, :]

    # Save results to pickle
    counts.to_pickle(counts_path)
    counts_norm.to_pickle(norm_path)
    return counts, counts_norm


# --- Cancer cohort ---
disco_cancer = _genes_by_cells(data_cancer, ct_labels_cancer, gene_mask)
disco_cancer, disco_cancer_norm = _normalize_reorder_save(
    disco_cancer,
    consensus_genes,
    '../Data/Preprocessed/disco_cancer.pkl',
    '../Data/Preprocessed/disco_cancer_norm.pkl',
)

# --- Healthy cohort ---
disco_healthy = _genes_by_cells(data_healthy, ct_labels_healthy, gene_mask)

# drop cancer epithelial cells (by major cell type) before normalizing
disco_healthy = disco_healthy.loc[:, disco_healthy.columns.map(ct_major) != 'Cancer Epithelial']

disco_healthy, disco_healthy_norm = _normalize_reorder_save(
    disco_healthy,
    consensus_genes,
    '../Data/Preprocessed/disco_healthy.pkl',
    '../Data/Preprocessed/disco_healthy_norm.pkl',
)


In [9]:
# TCGA data: run tcga_downloader.py once, extract the downloaded archives,
# then run it again to produce tcga_brca_rnaseq_counts.csv.
#
# NOTE: from here on `data` refers to the TCGA count table, no longer to the
# DISCO AnnData object loaded earlier - the DISCO cells above cannot be
# re-run after this cell without reloading the atlas.
#
# low_memory=False: the file carries a "PAM50" label row, so every sample
# column contains strings next to the counts. Parsing the file in one pass
# avoids chunk-wise dtype inference producing mixed-type columns; the numeric
# conversion below then yields the same values.
data = pd.read_csv("../Data/TCGA/tcga_brca_rnaseq_counts.csv", index_col=0, low_memory=False)
pam50_row = data.loc["PAM50"].tolist()  # Extract PAM50 row as a list (one entry per column)
data = data.drop("PAM50")  # Remove PAM50 row from main data
# Remaining rows are counts: coerce to numbers (unparseable entries become
# missing) and store as nullable integers.
data = data.apply(pd.to_numeric, errors="coerce").astype("Int64")
# Subtype labels are written in column order, without the sample identifiers.
pd.Series(pam50_row).to_csv('../Data/Preprocessed/tcga_subtype.csv', index=False)

data

  data = pd.read_csv("../Data/TCGA/tcga_brca_rnaseq_counts.csv", index_col=0)


Unnamed: 0_level_0,TCGA-B6-A408,TCGA-AQ-A04L,TCGA-E2-A15S,TCGA-D8-A1JK,TCGA-B6-A0RT,TCGA-BH-A1F2,TCGA-LL-A441,TCGA-A2-A3XT,TCGA-BH-A0GY,TCGA-BH-A18S,...,TCGA-AN-A0AM,TCGA-BH-A1FJ.1,TCGA-E2-A156,TCGA-BH-A1FU.1,TCGA-E2-A109,TCGA-C8-A137,TCGA-A1-A0SE,TCGA-B6-A0IN,TCGA-A8-A06R,TCGA-AR-A1AT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,1899,3350,1456,4728,1494,6213,5172,1286,3451,3603,...,2501,9160,635,2483,3936,745,5089,2668,1287,1402
TNMD,4,4,24,1,78,528,68,23,15,760,...,1,879,1,121,2,0,24,134,9,3
DPM1,2167,4400,4671,1826,3955,2325,4333,4211,7829,1322,...,7444,4066,718,1037,3148,1537,1563,969,2255,1210
SCYL3,1516,1654,2480,695,812,1505,2584,1472,948,947,...,2268,2022,690,1092,2106,1798,1769,620,654,1129
C1orf112,417,341,1329,456,404,391,971,1507,534,231,...,1177,762,273,293,832,497,1308,190,401,549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC008763.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL592295.6,326,282,516,217,349,609,395,285,244,289,...,432,584,134,223,425,207,617,425,342,518
AC006486.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL391628.1,11,3,15,1,8,25,35,45,4,9,...,11,11,1,12,5,5,3,2,7,3


In [10]:
# Restrict the TCGA table to the consensus genes (in consensus order) and
# scale each gene row by its conversion factor. The factors are applied
# positionally, so they must be listed in the same order as `consensus_genes`.
conv_factors = genes_and_conv["conversion factor"].values.flatten()
tcga_conv = data.loc[consensus_genes, :].mul(conv_factors, axis="index")
tcga_conv

Unnamed: 0_level_0,TCGA-B6-A408,TCGA-AQ-A04L,TCGA-E2-A15S,TCGA-D8-A1JK,TCGA-B6-A0RT,TCGA-BH-A1F2,TCGA-LL-A441,TCGA-A2-A3XT,TCGA-BH-A0GY,TCGA-BH-A18S,...,TCGA-AN-A0AM,TCGA-BH-A1FJ.1,TCGA-E2-A156,TCGA-BH-A1FU.1,TCGA-E2-A109,TCGA-C8-A137,TCGA-A1-A0SE,TCGA-B6-A0IN,TCGA-A8-A06R,TCGA-AR-A1AT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IGKC,3427544.288955,10599.69434,29498.296286,776500.451905,3731670.116131,508408.561932,1019022.463114,278400.218307,124634.320652,208517.58894,...,577356.81066,589739.865896,653.061737,220101.899437,974508.770438,60131.915284,332463.682992,610356.522565,1266216.377515,506112.798751
IGHA1,22733.649347,859.520458,6900.409657,15221.028363,58923.218754,68950.75538,44638.085728,19659.863563,9762.64915,23453.149195,...,15619.268801,20157.512573,75.768729,34369.30147,4497.631733,13063.741121,44248.331388,60871.990456,45922.517216,7966.627206
IGHM,77987.026304,360.384827,315.336724,49046.122555,122841.172572,3100.811116,75655.786954,8166.220074,9819.235201,7098.830291,...,9382.518865,106488.711045,35.037414,10477.438044,32474.677192,2368.779436,8835.684943,8039.835117,516290.056154,8262.572962
IGHG1,1067414.705488,923.622329,233.622119,290954.616302,551384.8729,9016.862986,295292.924712,73392.66018,23087.162554,11215.89909,...,130337.372431,18480.596188,107.303182,3701.959791,141477.88764,1511.073296,4195.011122,39116.763899,360859.923056,126454.083845
JCHAIN,5115.005149,341.178737,3053.215211,8841.212144,51511.299567,51623.687857,14421.825923,4480.814084,6283.040591,16131.733477,...,1348.65948,96999.121973,20.069337,20465.37242,8158.854669,4484.827952,22569.976945,19265.226038,85990.42138,2354.802266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PPID,1538.241662,1439.245911,2945.123578,1195.564064,1999.904537,1755.270807,1228.879941,1511.58896,1839.036442,989.957505,...,1860.929733,2176.002747,825.281882,1011.850796,2593.87904,2042.739237,1893.293729,903.336223,1110.846547,1109.894664
STAMBP,1832.061483,1906.497363,2658.232693,1862.238191,2744.068663,2376.583417,1745.554919,1394.163918,1406.905194,1194.32705,...,2634.76192,3065.282957,1060.208347,1146.044317,2835.939975,2066.09862,2282.700325,1197.680018,2813.139795,1483.352855
EP300,1103.766733,1353.736843,3073.386524,919.303259,1673.433343,2827.234286,1618.777499,1644.497896,1147.973665,1155.408468,...,1366.998923,2502.11239,1052.526879,1715.83181,2261.787428,1000.282322,2077.926777,540.329282,975.767569,1109.192129
PTGS1,371.285763,258.67835,96.937985,412.451208,263.99002,275.144528,350.570248,353.226083,196.000639,320.293727,...,237.697252,496.110018,496.375601,196.000639,162.005948,184.04938,469.020498,142.352767,347.648829,155.100776


In [11]:
# Rescale every column of the converted table so that it sums to 10000.
per_column_scale = 10000 / tcga_conv.sum()
tcga_conv_norm = tcga_conv.mul(per_column_scale, axis="columns")
tcga_conv_norm

Unnamed: 0_level_0,TCGA-B6-A408,TCGA-AQ-A04L,TCGA-E2-A15S,TCGA-D8-A1JK,TCGA-B6-A0RT,TCGA-BH-A1F2,TCGA-LL-A441,TCGA-A2-A3XT,TCGA-BH-A0GY,TCGA-BH-A18S,...,TCGA-AN-A0AM,TCGA-BH-A1FJ.1,TCGA-E2-A156,TCGA-BH-A1FU.1,TCGA-E2-A109,TCGA-C8-A137,TCGA-A1-A0SE,TCGA-B6-A0IN,TCGA-A8-A06R,TCGA-AR-A1AT
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IGKC,703.612218,2.103291,6.888553,214.497157,689.589068,91.027913,208.874223,46.559796,40.026762,61.692947,...,113.463906,100.377323,0.45446,89.690401,241.38447,21.344196,79.467313,109.530107,271.347141,160.064418
IGHA1,4.666803,0.170554,1.61141,4.204592,10.888639,12.345275,9.149696,3.287926,3.13531,6.938954,...,3.069546,3.430932,0.052727,14.005315,1.114057,4.637056,10.576482,10.923641,9.841086,2.519544
IGHM,16.009312,0.071511,0.073639,13.548291,22.700273,0.555184,15.507552,1.365723,3.153483,2.100292,...,1.843881,18.125028,0.024382,4.269503,8.043932,0.840813,2.111954,1.44277,110.639724,2.613141
IGHG1,219.120736,0.183274,0.054556,80.372056,101.892442,1.614423,60.527694,12.274226,7.414526,3.318386,...,25.614294,3.14551,0.074671,1.50853,35.043876,0.536365,1.002715,7.019608,77.331418,39.992665
JCHAIN,1.050017,0.0677,0.712998,2.442259,9.518963,9.242953,2.956115,0.749374,2.017821,4.772807,...,0.265043,16.509842,0.013966,8.339535,2.020937,1.591917,5.394801,3.457196,18.427542,0.744735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PPID,0.315773,0.285589,0.687756,0.330257,0.36957,0.314272,0.25189,0.252799,0.590613,0.292893,...,0.365716,0.370369,0.574307,0.412324,0.6425,0.725083,0.452546,0.162106,0.238052,0.351018
STAMBP,0.376089,0.378305,0.62076,0.514417,0.507087,0.425515,0.357795,0.233161,0.451833,0.353359,...,0.517791,0.52173,0.737791,0.467007,0.702458,0.733375,0.545624,0.214927,0.602849,0.469129
EP300,0.226583,0.268621,0.717709,0.253944,0.30924,0.506202,0.331809,0.275027,0.368676,0.341844,...,0.268647,0.425875,0.732445,0.699193,0.560242,0.355056,0.496678,0.096964,0.209105,0.350796
PTGS1,0.076218,0.051329,0.022637,0.113934,0.048784,0.049263,0.071858,0.059074,0.062946,0.094764,...,0.046713,0.084441,0.345424,0.079869,0.040129,0.065329,0.112108,0.025546,0.0745,0.049053


In [12]:
# Persist the converted TCGA table and its 10k-normalized version.
for tcga_frame, pickle_path in (
    (tcga_conv, '../Data/Preprocessed/tcga_conv.pkl'),
    (tcga_conv_norm, '../Data/Preprocessed/tcga_conv_norm.pkl'),
):
    tcga_frame.to_pickle(pickle_path)