## Imports and Data

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from matplotlib import colormaps as cm
import seaborn as sb
import scanpy as sc
import umap as up
from scipy.stats import gaussian_kde
from scib import metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
sc.set_figure_params(figsize=(4,4),dpi=90)

In [38]:
adata= sc.read_h5ad("Data\\project_data_full.h5ad")

## Basic Preprocessing

In [39]:
adata

AnnData object with n_obs × n_vars = 560 × 54271
    obs: 'CCLE_ID', 'Name', 'Pathology', 'Site_Primary', 'Site_Subtype1', 'Site_Subtype2', 'Site_Subtype3', 'Histology', 'Hist_Subtype1', 'Hist_Subtype2', 'Hist_Subtype3', 'Gender', 'Life_Stage', 'Age', 'Race', 'Geo_Loc', 'inferred_ethnicity', 'Site_Of_Finding', 'Disease', 'Annotation_Source', 'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium', 'Supplements', 'Freezing.Medium', 'Doubling.Time.from.Vendor', 'Doubling.Time.Calculated.hrs', 'type', 'type_refined', 'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'

In [40]:
sc.pp.filter_genes(adata,min_cells=1)

In [41]:
adata

AnnData object with n_obs × n_vars = 560 × 47269
    obs: 'CCLE_ID', 'Name', 'Pathology', 'Site_Primary', 'Site_Subtype1', 'Site_Subtype2', 'Site_Subtype3', 'Histology', 'Hist_Subtype1', 'Hist_Subtype2', 'Hist_Subtype3', 'Gender', 'Life_Stage', 'Age', 'Race', 'Geo_Loc', 'inferred_ethnicity', 'Site_Of_Finding', 'Disease', 'Annotation_Source', 'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium', 'Supplements', 'Freezing.Medium', 'Doubling.Time.from.Vendor', 'Doubling.Time.Calculated.hrs', 'type', 'type_refined', 'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'
    var: 'n_cells'

In [42]:
adata_deepCDR= sc.read_h5ad("Data\\project_data.h5ad")

In [43]:
adata_deepCDR

AnnData object with n_obs × n_vars = 561 × 697
    obs: 'CCLE_ID', 'Name', 'Pathology', 'Site_Primary', 'Site_Subtype1', 'Site_Subtype2', 'Site_Subtype3', 'Histology', 'Hist_Subtype1', 'Hist_Subtype2', 'Hist_Subtype3', 'Gender', 'Life_Stage', 'Age', 'Race', 'Geo_Loc', 'inferred_ethnicity', 'Site_Of_Finding', 'Disease', 'Annotation_Source', 'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium', 'Supplements', 'Freezing.Medium', 'Doubling.Time.from.Vendor', 'Doubling.Time.Calculated.hrs', 'type', 'type_refined', 'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'

In [44]:
full = pd.read_csv("Data\\CCLE_depMap_18Q4_TPM_v2.csv",index_col=0)

In [45]:
full = full.index

In [46]:
set(adata_deepCDR.obs.index)-set(adata.obs.index) #missing cellline

{'ACH-001190'}

In [47]:
set(adata_deepCDR.var.index)-set(adata.var.index)

set()

In [48]:
adata.var["DeepCDR_Oncogene"] =False
adata.var.loc[adata_deepCDR.var.index,"DeepCDR_Oncogene"]=True

In [49]:
sum(adata.var["DeepCDR_Oncogene"])

697

In [62]:
sc.pp.highly_variable_genes(adata,n_top_genes=3000,flavor="seurat_v3")

In [64]:
adata.var.sort_values("highly_variable_rank",ascending =True)[:10]

Unnamed: 0,n_cells,DeepCDR_Oncogene,highly_variable,means,dispersions,dispersions_norm,highly_variable_rank,variances,variances_norm
VIM,560,False,True,7.135359,11.750155,1.218092,0.0,13.735656,27.872288
KRT19,560,False,True,5.67046,11.542365,0.814296,1.0,15.802694,27.637481
KRT8,560,False,True,7.549101,12.023965,1.439404,2.0,12.338095,25.790941
SPARC,560,False,True,4.985572,13.045238,1.623394,3.0,14.851829,23.553888
KRT18,560,False,True,7.348741,11.653817,0.874298,4.0,11.060748,22.803081
UCHL1,560,False,True,5.13627,10.152159,0.137915,5.0,13.820809,22.448134
TGFBI,560,False,True,5.480094,11.70001,0.840388,6.0,13.041703,22.247508
FN1,560,False,True,5.967811,13.737876,2.824705,7.0,12.090553,21.92661
KRT7,560,False,True,5.057133,11.274549,0.647298,8.0,13.47463,21.617093
LGALS1,560,False,True,8.884486,12.111261,1.556828,9.0,9.470472,21.263935


In [72]:
#filter columns with too few information
for c in adata.obs.columns:
    print(c)
    count = adata.obs[c].count()
    if count <100:
        adata.obs.drop(columns=[c],inplace=True)
    

CCLE_ID
Name
Pathology
Site_Primary
Site_Subtype1
Site_Subtype2
Site_Subtype3
Histology
Hist_Subtype1
Hist_Subtype2
Hist_Subtype3
Gender
Life_Stage
Age
Race
Geo_Loc
inferred_ethnicity
Site_Of_Finding
Disease
Annotation_Source
Original.Source.of.Cell.Line
Characteristics
Growth.Medium
Supplements
Freezing.Medium
Doubling.Time.from.Vendor
Doubling.Time.Calculated.hrs
type
type_refined
PATHOLOGIST_ANNOTATION
mutRate
tcga_code


In [73]:
adata

AnnData object with n_obs × n_vars = 560 × 47269
    obs: 'CCLE_ID', 'Name', 'Pathology', 'Site_Primary', 'Site_Subtype1', 'Site_Subtype2', 'Site_Subtype3', 'Histology', 'Hist_Subtype1', 'Hist_Subtype2', 'Hist_Subtype3', 'Gender', 'Age', 'Race', 'inferred_ethnicity', 'Site_Of_Finding', 'Disease', 'Annotation_Source', 'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium', 'Freezing.Medium', 'Doubling.Time.from.Vendor', 'Doubling.Time.Calculated.hrs', 'type', 'type_refined', 'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'
    var: 'n_cells', 'DeepCDR_Oncogene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_rank', 'variances', 'variances_norm'
    uns: 'hvg'

In [79]:
#clean up some columns (especially if only one catgeory)
for c in adata.obs.columns:
    if adata.obs[c].dtype=="category":
        print(c)
        print(adata.obs[c].cat.categories)
        if len(adata.obs[c].cat.categories)==1:
            adata.obs.drop(columns=[c],inplace=True)

Name
Index(['8-MG-BA', '22Rv1', '42-MG-BA', '639-V', '647-V', '697', '769-P',
       '786-O', '5637', '8305C',
       ...
       'VMRC-RCW', 'VMRC-RCZ', 'WM-115', 'WM-793', 'WSU-DLCL2', 'YAPC',
       'YH-13', 'YKG1', 'ZR-75-30', 'huH-1'],
      dtype='object', length=559)
Pathology
Index(['metastasis', 'primary'], dtype='object')
Site_Primary
Index(['autonomic_ganglia', 'biliary_tract', 'bone', 'breast',
       'central_nervous_system', 'endometrium',
       'haematopoietic_and_lymphoid_tissue', 'kidney', 'large_intestine',
       'liver', 'lung', 'oesophagus', 'ovary', 'pancreas', 'pleura',
       'prostate', 'salivary_gland', 'skin', 'soft_tissue', 'stomach',
       'thyroid', 'upper_aerodigestive_tract', 'urinary_tract'],
      dtype='object')
Site_Subtype1
Index(['NS', 'bile_duct', 'bladder', 'brain', 'bronchus', 'caecum',
       'cerebellum', 'colon', 'femur', 'fibrous_tissue_and_uncertain_origin',
       'frontal_lobe', 'head_neck', 'left_upper_lobe', 'lower_third',
       'lymp

In [80]:
adata

AnnData object with n_obs × n_vars = 560 × 47269
    obs: 'CCLE_ID', 'Name', 'Pathology', 'Site_Primary', 'Site_Subtype1', 'Site_Subtype2', 'Histology', 'Hist_Subtype1', 'Hist_Subtype2', 'Gender', 'Age', 'Race', 'inferred_ethnicity', 'Site_Of_Finding', 'Disease', 'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium', 'Freezing.Medium', 'Doubling.Time.from.Vendor', 'Doubling.Time.Calculated.hrs', 'type', 'type_refined', 'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'
    var: 'n_cells', 'DeepCDR_Oncogene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_rank', 'variances', 'variances_norm'
    uns: 'hvg'

In [82]:
for c in adata.obs.columns:
    if adata.obs[c].dtype=="category":
        print(c)
        print(adata.obs[c].cat.categories)
        
#To clean up:
#Freezing medium
#Characteristics


Name
Index(['8-MG-BA', '22Rv1', '42-MG-BA', '639-V', '647-V', '697', '769-P',
       '786-O', '5637', '8305C',
       ...
       'VMRC-RCW', 'VMRC-RCZ', 'WM-115', 'WM-793', 'WSU-DLCL2', 'YAPC',
       'YH-13', 'YKG1', 'ZR-75-30', 'huH-1'],
      dtype='object', length=559)
Pathology
Index(['metastasis', 'primary'], dtype='object')
Site_Primary
Index(['autonomic_ganglia', 'biliary_tract', 'bone', 'breast',
       'central_nervous_system', 'endometrium',
       'haematopoietic_and_lymphoid_tissue', 'kidney', 'large_intestine',
       'liver', 'lung', 'oesophagus', 'ovary', 'pancreas', 'pleura',
       'prostate', 'salivary_gland', 'skin', 'soft_tissue', 'stomach',
       'thyroid', 'upper_aerodigestive_tract', 'urinary_tract'],
      dtype='object')
Site_Subtype1
Index(['NS', 'bile_duct', 'bladder', 'brain', 'bronchus', 'caecum',
       'cerebellum', 'colon', 'femur', 'fibrous_tissue_and_uncertain_origin',
       'frontal_lobe', 'head_neck', 'left_upper_lobe', 'lower_third',
       'lymp

In [84]:
import re

In [125]:
mapping={}
for c in adata.obs["Characteristics"].cat.categories:
    mapping[c]="other"
    if re.search('adherent', c, re.IGNORECASE) and re.search('susp', c, re.IGNORECASE) is None:
        mapping[c]="adherent"
    if re.search('sus', c, re.IGNORECASE) and re.search('adh', c, re.IGNORECASE) is None:
        mapping[c]="suspension"
    if re.search('mix', c, re.IGNORECASE):
        mapping[c]="mixed"
    if re.search('epit', c, re.IGNORECASE):
        mapping[c]="adherent"
    if re.search('fibro', c, re.IGNORECASE):
        mapping[c]="adherent"
    if re.search('lymph', c, re.IGNORECASE):
        mapping[c]="suspension"
    if re.search('singly', c, re.IGNORECASE):
        mapping[c]="suspension"
    if re.search('clumps', c, re.IGNORECASE):
        mapping[c]="suspension"
    if re.search('monolayer', c, re.IGNORECASE):
        mapping[c]="adherent"

In [126]:
len(mapping.keys())

187

In [127]:
for c,v in mapping.items():
    print(c)
    print(v)
    print()

(relatively small) round cells growing in suspention, singly or in small clusters
suspension

ADHERENT
adherent

ADHERENT CELLS
adherent

ADHERENT CELLS, GROWING IN MONOLAYERS
adherent

ADHERENT EPITHELIAL
adherent

ADHERENT EPITHELIAL;
adherent

ADHERENT EPITHELOID CELLS GROWING IN MONOLAYERS
adherent

ADHERENT FIBROBLAST
adherent

ADHERENT FIBROBLAST LIKE
adherent

ADHERENT FIBROBLASTS
adherent

ADHERENT FIBROBLASTS, NOT LIMITED TO MONOLAYER GROWTH
adherent

ADHERENT MONOLAYERS
adherent

ADHERENT OVOID TO EPITHELOID CELLS GROWING IN MONOLAYERS
adherent

ADHERENT, EPITHIAL-LIKE
adherent

ADHERENT, LARGE, SPINDLE-LIKE CELLS GROWING AS MONOLAYERS
adherent

ADHERENT; EPITHELIAL-LIKE
adherent

ADHERENT; fast grower
adherent

ADHRENT EPITHELIAL; was clumpy
adherent

AHDHERENT; `
other

AHERENT EPITHELIAL;
adherent

Adherenet
other

Adherent
adherent

Adherent , small pools of cells with large extracel. Space
adherent

Adherent Epithelial
adherent

Adherent Spindle cells and large multinucl

In [129]:
adata.obs["adherent_or_suspension"] = adata.obs.Characteristics.map(mapping)

In [135]:
mapping={}
for c in adata.obs["Freezing.Medium"].cat.categories:
    if re.search('5', c, re.IGNORECASE):
        mapping[c]="5% DMSO"
    if re.search('10', c, re.IGNORECASE):
        mapping[c]="10% DMSO"

In [136]:
adata.obs["Freezing.Medium"] = adata.obs["Freezing.Medium"].map(mapping)

In [144]:
pd.DataFrame(adata.X,columns=adata.var.index,index=adata.obs.index).to_csv("cleaned_TPM_counts.csv")

In [145]:
adata.obs.to_csv("Data\\metadata_cleaned.csv")


In [146]:
adata.var.to_csv("Data\\gene_information.csv")

In [147]:
adata.write_h5ad("Data\\clean_data_full.h5ad")