In [1]:
# import the needed packages
import pandas as pd
import stlearn as st
import scanpy as sc
import scanpy.external as sce
# import squidpy as sq
import matplotlib.pyplot as plt
import os
import sys
import subprocess
import numpy as np
import anndata as ad
from anndata import AnnData
import matplotlib as mpl
import seaborn as sns; sns.set(color_codes=True)
import monkeybread as mb

import cupy as cp
import time
import rapids_singlecell as rsc

import scimap as sm

# report the working directory so a wrong launch location is obvious
print(os.getcwd())

# samples under study, paired with the population label of each sample;
# the two parallel lists below are derived from this single table
sample_populations = [
    ("S17_7722_E4", 'Haitian'),
    ("S1614455_C21", 'Haitian'),
    ("S17_16197_A1", 'African American'),
    ("S18_13562_A1", 'African American'),
    ("S18_5464_A1", 'Haitian'),
    ("S18_13562_B1", 'African American'),
    ("S17_10147B7_S2", 'Haitian'),
]
filenames = [sample_name for sample_name, _ in sample_populations]
pop_samples = [population for _, population in sample_populations]

/mnt/plummergrp/QuPath_0.4.4/BrCa


In [None]:
# load the session data and the scimap-phenotyped copy of it
adata = ad.read_h5ad('./session_file.h5ad')
newAD = ad.read_h5ad('./session_file_scimap_v8_scaled.h5ad')

# copy the phenotype calls across, then strip the 'likely-' prefix so that
# e.g. 'likely-X' and 'X' end up as the same category
adata.obs['phenotype'] = newAD.obs['phenotype'].astype('category')
adata.obs['phenotype'] = adata.obs['phenotype'].str.replace('likely-', '').astype('category')

# drop the two catch-all classes
phenotype_calls = adata.obs['phenotype']
is_catch_all = (phenotype_calls == 'Non-Immune cells') | (phenotype_calls == 'Other Immune cells')
adata = adata[~is_catch_all]

# one AnnData per sample (same order as `filenames`), each plotted as it is made
adatas_sm = []
for sample_name in filenames:
    sample_adata = adata[adata.obs['ImageID'] == sample_name]
    adatas_sm.append(sample_adata)
    st.pl.cluster_plot(sample_adata, use_label="phenotype", show_cluster_labels=False, size=0.5, figsize=(8, 8), cmap='tab20')
    plt.show()

In [None]:
# generate random squares of the images
import random

# Each square spans centroid +/- radius_box along both image axes, i.e. its
# side length is 2 * radius_box (in the units of obs['imagecol'/'imagerow']).
radius_box = 400
# number of accepted squares scored per sample
num_sq = 50
# a square is only scored when it holds strictly more than this many cells
min_cells = 100

# Seed the generator so the sampled squares (and therefore every score derived
# from them) are reproducible after a kernel restart.
random_seed = 0
random.seed(random_seed)

# load in the workflow
phenoDF = pd.read_csv('../SciMap/phenotype_workflow.csv')

# helper: collapse a list of lists into a single flat list
def flatten(l):
    """Return one list holding the items of every sub-iterable of ``l``, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# initialize the tables used to store the scores and the mistakes
# Rows cover every phenotype in the filtered data set, not only those of the
# first sample: the scoring loop reads df_mistakes.at[group, ...] for groups of
# every sample and would raise KeyError for a phenotype missing from sample 0.
all_groups = adata.obs['phenotype'].dropna().unique().tolist()

# num_mistakes / num_total are integer counters; '%_wrong' is created as float
# because it later receives a percentage.
df_mistakes = pd.DataFrame({'num_mistakes': 0, 'num_total': 0, '%_wrong': 0.0}, index=all_groups)

# per (phenotype, gene) mistake counter
df_gene_mistakes = pd.DataFrame(0, index=all_groups, columns=pd.Index.tolist(adatas_sm[0].var_names))

total_cells = 0

# every phenotype of the filtered data set; used to find the workflow rows that
# are absent from a given square
all_phenotypes = set(adata.obs['phenotype'].unique().tolist())

# Upper bound on the number of squares drawn per sample. Without it the loop
# below never ends for a sample in which no square can hold more than
# min_cells cells.
max_attempts = 100 * num_sq

for i in range(len(filenames)):
    # scale the data
    temp_uncut = adatas_sm[i].copy()
    st.pp.scale(temp_uncut, zero_center=True)

    groups_tot = temp_uncut.obs['phenotype'].unique().tolist()
    genes_tot = pd.Index.tolist(temp_uncut.var_names)
    temp_pheno = temp_uncut.obs['phenotype']
    X_temp_uncut = np.array(temp_uncut.X)

    # mean scaled expression of every gene within every phenotype of the whole
    # sample, built directly as a float table (phenotypes x genes)
    df_tot = pd.DataFrame(
        [X_temp_uncut[(temp_pheno == group).to_numpy(), :].mean(axis=0) for group in groups_tot],
        index=groups_tot,
        columns=genes_tot,
        dtype=float,
    )
    # per-gene standard deviation of those phenotype means; used as the
    # "1 std" threshold below
    mystd = df_tot.std()

    # region in which a centroid may fall so that the square does not cross the
    # edge of the tissue; it does not change between draws, so compute it once
    col_coords = temp_uncut.obs["imagecol"]
    row_coords = temp_uncut.obs["imagerow"]
    x_min = col_coords.min() + radius_box
    x_max = col_coords.max() - radius_box
    y_min = row_coords.min() + radius_box
    y_max = row_coords.max() - radius_box

    j = 0         # accepted squares so far
    attempts = 0  # drawn squares so far (accepted or not)
    while j < num_sq:
        if attempts >= max_attempts:
            print(f"{filenames[i]}: only {j} of {num_sq} squares held more than "
                  f"{min_cells} cells after {attempts} draws; moving on")
            break
        attempts += 1

        # randomly generate a centroid
        x_cent = random.random()*(x_max-x_min)+x_min
        y_cent = random.random()*(y_max-y_min)+y_min

        # cells strictly inside the square centred on the centroid
        in_box = ((col_coords > x_cent - radius_box)
                  & (col_coords < x_cent + radius_box)
                  & (row_coords > y_cent - radius_box)
                  & (row_coords < y_cent + radius_box))
        candidate = temp_uncut[in_box]

        # reject sparse squares and draw again
        if len(candidate) <= min_cells:
            continue

        # temp_scaled always refers to the most recent *accepted* square
        temp_scaled = candidate
        total_cells = total_cells + len(temp_scaled)

        # phenotypes present in this square and their (whole-sample) mean expression
        # NOTE(review): df holds the sample-wide means from df_tot, not means
        # recomputed inside the square -- confirm this is intended.
        groups = temp_scaled.obs['phenotype'].unique().tolist()
        genes = pd.Index.tolist(temp_scaled.var_names)
        df = df_tot.loc[groups]

        # workflow table restricted to the phenotypes present in the square,
        # indexed by phenotype name
        absentCells = list(all_phenotypes - set(groups))
        temp_phenoDF = (
            phenoDF[~phenoDF['Unnamed: 1'].isin(absentCells)]
            .drop(columns='Unnamed: 0')
            .set_index('Unnamed: 1')
        )

        # compare the workflow with the mean expression
        # a (phenotype, gene) pair is correct when:
        #   workflow says pos -> mean expression >  1 std
        #   workflow is NaN   -> mean expression within +/- 1 std
        #   workflow says neg -> mean expression < -1 std
        for group in groups:
            for gene in genes:
                df_mistakes.at[group, 'num_total'] += 1

                rule = temp_phenoDF.at[group, gene]
                expression = df.at[group, gene]
                threshold = mystd[gene]

                # np.float64 is a subclass of float, so this covers both
                if isinstance(rule, float) and pd.isna(rule):
                    is_mistake = abs(expression) > threshold
                elif isinstance(rule, str) and 'pos' in rule:
                    is_mistake = expression <= threshold
                elif isinstance(rule, str) and 'neg' in rule:
                    is_mistake = expression >= -threshold
                else:
                    # any other workflow value is counted but never a mistake
                    is_mistake = False

                if is_mistake:
                    df_mistakes.at[group, 'num_mistakes'] += 1
                    df_gene_mistakes.at[group, gene] += 1
        j = j+1

# find out which cell types are most falsely identified
# Computed for the whole column at once: this gives '%_wrong' a float dtype
# (the table may have been initialised with integer zeros) and turns a zero
# 'num_total' into NaN instead of dividing by zero.
checked = df_mistakes['num_total'].replace(0, np.nan)
df_mistakes['%_wrong'] = (df_mistakes['num_mistakes'] * 100) / checked
df_mistakes = df_mistakes.sort_values(by='%_wrong', ascending=False)
print(df_mistakes)

# print total score: percentage of all (phenotype, gene) checks that were correct
total_checks = df_mistakes['num_total'].sum()
total_mistakes = df_mistakes['num_mistakes'].sum()
if total_checks > 0:
    print(((total_checks - total_mistakes) / total_checks) * 100)
else:
    print("no squares were scored, so there is no total score")

In [None]:
# find out the % of mistake for each gene by cell type
# num_total grows by one per gene for every scored square that contains the
# phenotype, so num_total / n_genes is the number of times each
# (phenotype, gene) pair was checked.
n_genes = len(df_gene_mistakes.columns)
checks_per_pair = (
    (df_mistakes['num_total'] / n_genes)
    .reindex(df_gene_mistakes.index)
    .replace(0, np.nan)  # phenotype never scored -> NaN rather than a division by zero
)

# Built as a float table in one step; writing float percentages cell by cell
# into an integer-initialised frame truncates or warns depending on the
# pandas version.
df_gene_mistakes_percent = (df_gene_mistakes.astype(float) * 100).div(checks_per_pair, axis=0)

df_gene_mistakes_percent.style.background_gradient(cmap ='viridis', axis=None)\
        .set_properties(**{'font-size': '20px'}) 

In [None]:
# find out which genes have the most mistakes
# sum(num_total) / n_genes is the number of (square, phenotype) checks each
# gene took part in; the per-gene mistake count is expressed as a % of that.
n_genes = len(df_gene_mistakes.columns)
checks_per_gene = df_mistakes['num_total'].sum() / n_genes

# Built directly as a float column instead of writing floats into an
# integer-initialised frame one cell at a time.
df_gene_summed_mistakes = (
    ((df_gene_mistakes.sum(axis=0) / checks_per_gene) * 100)
    .to_frame('%_mistakes')
    .sort_values(by='%_mistakes', ascending=False)
)
print(df_gene_summed_mistakes)

In [None]:
# phenotype the cells manually to test the logic
# NOTE: temp_scaled and mystd are whatever the sampling cell left behind
# (its most recent square and the thresholds of the last sample processed).

# dense expression matrix (cells x genes); a sparse X is densified explicitly
raw_X = temp_scaled.X
exp_mat = raw_X.toarray() if hasattr(raw_X, 'toarray') else np.array(raw_X)

# per-gene threshold, ordered like the columns of exp_mat
std_per_gene = mystd.reindex(temp_scaled.var_names).to_numpy(dtype=float)

# turn the matrix into a ternary:
#  1: the expression is >1 std --> upregulated
#  0: the expression is within 1 std
# -1: the expression is <-1 std --> downregulated
# Done with one vectorised comparison instead of a Python loop over every
# (cell, gene) pair.
ternary = np.where(exp_mat > std_per_gene, 1, np.where(exp_mat < -std_per_gene, -1, 0))

# rows are positional cell numbers 0..n-1, columns are gene names
df_cell_genes = pd.DataFrame(
    ternary,
    index=range(len(temp_scaled.obs_names)),
    columns=temp_scaled.var_names,
)


In [None]:
# load in the workflow, indexed by phenotype name (column 'Unnamed: 1')
phenoDF = (
    pd.read_csv('../SciMap/phenotype_workflow.csv')
    .drop(columns='Unnamed: 0')
    .set_index('Unnamed: 1')
)

# collect every (phenotype, gene) pair that the workflow marks as 'anypos'
# The records are gathered in a list and turned into a frame once; growing a
# DataFrame with pd.concat inside the loop is quadratic and warns when one of
# the inputs is empty.
anypos_records = []
for cell_type, row in phenoDF.iterrows():
    genes_with_anypos = row[row == 'anypos'].index.tolist()
    anypos_records.extend({'phenotypes': cell_type, 'genes': gene} for gene in genes_with_anypos)

# index by gene for ease of access; explicit columns keep the frame well-formed
# even when the workflow contains no 'anypos' entry at all
df_anypos = pd.DataFrame(anypos_records, columns=['phenotypes', 'genes']).set_index('genes')
# print(df_anypos)

In [None]:
# score the cell types using the difference between the standard deviation and the expression level
# NOTE: temp_scaled, mystd, groups_tot and genes_tot are whatever the sampling
# cell left behind, and phenoDF must already be indexed by phenotype name.
exp_mat = np.array(temp_scaled.X)
factor = 3  # exponent applied to the reward of a matched pos/neg rule; a miss uses factor/3


def _rule_code(rule):
    """Classify one workflow entry: 0 = NaN (no expectation), 1 = contains 'pos',
    -1 = contains 'neg', 2 = anything else (not scored)."""
    if isinstance(rule, float) and pd.isna(rule):  # np.float64 is a float subclass
        return 0
    if isinstance(rule, str) and 'pos' in rule:
        return 1
    if isinstance(rule, str) and 'neg' in rule:
        return -1
    return 2


# The workflow does not depend on the cell, so classify it once
# (phenotypes x genes) instead of once per cell.
rule_codes = np.array([[_rule_code(phenoDF.at[group, gene]) for gene in genes_tot]
                       for group in groups_tot])
std_per_gene = np.array([mystd[gene] for gene in genes_tot], dtype=float)

# Contribution of every (cell, gene) pair under each kind of rule.
# In all three cases the magnitude is the distance between the expression and
# the relevant threshold; a satisfied rule adds to the score, a violated rule
# subtracts from it.
#   no expectation: satisfied when |x| <= std. The earlier version added the
#     negative value |x| - std here, which penalised a correct call.
neutral_dist = np.abs(np.abs(exp_mat) - std_per_gene)
neutral_term = np.where(np.abs(exp_mat) <= std_per_gene, neutral_dist, -neutral_dist)
#   pos: satisfied when x > std
pos_dist = np.abs(exp_mat - std_per_gene)
pos_term = np.where(exp_mat > std_per_gene, pos_dist ** factor, -(pos_dist ** (factor / 3)))
#   neg: satisfied when x < -std. The distance is measured to -std, mirroring
#     the pos case; the earlier |x| + std was not the distance to the threshold.
neg_dist = np.abs(exp_mat + std_per_gene)
neg_term = np.where(exp_mat < -std_per_gene, neg_dist ** factor, -(neg_dist ** (factor / 3)))

# Float score of every cell for every phenotype (cells x phenotypes). Scores
# stay in a float array; storing them in an integer-initialised frame could
# rank phenotypes on truncated values.
scores = np.column_stack([
    neutral_term[:, codes == 0].sum(axis=1)
    + pos_term[:, codes == 1].sum(axis=1)
    + neg_term[:, codes == -1].sum(axis=1)
    for codes in rule_codes
])

# best-scoring phenotype per cell (the first one in groups_tot wins a tie)
cell_phenotypes = [str(groups_tot[best]) for best in scores.argmax(axis=1)]
print(cell_phenotypes)

In [None]:
# both plots share everything except the label column they colour by
plot_options = dict(show_cluster_labels=False, size=12, figsize=(8, 8))

# reference: the phenotype calls already stored on the square
st.pl.cluster_plot(temp_scaled, use_label="phenotype", **plot_options)

# attach the custom calls (one per cell, in row order) as a categorical column
cell_phenotypes = pd.Series(cell_phenotypes)
custom_labels = cell_phenotypes.to_numpy()
temp_scaled.obs['custom_pheno'] = custom_labels
temp_scaled.obs['custom_pheno'] = temp_scaled.obs['custom_pheno'].astype('category')

# same square coloured by the custom calls
st.pl.cluster_plot(temp_scaled, use_label="custom_pheno", **plot_options)

In [None]:
# adjusted Rand index between the stored phenotype calls and the custom calls
from sklearn.metrics.cluster import adjusted_rand_score

reference_labels = temp_scaled.obs['phenotype']
custom_labels = temp_scaled.obs['custom_pheno']
print(adjusted_rand_score(reference_labels, custom_labels))
# original author's note: "oof" -- presumably the agreement was poor

In [None]:
cell_phenotypes = []
i = 0  # row of df_cell_genes (i.e. which cell) to score

# Vote-style score of cell `i` for every phenotype, based on the ternary calls
# in df_cell_genes: a matched pos/neg rule is worth +3, a matched "no
# expectation" (NaN) rule +1, and any mismatch -1.
single_use_phenotypes = pd.DataFrame(0, index=groups_tot, columns=['score'])
for group in groups_tot:
    tally = 0
    for gene in genes_tot:
        rule = phenoDF.at[group, gene]

        # what the ternary call must be for this rule to count as matched,
        # and how much a match is worth
        if (type(rule) == np.float64 or type(rule) == float) and pd.isna(rule):
            expected_call, reward = 0, 1
        elif isinstance(rule, str) and 'pos' in rule:
            expected_call, reward = 1, 3
        elif isinstance(rule, str) and 'neg' in rule:
            expected_call, reward = -1, 3
        else:
            # any other workflow entry does not affect the score
            continue

        if df_cell_genes.at[i, gene] == expected_call:
            tally = tally + reward
        else:
            tally = tally - 1
    single_use_phenotypes.at[group, 'score'] = tally

# phenotypes ranked from best to worst score for this cell
print(single_use_phenotypes.sort_values(ascending=False, by='score'))