# Rank Boosted Classifier on Pancreas Dataset

In [22]:
import numpy as np
import pandas as  pd
import scanpy as sc
from sklearn.linear_model import LinearRegression


In [23]:
# dropout induction
def fit_regression(E, D):
        clf = LinearRegression()
        if (np.sum(np.isnan(E)) > 1):
                print("Error")
                return 1, 0
        if (np.sum(np.isnan(D)) > 1):
                print("Error2")
                return 1, 0
        clf.fit(np.reshape(E, (E.shape[0], 1)), D)
        return (clf.coef_, clf.intercept_)

def calc_dropout(df):
        pg = []
        for col in df:
                pg.append(np.sum(df[col] == 0)/len(df))
        pg = np.array(pg)
        return pg

def get_prob_of_less_than(val, k):
        if np.random.random() < k:
                return 0
        else:
                return val
        
def inductive_dropout(df, f=0.5):
        p_g = calc_dropout(df)
        p_g[p_g <= 0] = 1e-6
        p_g[p_g >= 1] = 1 - 1e-6
        D_g = np.log(p_g/(1-p_g))
        R_g = np.array([df[col].mean() for col in list(df.columns)])
        E_g = np.log2(R_g + 1)
        beta, alpha = fit_regression(E_g, D_g)

        E_g_prime = f * E_g
        delta_g = beta*(E_g_prime - E_g)
        D_g_prime = D_g + delta_g

        p_g_prime = 1/ (1 + np.exp(-1*D_g_prime))
        R_g_prime = np.power(2, E_g_prime) - 1

        for i in range(len(df.columns)):
                df[df.columns[i]] = df[df.columns[i]].apply(lambda x : get_prob_of_less_than(x, p_g_prime[i]-p_g[i]))
                R_g_i = np.mean(df[df.columns[i]])
#               df[df.columns[i]] += R_g_i - R_g_prime[i]
                df[df.columns[i]] *= (R_g_prime[i]/(R_g_i + 1e-4))
        return df  
    
def apply_dropout(data, celltypes, n_iter = 1, f = 0.8):
        """
        INPUT:
                data: dataframe containing cell-gene expression matrix
                n_iter: no of times to apply dropout
                f: dropout concentration
        """
        new_df = pd.DataFrame()
        celltypes = celltypes.tolist()
        new_celltypes = celltypes*(n_iter)
        for i in range(n_iter):
                data = inductive_dropout(data.copy(), f)
                new_df = pd.concat((new_df, data))
        new_df['celltype'] = new_celltypes
        return new_df

# TRAINING DATA

## Read the data

In [24]:
## Read train data
adata_train = sc.read('/content/gdrive/MyDrive/Shared resources/Baron_pancreatic_islet.h5ad')
adata_train

AnnData object with n_obs × n_vars = 8569 × 20125
    obs: 'celltype'

## Preprocess the train data

In [25]:
adata_train.obs_names_make_unique()
sc.pp.filter_cells(adata_train, min_genes=200)
sc.pp.filter_genes(adata_train, min_cells=3)
sc.pp.normalize_total(adata_train, target_sum=1e4)
sc.pp.log1p(adata_train)
#sc.pp.highly_variable_genes(adata_train, n_top_genes = 1000)
adata_train.raw = adata_train
#adata_train = adata_train[:, adata_train.var.highly_variable]
sc.pp.scale(adata_train, max_value=10)
adata_train.shape

(8569, 16359)

## Get DE gene list
- We can do this step before or after dropout induction
- We will select both upregulated and downregulated genes

In [26]:
sc.tl.rank_genes_groups(adata_train, groupby="celltype", method='wilcoxon', use_raw = True)

- Note that we have to take the matrix sorted by logfoldchange. Not score.

In [27]:
adata_train

AnnData object with n_obs × n_vars = 8569 × 16359
    obs: 'celltype', 'n_genes'
    var: 'n_cells', 'mean', 'std'
    uns: 'log1p', 'rank_genes_groups'

In [28]:
result = adata_train.uns['rank_genes_groups']
groups = result['names'].dtype.names

- Need to discuss if taking score makes more sense than log-fold-change.
- The scores are calculated using a statistical test, which compares the expression of each gene in the target group of cells (e.g., a specific cluster) to the expression in all other cells.

In [29]:
detrain_dict = {}
for group in groups:
    print(group)
    gene_rank_df = sc.get.rank_genes_groups_df(adata_train, group=group, pval_cutoff=0.05)
    gene_rank_df.sort_values(by=['logfoldchanges'], inplace=True, ascending=False)
    if len(gene_rank_df) < 50:
        lfc_genes_df = gene_rank_df
    if len(gene_rank_df) >= 50:
        upregulated_genes = gene_rank_df.head(50)
        dnregulated_genes = gene_rank_df.tail(50)
        lfc_genes_df = pd.concat([upregulated_genes, dnregulated_genes], axis=0)
    detrain_dict[group] = dict(zip(lfc_genes_df['names'], lfc_genes_df['logfoldchanges']))

# Take all the DE genes to create subset of genes in the main matrix 
tot_gene_list = list(set([key for subdict in detrain_dict.values() for key in subdict.keys()]))

acinar
activated_stellate
alpha
beta
delta
ductal
endothelial
epsilon
gamma
macrophage
mast
quiescent_stellate
schwann
t_cell


In [30]:
# select only the subset of columns in the obs dataframe
adata_sub = adata_train[:,tot_gene_list]
adata_sub

View of AnnData object with n_obs × n_vars = 8569 × 983
    obs: 'celltype', 'n_genes'
    var: 'n_cells', 'mean', 'std'
    uns: 'log1p', 'rank_genes_groups'

In [31]:
# For each of the groups multiply the DE genes with the logFC with the expression
sub_list = []
for group in groups:
    print(group)
    gdata = adata_sub[adata_sub.obs['celltype'] == group, :].to_df()
    for gene, factor in detrain_dict[group].items():
        gdata[gene] = gdata[gene]* abs(factor)
    gdata = gdata.assign(celltype=group)
    sub_list.append(gdata)

transformed_count  = pd.concat(sub_list, axis=0)

acinar
activated_stellate
alpha
beta
delta
ductal
endothelial
epsilon
gamma
macrophage
mast
quiescent_stellate
schwann
t_cell


In [32]:
transformed_count 

index,PPP1R1B,EMCN,GRAMD1B,DPP6,ITGA5,KLHDC8A,EHD2,PODXL2,TMEM45B,SERPINA5,...,LST1,FLNB,TRPM5,HADH,SYNDIG1,CHN2,MYC,PKIG,C1QC,celltype
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
human1_lib1.final_cell_0001,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-5.745687,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,0.165443,-0.130478,0.192637,-5.350889,-0.132739,-0.235670,0.435789,-0.07305,acinar
human1_lib1.final_cell_0002,-0.135802,-0.092737,-0.204484,-0.647779,0.116690,-5.745687,-0.308726,-0.395415,-0.341975,0.697971,...,-0.054845,0.532015,-0.130478,-0.374488,-5.350889,1.678999,-0.235670,-0.589278,-0.07305,acinar
human1_lib1.final_cell_0003,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-5.745687,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,-0.309661,-0.130478,0.371409,-5.350889,-0.132739,2.421814,-0.589278,-0.07305,acinar
human1_lib1.final_cell_0004,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-5.745687,-0.308726,-0.395415,-0.341975,1.744958,...,-0.054845,-0.884024,-0.130478,-0.693480,-5.350889,-0.132739,-0.235670,0.553566,-0.07305,acinar
human1_lib1.final_cell_0005,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-5.745687,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,-0.255182,-0.130478,-0.693480,-5.350889,-0.132739,1.496242,-0.589278,-0.07305,acinar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
human2_lib2.final_cell_0582,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-0.219952,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,-0.884024,-0.130478,-0.693480,-0.206240,-0.132739,-0.235670,-0.589278,-0.07305,t_cell
human2_lib2.final_cell_0590,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-0.219952,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,-0.884024,-0.130478,-0.693480,-0.206240,-0.132739,-0.235670,-0.589278,-0.07305,t_cell
human3_lib3.final_cell_0866,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-0.219952,-0.308726,-0.395415,-0.341975,-0.271569,...,10.000000,-0.884024,-0.130478,-0.693480,-0.206240,-0.132739,-0.235670,2.390225,-0.07305,t_cell
human3_lib3.final_cell_0896,-0.135802,-0.092737,-0.204484,-0.647779,-0.355001,-0.219952,-0.308726,-0.395415,-0.341975,-0.271569,...,-0.054845,-0.884024,-0.130478,-0.693480,-0.206240,-0.132739,-0.235670,-0.589278,-0.07305,t_cell


In [33]:
celltype = transformed_count[["celltype"]]
celltype

index,celltype
index,Unnamed: 1_level_1
human1_lib1.final_cell_0001,acinar
human1_lib1.final_cell_0002,acinar
human1_lib1.final_cell_0003,acinar
human1_lib1.final_cell_0004,acinar
human1_lib1.final_cell_0005,acinar
...,...
human2_lib2.final_cell_0582,t_cell
human2_lib2.final_cell_0590,t_cell
human3_lib3.final_cell_0866,t_cell
human3_lib3.final_cell_0896,t_cell


In [34]:
# rank the values in each row
df_ranked = transformed_count.drop("celltype", axis=1).rank(axis=1, method='min', ascending=False)
df_ranked

index,PPP1R1B,EMCN,GRAMD1B,DPP6,ITGA5,KLHDC8A,EHD2,PODXL2,TMEM45B,SERPINA5,...,PFN2,LST1,FLNB,TRPM5,HADH,SYNDIG1,CHN2,MYC,PKIG,C1QC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
human1_lib1.final_cell_0001,407.0,291.0,575.0,873.0,743.0,963.0,699.0,774.0,730.0,664.0,...,900.0,195.0,153.0,392.0,148.0,949.0,395.0,628.0,131.0,241.0
human1_lib1.final_cell_0002,439.0,324.0,610.0,880.0,173.0,963.0,722.0,788.0,756.0,113.0,...,557.0,226.0,128.0,424.0,780.0,949.0,61.0,657.0,861.0,273.0
human1_lib1.final_cell_0003,353.0,239.0,519.0,848.0,700.0,963.0,649.0,733.0,686.0,610.0,...,602.0,141.0,651.0,337.0,96.0,949.0,340.0,43.0,827.0,189.0
human1_lib1.final_cell_0004,411.0,297.0,579.0,856.0,738.0,963.0,693.0,770.0,726.0,60.0,...,892.0,203.0,897.0,395.0,864.0,949.0,398.0,631.0,124.0,249.0
human1_lib1.final_cell_0005,388.0,273.0,558.0,859.0,725.0,963.0,678.0,753.0,711.0,642.0,...,891.0,177.0,631.0,372.0,867.0,949.0,375.0,54.0,837.0,223.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
human2_lib2.final_cell_0582,288.0,174.0,459.0,882.0,694.0,507.0,631.0,735.0,675.0,583.0,...,932.0,79.0,943.0,273.0,893.0,471.0,276.0,534.0,861.0,123.0
human2_lib2.final_cell_0590,282.0,168.0,450.0,878.0,689.0,499.0,624.0,730.0,670.0,575.0,...,926.0,76.0,940.0,268.0,888.0,462.0,271.0,527.0,859.0,119.0
human3_lib3.final_cell_0866,282.0,170.0,451.0,872.0,686.0,498.0,623.0,728.0,667.0,575.0,...,922.0,3.0,936.0,267.0,883.0,463.0,270.0,526.0,32.0,119.0
human3_lib3.final_cell_0896,287.0,170.0,455.0,876.0,691.0,504.0,626.0,732.0,672.0,578.0,...,925.0,78.0,938.0,271.0,887.0,467.0,274.0,530.0,854.0,122.0


In [35]:
df_ranked.min(axis=0)

index
PPP1R1B     1.0
EMCN        1.0
GRAMD1B     2.0
DPP6       13.0
ITGA5       4.0
           ... 
SYNDIG1     1.0
CHN2        1.0
MYC         3.0
PKIG        7.0
C1QC        1.0
Length: 983, dtype: float64

In [36]:
adata = sc.AnnData(df_ranked)
adata.obs["celltype"] = celltype

In [37]:
adata.obs

Unnamed: 0_level_0,celltype
index,Unnamed: 1_level_1
human1_lib1.final_cell_0001,acinar
human1_lib1.final_cell_0002,acinar
human1_lib1.final_cell_0003,acinar
human1_lib1.final_cell_0004,acinar
human1_lib1.final_cell_0005,acinar
...,...
human2_lib2.final_cell_0582,t_cell
human2_lib2.final_cell_0590,t_cell
human3_lib3.final_cell_0866,t_cell
human3_lib3.final_cell_0896,t_cell
