# Purpose: keep only protein-coding (PC) genes in the expression datasets and save the results to CSV.

In [19]:
import pandas as pd
import numpy as np
import anndata


In [20]:
def filter_for_pc_genes(expression, pc_genes):
    """Return the columns of ``expression`` whose labels appear in ``pc_genes``.

    Parameters
    ----------
    expression : pandas.DataFrame
        Table to subset by column label.
    pc_genes : list-like, set, or scalar
        Column labels to keep. A single scalar label (e.g. one string) is
        treated as a one-element collection.

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``expression``, in their original order.
        Labels in ``pc_genes`` that are not columns are ignored.
    """
    if not pd.api.types.is_list_like(pc_genes):
        pc_genes = [pc_genes]
    # Index.isin accepts sets as well as lists/arrays. np.isin would wrap a
    # set in a 0-d object array and silently match no column at all.
    keep = expression.columns.isin(pc_genes)
    return expression.iloc[:, keep]

# Load DataFrame of PC genes (made from identify_pc_genes.ipynb)

In [21]:
# UniProt export, stored as a gzipped tab-separated file.
uniprot_export_path = '../../data/pc_genes/uniprot-compressed_true_download_true_fields_accession_2Creviewed_2C-2023.01.13-21.17.12.15.tsv.gz'
pc_genes = pd.read_csv(uniprot_export_path, sep="\t")

In [22]:
# Mapping table with 'From' / 'To' columns, stored as a gzipped TSV.
uniprot_mapping_path = '../../data/pc_genes/uniprot-compressed_ENSEMBLE.tsv.gz'
pc_genes_map = pd.read_csv(uniprot_mapping_path, sep="\t")

In [23]:
# Restrict the UniProt table to human entries.
is_human = pc_genes["Organism"] == "Homo sapiens (Human)"
pc_genes = pc_genes[is_human]

In [24]:
# "Confident" entries are those whose existence is supported at protein level.
has_protein_evidence = pc_genes["Protein existence"] == "Evidence at protein level"
pc_genes_confident = pc_genes[has_protein_evidence]

In [25]:
# Preview the first rows of the human-only table.
pc_genes.head(n=5)

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence
31,A0A087X1C5,reviewed,CP2D7_HUMAN,Putative cytochrome P450 2D7 (EC 1.14.14.1),CYP2D7,Homo sapiens (Human),515,Uncertain
58,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level
59,A0A0B4J2F2,reviewed,SIK1B_HUMAN,Putative serine/threonine-protein kinase SIK1B...,SIK1B,Homo sapiens (Human),783,Uncertain
75,A0A0C5B5G6,reviewed,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,Homo sapiens (Human),16,Evidence at protein level
147,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level


In [26]:
# Attach the mapping rows to the confident entries; the inner join drops
# entries that have no row in the mapping table.
merged_df = pc_genes_confident.merge(
    pc_genes_map,
    how='inner',
    left_on='Entry',
    right_on='From',
)

In [27]:
# Preview the joined table.
merged_df.head(n=5)

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence,From,To
0,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level,A0A0B4J2F0,ENSG00000225973.4
1,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level,A0A0K2S4Q6,ENSG00000284690.3
2,A0A0U1RRE5,reviewed,NBDY_HUMAN,Negative regulator of P-body association (P-bo...,NBDY LINC01420,Homo sapiens (Human),68,Evidence at protein level,A0A0U1RRE5,ENSG00000204272.13
3,A0A1B0GTW7,reviewed,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,Homo sapiens (Human),788,Evidence at protein level,A0A1B0GTW7,ENSG00000283654.3
4,A0AV02,reviewed,S12A8_HUMAN,Solute carrier family 12 member 8 (Cation-chlo...,SLC12A8 CCC9,Homo sapiens (Human),714,Evidence at protein level,A0AV02,ENSG00000221955.11


In [28]:
# Export the confident UniProt accessions, one per line.
# No `sep` is passed: a single-column export never emits a field delimiter,
# and passing it positionally is deprecated in recent pandas (arguments after
# the path are becoming keyword-only).
pc_genes_confident['Entry'].to_csv(
    '../../data/pc_genes/highConfPCUniprots.txt',
    index=False,
)

# Saves highConfPCUniprots.txt
# This file needs to be pasted into https://www.uniprot.org/id-mapping: input is UniProtKB_AC, output is Ensembl
# All of these mapped

# The Ensembl IDs were mapped to (TODO: this sentence was left unfinished — record the mapping target here)

# Load Raw Expression Datasets

In [29]:
# Both tables use their first CSV column as the row index.
bulk_csv_path = '../../data/bulk/bulk_grouped.csv'
pseudobulk_csv_path = '../../data/pseudobulk/sum_pseudobulk.csv'

bulk_expression = pd.read_csv(bulk_csv_path, index_col=0)
sum_expression = pd.read_csv(pseudobulk_csv_path, index_col=0)


In [None]:
# NOTE(review): absolute, machine-specific path — consider reading it from a
# configurable data directory so the notebook runs elsewhere.
liver_h5ad_path = '/pipeline42/datasets/TabulaSapiens/TS_Liver.h5ad'
adata = anndata.read_h5ad(liver_h5ad_path)

In [None]:
# Join the AnnData variable annotations to the UniProt mapping on the
# Ensembl ID columns.
# NOTE(review): pc_merged is not referenced by any later cell visible here.
pc_merged = adata.var.merge(
    merged_df,
    left_on="ensemblid",
    right_on='To',
)

In [None]:
# Drop the version suffix from the Ensembl gene IDs
# (e.g. "ENSG00000225973.4" -> "ENSG00000225973").
# Splitting on the first "." leaves IDs without a version untouched, so this
# cell is safe to re-run, and it yields a plain Series for the column.
merged_df['To'] = merged_df['To'].str.split('.', n=1).str[0]


In [None]:
# Text before the first "." of each 'ensemblid', as a one-column DataFrame.
# The pattern is a raw string so that "\." is not an invalid string escape.
# NOTE(review): PC is not referenced by any later cell visible here, and the
# same expression is assigned to adata_ensembleids in the following cell.
PC = adata.var['ensemblid'].str.extract(r'(.+?)\.', expand=True)

In [None]:
# Ensembl ID of every variable in `adata` with any version suffix removed,
# as a 1-D Series aligned with adata.var.
# Splitting on the first "." keeps IDs that carry no version (a regex extract
# requiring a "." would turn them into NaN) and avoids building a one-column
# DataFrame, so a membership test on the result gives a flat boolean mask.
adata_ensembleids = adata.var['ensemblid'].str.split('.', n=1).str[0]

In [None]:
# Keep only the variables whose Ensembl ID occurs in the mapped 'To' column.
in_mapping = np.isin(adata_ensembleids, merged_df['To'])
adata2 = adata[:, in_mapping]

In [None]:
# Show the labels of the variables that survived the filter.
adata2.var_names.to_numpy()

array(['OR4F29', 'OR4F16', 'SAMD11', ..., 'MT-ND5', 'MT-ND6', 'MT-CYB'],
      dtype=object)

In [None]:
# Write the bulk column labels to a plain text file, one per line,
# without header or index.
bulk_gene_labels = bulk_expression.columns.to_series()
bulk_gene_labels.to_csv('bulk_genes.txt', header=False, index=False)
# Paste the exported labels into https://biit.cs.ut.ee/gprofiler/convert

In [None]:
# Table holding the converted identifiers for the bulk gene labels.
bulk_genes_path = '../../data/pc_genes/genes_in_bulk.csv'
bulk_genes = pd.read_csv(bulk_genes_path)

In [None]:
# Preview the conversion table.
bulk_genes.head(n=5)

Unnamed: 0,initial_alias,converted_alias,name,description,namespace
0,DDX11L1,ENSG00000223972,DDX11L1,DEAD/H-box helicase 11 like 1 (pseudogene) [So...,"ENTREZGENE,HGNC,WIKIGENE"
1,DDX11L1,ENSG00000290825,DDX11L2,DEAD/H-box helicase 11 like 2 (pseudogene) [So...,"ENTREZGENE,HGNC,WIKIGENE"
2,WASH7P,ENSG00000227232,WASH7P,"WASP family homolog 7, pseudogene [Source:HGNC...",HGNC
3,MIR6859-1,ENSG00000278267,MIR6859-1,microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5...,"ENTREZGENE,HGNC,WIKIGENE"
4,MIR1302-2HG,ENSG00000243485,MIR1302-2HG,MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG...,HGNC


In [None]:
# Join the UniProt/Ensembl table to the bulk gene conversion table on the
# Ensembl gene ID.
# NOTE(review): the left key was left empty in the draft; 'To' is assumed
# because it is the Ensembl ID column of merged_df (version suffix already
# stripped in an earlier cell) and 'converted_alias' holds unversioned
# Ensembl IDs — confirm this is the intended key.
bulk_pc = pd.merge(
    merged_df,
    bulk_genes,
    left_on='To',
    right_on='converted_alias',
)

# Filter expression datasets for PC Genes


In [None]:
# Subset the pseudobulk table to the columns named after the variables
# retained in adata2.
pc_gene_labels = adata2.var.index.to_numpy()
sum_pc_expression = filter_for_pc_genes(sum_expression, pc_genes=pc_gene_labels)

In [None]:
# Subset the bulk table to the columns named after the variables retained
# in adata2.
pc_gene_labels = adata2.var.index.to_numpy()
bulk_pc_expression = filter_for_pc_genes(bulk_expression, pc_genes=pc_gene_labels)

Load bulk gte

# Save

### data/bulk/bulk_pc.csv
### data/pseudobulk/sum_pseudobulk_pc.csv

In [None]:
# Persist both filtered tables as CSV (row index included).
bulk_pc_expression.to_csv(path_or_buf='../../data/bulk/bulk_pc.csv')
sum_pc_expression.to_csv(path_or_buf='../../data/pseudobulk/sum_pseudobulk_pc.csv')