# Filter for canonical genes


To get a more reasonable view on the gene synonyms and labels, using the gene index labels filtered for genes that localized on canonical chromosomes.



In [3]:
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import json


global spark
spark = (pyspark.sql.SparkSession
    .builder
    .config("spark.driver.bindAddress", "localhost")
    .getOrCreate()
)
print(f'Spark version: {spark.version}')

Spark version: 3.0.0


In [74]:
target_index_file = '/Users/dsuveges/project_data/ot/target_index/targets'

targets = (
    spark.read.parquet(target_index_file)
    .select(
        col('id').alias('Ensembl'),
        col('bioType'),
        col('genomicLocation.chromosome').alias('chrom'),
        col('proteinAnnotations.id'))
    .toPandas()
    .drop_duplicates()
)


merged = (
    pd.read_csv(labels_file, sep='\t')
    .merge(targets, on='Ensembl', how='outer', indicator=True)
)

merged.head()

Unnamed: 0,Ensembl,alias,alias_type,bioType,chrom,id,_merge
0,ENSG00000121410,OTTHUMG00000183507,vega_id,protein_coding,19,P04217,both
1,ENSG00000121410,NM_130786,refseq_accession,protein_coding,19,P04217,both
2,ENSG00000121410,5,hgnc_id,protein_coding,19,P04217,both
3,ENSG00000121410,69417,rgd_id,protein_coding,19,P04217,both
4,ENSG00000121410,138670,omim_id,protein_coding,19,P04217,both


In [66]:
print(f'Number of genes in the gene index: {len(targets.Ensembl.unique())}')
print(f'Number of genes labels file: {len(merged.loc[merged.alias.notna()].Ensembl.unique())}')
print(f'Number of genes in the gene index: {}')
# print(f'Number of genes in the gene index: {}')
# print(f'Number of genes in the gene index: {}')

Number of genes in the gene index: 60616
Number of genes labels file: 41873


In [78]:
(
    merged
    .bioType
    .unique()
    .apply(lambda x: print(f'\n{x}\n{merged.loc[merged.bioType == x]._merge.value_counts()}'))
)

AttributeError: 'numpy.ndarray' object has no attribute 'apply'

In [84]:
(
    pd.Series(
        merged
        .bioType
        .unique()
    )
    .apply(lambda x: print(f'\n{x}\n{merged.loc[merged.bioType == x,["Ensembl","_merge"]].drop_duplicates()._merge.value_counts()}'))
)


protein_coding
both          19439
right_only      505
left_only         0
Name: _merge, dtype: int64

lncRNA
right_only    11163
both           5726
left_only         0
Name: _merge, dtype: int64

transcribed_unprocessed_pseudogene
both          725
right_only    213
left_only       0
Name: _merge, dtype: int64

processed_pseudogene
both          7187
right_only    2973
left_only        0
Name: _merge, dtype: int64

transcribed_processed_pseudogene
both          341
right_only    159
left_only       0
Name: _merge, dtype: int64

transcribed_unitary_pseudogene
both          120
right_only     18
left_only       0
Name: _merge, dtype: int64

unprocessed_pseudogene
both          1686
right_only     926
left_only        0
Name: _merge, dtype: int64

unitary_pseudogene
both          79
right_only    19
left_only      0
Name: _merge, dtype: int64

nan
both          0
right_only    0
left_only     0
Name: _merge, dtype: int64

polymorphic_pseudogene
both          48
right_only     0
left_on

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
dtype: object

In [61]:
targets_pooled = (
    targets
    .groupby(['Ensembl'])
    .agg(uniprotIds=('id', lambda x: x.tolist()),
        chroms=('chrom', lambda x: x.tolist()))
    
)

In [106]:
# targets_pooled.loc[targets_pooled.chroms.apply(lambda x: len(x) > 1)]
merged.loc[merged._merge == 'left_only']


Unnamed: 0,Ensembl,alias,alias_type,bioType,chrom,id,_merge
13701,ENSG00000279355,OTTHUMG00000152859,vega_id,,,,left_only
13702,ENSG00000279355,NR_024277,refseq_accession,,,,left_only
13703,ENSG00000279355,20988,hgnc_id,,,,left_only
13704,ENSG00000279355,AGPAT4-IT1,symbol,,,,left_only
13705,ENSG00000279355,AGPAT4 intronic transcript 1,name,,,,left_only
...,...,...,...,...,...,...,...
749531,ENSG00000276345,LOC107987373,symbol,,,,left_only
749532,ENSG00000276345,"39S ribosomal protein L23, mitochondrial",description,,,,left_only
749533,ENSG00000276345,-,symbol_from_nomenclature_authority,,,,left_only
749534,ENSG00000276345,-,full_name_from_nomenclature_authority,,,,left_only


In [139]:
x = (
    merged[['Ensembl', '_merge', 'bioType']]
    .drop_duplicates()
    .groupby(['bioType','_merge'])
    .agg(count=("Ensembl","count"))
    .unstack()
    .rename(columns=str)
    .assign(recovery = lambda x: x['count']['both'] / (x['count']['both'] + x['count']['right_only']))
)

x

Unnamed: 0_level_0,count,count,count,recovery
_merge,left_only,right_only,both,Unnamed: 4_level_1
bioType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
IG_C_gene,,,14.0,
IG_C_pseudogene,,,9.0,
IG_D_gene,,,37.0,
IG_J_gene,,,18.0,
IG_J_pseudogene,,,3.0,
IG_V_gene,,2.0,143.0,0.986207
IG_V_pseudogene,,3.0,184.0,0.983957
IG_pseudogene,,1.0,,
Mt_rRNA,,,2.0,
Mt_tRNA,,,22.0,


In [136]:
x.rename(columns=str).reset_index()

Unnamed: 0_level_0,bioType,count,count,count
_merge,Unnamed: 1_level_1,left_only,right_only,both
0,IG_C_gene,,,14.0
1,IG_C_pseudogene,,,9.0
2,IG_D_gene,,,37.0
3,IG_J_gene,,,18.0
4,IG_J_pseudogene,,,3.0
5,IG_V_gene,,2.0,143.0
6,IG_V_pseudogene,,3.0,184.0
7,IG_pseudogene,,1.0,
8,Mt_rRNA,,,2.0
9,Mt_tRNA,,,22.0


In [129]:
x.iloc[1]['count']['both']

9.0