In [1]:
from pyspark.sql.functions import (
    col, udf, struct, lit, split, regexp_replace, create_map, min as spark_min, max as spark_max,
    count, sum as spar_sum
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from itertools import chain


# for heatmap:
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

# Start (or reuse) a local Spark session using every available core.
# Memory options are collected in one place so they are easy to tune.
spark_options = [
    ('spark.driver.memory', '15g'),
    ('spark.executor.memory', '15g'),
    ('spark.driver.maxResultSize', '0'),
]
sparkConf = SparkConf().setAll(spark_options)

session_builder = SparkSession.builder.config(conf=sparkConf).master('local[*]')
spark = session_builder.getOrCreate()

In [14]:
# Identifiers of the two targets inspected in this notebook.
ids = ['ENSG00000176890', 'ENSG00000164458']
target_data = '/Users/dsuveges/project_data/target_index/21.11.parquet/'

# Symbol / name / synonym columns kept for the comparison.
label_columns = [
    'id', 'approvedSymbol', 'alternativeGenes', 'approvedName',
    'synonyms', 'symbolSynonyms', 'nameSynonyms', 'obsoleteSymbols', 'obsoleteNames',
]

# Read the target index, keep only the selected ids, and cache the small result
# so later cells can reuse it.
target_index = spark.read.parquet(target_data)
gene_pair = (
    target_index
    .where(col('id').isin(ids))
    .select(*label_columns)
    .persist()
)

In [15]:
# Print the approved symbol next to the obsolete symbols/names of both records,
# one field per line and without truncating long values.
obsolete_labels = gene_pair.select('approvedSymbol', 'obsoleteSymbols', 'obsoleteNames')
obsolete_labels.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------
 approvedSymbol  | TYMS                                                                             
 obsoleteSymbols | [{TS, HGNC}]                                                                     
 obsoleteNames   | []                                                                               
-RECORD 1-------------------------------------------------------------------------------------------
 approvedSymbol  | TBXT                                                                             
 obsoleteSymbols | [{T, HGNC}]                                                                      
 obsoleteNames   | [{T, brachyury homolog (mouse), HGNC}, {T brachyury transcription factor, HGNC}] 



## Conclusion:

The label `TS`, identified as a target entity, was grounded to the `TBXT` gene because that gene has an obsolete symbol `T`. If the label is assumed to be a name, it gets stemmed (e.g. the plural suffix is removed), so `TS` becomes `T`, and the grounding then matches `TBXT`.

