In [15]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pyspark.sql.functions as pf

# establish spark connection
sparkConf = (
    SparkConf()
    .set('spark.driver.memory', '15g')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.host', 'localhost')
    .set('spark.driver.maxResultSize', '0')
    .set('spark.debug.maxToStringFields', '2000')
    .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
)
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
EXCLUDED_TARGET_TERMS = ['TEC', 'TECS', 'Tec', 'tec', '\'', '(', ')', '-', '-S', 'S', 'S-', 'SS', 'SSS',
    'Ss', 'Ss-', 's', 's-', 'ss', 'U3', 'U6', 'u6', 'SNORA70', 'U2', 'U8']


cooccurrenceFile = '/Users/dsuveges/project_data/epmc_evidence/cooccurrences'

filtered_cooccurrence_df = (
    # Reading file:
    spark.read.parquet(cooccurrenceFile)

    # Casting integer pmid column to string:
    .withColumn("pmid", pf.col('pmid').cast(StringType()))

    # publication identifier is a pmid if available otherwise pmcid
    .withColumn(
        'publicationIdentifier',
        pf.when(pf.col('pmid').isNull(), pf.col('pmcid'))
        .otherwise(pf.col('pmid'))
    )

    # Filtering for disease/target cooccurrences:
    .filter(
        (pf.col('type') == 'GP-DS') &  # Filter gene/protein - disease cooccurrence
        (pf.col('isMapped')) &  # Filtering for mapped cooccurrences
        (pf.length(pf.col('text')) < 600) &  # Exclude sentences with more than 600 characters
        (pf.col('label1').isin(EXCLUDED_TARGET_TERMS) == False)  # Excluding target labels from the exclusion list
    )

    # Renaming columns:
    .withColumnRenamed('keywordId1', 'targetFromSourceId')
    .withColumnRenamed('keywordId2', 'diseaseFromSourceMappedId')

    .withColumn('pmid_str_tmp', pf.col('pmid'))
)


filtered_cooccurrence_df.count()


30111325

In [6]:
filtered_cooccurrence_df.filter(pf.col('pmcid').isNull()).count()

0

In [11]:
filtered_cooccurrence_df.select('publicationIdentifier').show()

+---------------------+
|publicationIdentifier|
+---------------------+
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
|             29147441|
+---------------------+
only showing top 20 rows



In [16]:
aggregated_df = (
    filtered_cooccurrence_df

    # Aggregating data by publication, target and disease:
    .groupBy(['publicationIdentifier', 'targetFromSourceId', 'diseaseFromSourceMappedId'])
    .agg(
        pf.first(pf.col('pmcid').alias('pmcid')),
        pf.collect_set(pf.col('pmid_str_tmp')).alias('literature'),
        pf.collect_set(
            pf.struct(
                pf.col('text'),
                pf.col('start1').alias('tStart'),
                pf.col('end1').alias('tEnd'),
                pf.col('start2').alias('dStart'),
                pf.col('end2').alias('dEnd'),
                pf.col('section')
            )
        ).alias('textMiningSentences'),
        pf.sum(pf.col('evidence_score')).alias('resourceScore')
    )
    
    .drop('publicationIdentifier')

    # Only evidence with score above 1 is considered:
    .filter(pf.col('resourceScore') > 1)
)

In [17]:
(
    aggregated_df

    # Adding literal columns:
    .withColumn('datasourceId', pf.lit('europepmc'))
    .withColumn('datatypeId', pf.lit('literature'))

    # Reorder columns:
    .select(['datasourceId', 'datatypeId', 'targetFromSourceId', 'diseaseFromSourceMappedId','resourceScore',
             'literature', 'textMiningSentences', 'pmcId'])

    # Save output:
    .write.format('json').mode('overwrite').option('compression', 'gzip').save('new_epmc_evidence')
)

AnalysisException: cannot resolve '`pmcId`' given input columns: [datasourceId, datatypeId, diseaseFromSourceMappedId, first(pmcid AS `pmcid`, false), literature, resourceScore, targetFromSourceId, textMiningSentences];;
'Project [datasourceId#2096, datatypeId#2104, targetFromSourceId#1929, diseaseFromSourceMappedId#1956, resourceScore#2081, literature#2073, textMiningSentences#2079, 'pmcId]
+- Project [targetFromSourceId#1929, diseaseFromSourceMappedId#1956, first(pmcid AS `pmcid`, false)#2082, literature#2073, textMiningSentences#2079, resourceScore#2081, datasourceId#2096, literature AS datatypeId#2104]
   +- Project [targetFromSourceId#1929, diseaseFromSourceMappedId#1956, first(pmcid AS `pmcid`, false)#2082, literature#2073, textMiningSentences#2079, resourceScore#2081, europepmc AS datasourceId#2096]
      +- Filter (resourceScore#2081 > cast(1 as double))
         +- Project [targetFromSourceId#1929, diseaseFromSourceMappedId#1956, first(pmcid AS `pmcid`, false)#2082, literature#2073, textMiningSentences#2079, resourceScore#2081]
            +- Aggregate [publicationIdentifier#1902, targetFromSourceId#1929, diseaseFromSourceMappedId#1956], [publicationIdentifier#1902, targetFromSourceId#1929, diseaseFromSourceMappedId#1956, first(pmcid#1827, false) AS first(pmcid AS `pmcid`, false)#2082, collect_set(pmid_str_tmp#1983, 0, 0) AS literature#2073, collect_set(struct(text, text#1835, tStart, start1#1845L, tEnd, end1#1837L, dStart, start2#1846L, dEnd, end2#1838L, section, section#1834), 0, 0) AS textMiningSentences#2079, sum(evidence_score#1839) AS resourceScore#2081]
               +- Project [pmid#1876, pmcid#1827, pubDate#1828, date#1829, year#1830, month#1831, day#1832, organisms#1833, section#1834, text#1835, association#1836, end1#1837L, end2#1838L, evidence_score#1839, label1#1840, targetFromSourceId#1929, label2#1842, diseaseFromSourceMappedId#1956, relation#1844, start1#1845L, start2#1846L, type#1847, type1#1848, type2#1849, ... 3 more fields]
                  +- Project [pmid#1876, pmcid#1827, pubDate#1828, date#1829, year#1830, month#1831, day#1832, organisms#1833, section#1834, text#1835, association#1836, end1#1837L, end2#1838L, evidence_score#1839, label1#1840, targetFromSourceId#1929, label2#1842, keywordId2#1843 AS diseaseFromSourceMappedId#1956, relation#1844, start1#1845L, start2#1846L, type#1847, type1#1848, type2#1849, ... 2 more fields]
                     +- Project [pmid#1876, pmcid#1827, pubDate#1828, date#1829, year#1830, month#1831, day#1832, organisms#1833, section#1834, text#1835, association#1836, end1#1837L, end2#1838L, evidence_score#1839, label1#1840, keywordId1#1841 AS targetFromSourceId#1929, label2#1842, keywordId2#1843, relation#1844, start1#1845L, start2#1846L, type#1847, type1#1848, type2#1849, ... 2 more fields]
                        +- Filter ((((type#1847 = GP-DS) AND isMapped#1850) AND (length(text#1835) < 600)) AND (label1#1840 IN (TEC,TECS,Tec,tec,',(,),-,-S,S,S-,SS,SSS,Ss,Ss-,s,s-,ss,U3,U6,u6,SNORA70,U2,U8) = false))
                           +- Project [pmid#1876, pmcid#1827, pubDate#1828, date#1829, year#1830, month#1831, day#1832, organisms#1833, section#1834, text#1835, association#1836, end1#1837L, end2#1838L, evidence_score#1839, label1#1840, keywordId1#1841, label2#1842, keywordId2#1843, relation#1844, start1#1845L, start2#1846L, type#1847, type1#1848, type2#1849, ... 2 more fields]
                              +- Project [cast(pmid#1826 as string) AS pmid#1876, pmcid#1827, pubDate#1828, date#1829, year#1830, month#1831, day#1832, organisms#1833, section#1834, text#1835, association#1836, end1#1837L, end2#1838L, evidence_score#1839, label1#1840, keywordId1#1841, label2#1842, keywordId2#1843, relation#1844, start1#1845L, start2#1846L, type#1847, type1#1848, type2#1849, isMapped#1850]
                                 +- Relation[pmid#1826,pmcid#1827,pubDate#1828,date#1829,year#1830,month#1831,day#1832,organisms#1833,section#1834,text#1835,association#1836,end1#1837L,end2#1838L,evidence_score#1839,label1#1840,keywordId1#1841,label2#1842,keywordId2#1843,relation#1844,start1#1845L,start2#1846L,type#1847,type1#1848,type2#1849,isMapped#1850] parquet


In [18]:
df

NameError: name 'df' is not defined