In [1]:
sc.version

'3.2.1'

## 1. Imports

In [2]:
from pathlib import Path
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, BooleanType, IntegerType
import pandas as pd
from pyspark.sql.functions import col, regexp_replace, when, coalesce, count

## 2. Define directories

In [3]:
dir_data = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20231205/rawdata')
dir_parquet_sematicscholar = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20231205/parquet')
dir_parquet_researchportal = Path('/export/data_ml4ds/AI4U/Datasets/ResearchPortal/publications.parquet')
dir_parquet_sematicscholar = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20231205/parquet')

## 1. Import tables

### 1.1. Table **`papers`**

In [4]:
%%time
dir_papers = dir_data.joinpath('papers')
df_papers = spark.read.json('file:///' + dir_papers.as_posix())

#We drop corrupt records
df_papers = df_papers.cache()
df_papers = df_papers.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of papers available:', df_papers.count())
df_papers.printSchema()
df_papers.show(n=2, truncate=120, vertical=True)

                                                                                

CPU times: user 66.1 ms, sys: 14.9 ms, total: 81 ms
Wall time: 4min 16s


In [5]:
%%time

# This function will be used for extracting only the Semantic Scholar FOS in string format
# Semantic Scholar uses several models, but we keep only FOS from s2-fos-model
def extractFOS(x):
    try:
        return [el['category'] for el in x
            if el['source'] == "s2-fos-model"]
    except:
        return None
    
extractFOS_UDF = F.udf(extractFOS, ArrayType(StringType()))

# Adapt columns names and formats for backwards compatibility
dataset = df_papers.select(F.col('corpusid').alias('id'), \
                           'title', \
                           F.col('url').alias('S2Url'), \
                           F.col('year').cast(IntegerType()), \
                           F.col('externalids.DOI').alias('doi'), \
                           F.col('externalids.PubMed').alias('pmid'), \
                           F.col('externalids.MAG').alias('magId'), \
                           'externalids', \
                           extractFOS_UDF(F.col('s2fieldsofstudy')).alias("fieldsOfStudy"), \
                           'publicationtypes', \
                           'publicationdate', \
                           F.col('journal.name').alias('journalName'), \
                           F.col('journal.pages').alias('journalPages'), \
                           F.col('journal.volume').alias('journalVolume'), \
                           'venue', \
                           'publicationvenueid', \
                           'isopenaccess', \
                           F.col('referencecount').cast(IntegerType()), \
                           F.col('citationcount').cast(IntegerType()), \
                           F.col('influentialcitationcount').cast(IntegerType()) \
                          )
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True)

CPU times: user 10.1 ms, sys: 359 µs, total: 10.5 ms
Wall time: 69.5 ms


In [6]:
%%time

dir_abstracts = dir_data.joinpath('abstracts')
df_abstracts = spark.read.json('file:///' + dir_abstracts.as_posix())

#We drop corrupt records
df_abstracts = df_abstracts.cache()
df_abstracts = df_abstracts.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of abstracts available:', df_abstracts.count())
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)



CPU times: user 50.7 ms, sys: 23.1 ms, total: 73.8 ms
Wall time: 4min 16s


                                                                                

In [7]:
%%time

# Adapt columns names and formats for backwards compatibility
df_abstracts = df_abstracts.select(F.col('corpusid').alias('id'), \
                           F.col('abstract').alias('paperAbstract'), \
                           'openaccessinfo' \
                          )
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)

CPU times: user 3.02 ms, sys: 0 ns, total: 3.02 ms
Wall time: 13.6 ms


In [8]:
%%time

dataset = (dataset.join(df_abstracts, dataset.id ==  df_abstracts.id, "left")
                      .drop(df_abstracts.id)
                ).cache()

print('Number of documents in dataset:', dataset.count())
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True) 

CPU times: user 3.74 ms, sys: 0 ns, total: 3.74 ms
Wall time: 259 ms


In [None]:
print("Papers with PMID:", dataset.where(F.col("pmid").isNotNull()).count())
print("Papers with DOI:", dataset.where(F.col("doi").isNotNull()).count())
print("Unique DOIs:", dataset.where(F.col("doi").isNotNull()).select('doi').distinct().count())

In [9]:
dataset = dataset.select('id', 'title', 'doi', 'paperAbstract')
dataset.printSchema()

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- paperAbstract: string (nullable = true)



In [12]:
# Print number of publications in total in the Semantic Scholar database
print('Number of documents in Semantic Scholar:', dataset.count())

# Remove publications without Abstarct
dataset_no_na = dataset.filter(~(col("paperAbstract").isNull() | (col("paperAbstract") == '')))
print('Number of documents in Semantic Scholar: with abstract:', dataset_no_na.count())

24/01/24 03:15:45 ERROR scheduler.TaskSchedulerImpl: Lost executor 24 on node33.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:15:45 WARN scheduler.TaskSetManager: Lost task 65.0 in stage 18.0 (TID 671) (node33.cluster.tsc.uc3m.es executor 24): ExecutorLostFailure (executor 24 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:15:45 WARN scheduler.TaskSetManager: Lost task 103.0 in stage 18.0 (TID 724) (node33.cluster.tsc.uc3m.es executor 24): ExecutorLostFailure (executor 24 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:15:45 WARN scheduler.TaskSetManager: Lost task 116

Number of documents in Semantic Scholar: 215205862


24/01/24 03:19:15 ERROR scheduler.TaskSchedulerImpl: Lost executor 29 on node37.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:19:15 WARN scheduler.TaskSetManager: Lost task 15.0 in stage 25.0 (TID 890) (node37.cluster.tsc.uc3m.es executor 29): ExecutorLostFailure (executor 29 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:19:15 WARN scheduler.TaskSetManager: Lost task 12.0 in stage 25.0 (TID 887) (node37.cluster.tsc.uc3m.es executor 29): ExecutorLostFailure (executor 29 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:19:15 WARN scheduler.TaskSetManager: Lost task 14.0

Number of documents in Semantic Scholar: with abstract: 102585428


                                                                                

### 1.2. Table **`publications`**

In [13]:
df_publications = spark.read.parquet(str(Path(dir_parquet_researchportal)))

In [171]:
df_publications.printSchema()

root
 |-- actID: string (nullable = true)
 |-- ActivityType: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Abstract: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- Research_Areas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- DOI: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- ISSN: string (nullable = true)
 |-- EISSN: string (nullable = true)
 |-- ISBN: string (nullable = true)



In [172]:
# Any duplicated?
duplicates_df_publications = df_publications.groupBy("actID").count().filter(col("count") > 1)
print('Number of duplicated publications:', duplicates_df_publications.count())

Number of duplicated publications: 0


## 2. Matching

In [14]:
# Print number of publications in total in the Research Portal database
print('Number of publications in the Research Portal before the matching:', df_publications.count())

# Remove publications which already have Abstarct
df_publications_na = df_publications.filter("Abstract IS NULL OR Abstract = ''")
print('Number of publications in the Research Portal without abstract before the matching:', df_publications_na.count())


                                                                                

Number of publications in the Research Portal before the matching: 40244




Number of publications in the Research Portal without abstract before the matching: 24725


                                                                                

In [15]:
# Make the format of the DOI of publications in research portal match the format of Sematic Scholar
doi_pattern = "https://doi.org/"
# df_publications_na = df_publications_na.withColumn("clear_DOI", regexp_replace(col("DOI"), doi_pattern, ""))
df_publications_na = df_publications_na.withColumn("doi_pub", regexp_replace(col("DOI"), doi_pattern, ""))
df_publications_na = df_publications_na.withColumnRenamed("Title", "title_pub")


In [271]:
df_publications_na.printSchema()

root
 |-- actID: string (nullable = true)
 |-- ActivityType: string (nullable = true)
 |-- title_pub: string (nullable = true)
 |-- Abstract: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- Research_Areas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- DOI: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- ISSN: string (nullable = true)
 |-- EISSN: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- doi_pub: string (nullable = true)



In [17]:
dataset_no_na = dataset_no_na.withColumnRenamed("title", "title_ss")
dataset_no_na = dataset_no_na.withColumnRenamed("doi", "doi_ss")
dataset_no_na.printSchema()


root
 |-- id: long (nullable = true)
 |-- title_ss: string (nullable = true)
 |-- doi_ss: string (nullable = true)
 |-- paperAbstract: string (nullable = true)



In [19]:
# perform a join of both tables in order of getting the number of matched publications
joined_df = df_publications_na.join(
    dataset_no_na,
    (df_publications_na.doi_pub == dataset_no_na.doi_ss) & (df_publications_na.title_pub == dataset_no_na.title_ss),
    "inner"
)
print('Number of matches:', joined_df.count())

24/01/24 03:28:20 ERROR scheduler.TaskSchedulerImpl: Lost executor 37 on node03.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:28:20 WARN scheduler.TaskSetManager: Lost task 26.0 in stage 40.0 (TID 1127) (node03.cluster.tsc.uc3m.es executor 37): ExecutorLostFailure (executor 37 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:28:20 WARN scheduler.TaskSetManager: Lost task 28.0 in stage 40.0 (TID 1129) (node03.cluster.tsc.uc3m.es executor 37): ExecutorLostFailure (executor 37 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:28:20 WARN scheduler.TaskSetManager: Lost task 27

Number of matches: 404


                                                                                

In [275]:
joined_df.printSchema()

root
 |-- actID: string (nullable = true)
 |-- ActivityType: string (nullable = true)
 |-- title_pub: string (nullable = true)
 |-- Abstract: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- Research_Areas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- DOI: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- ISSN: string (nullable = true)
 |-- EISSN: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- doi_pub: string (nullable = true)
 |-- id: long (nullable = true)
 |-- title_ss: string (nullable = true)
 |-- doi_ss: string (nullable = true)
 |-- paperAbstract: string (nullable = true)



In [20]:
joined_df_0 = joined_df.select('actID', 'id', 'title_pub', 'title_ss', 'doi_pub', 'doi_ss', 'paperAbstract')

In [21]:
# Unir las DataFrames en función de la columna actID
merged_df = df_publications.join(joined_df_0, "actID", "left_outer")

# Actualizar el valor de Abstract con el valor de paperAbstract donde esté disponible
result_df = merged_df.withColumn("Abstract", coalesce(col("paperAbstract"), col("Abstract")))

# Seleccionar las columnas necesarias
result_df = result_df.select("actID", "ActivityType", "Title", "Abstract", "Keywords", "Research_Areas",
                             "DOI", "Year", "Publisher", "ISSN", "EISSN", "ISBN")



In [22]:
print('Number of publications in the Research Portal without abstract after the matching:', result_df.filter("Abstract IS NULL OR Abstract = ''").count())


24/01/24 03:33:18 ERROR scheduler.TaskSchedulerImpl: Lost executor 43 on node73.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:33:18 WARN scheduler.TaskSetManager: Lost task 31.0 in stage 49.0 (TID 1393) (node73.cluster.tsc.uc3m.es executor 43): ExecutorLostFailure (executor 43 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:33:18 WARN scheduler.TaskSetManager: Lost task 23.0 in stage 49.0 (TID 1384) (node73.cluster.tsc.uc3m.es executor 43): ExecutorLostFailure (executor 43 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:33:18 WARN scheduler.TaskSetManager: Lost task 21

Number of publications in the Research Portal without abstract after the matching: 24322


                                                                                

In [24]:
# check if there is any null value
null_check = result_df.select([F.count(F.when(F.col(column).isNull() | (F.trim(F.col(column)) == ""), column)).alias(column) for column in ['actID', 'Title', 'Abstract', 'DOI']])
null_check.show()

24/01/24 03:40:45 ERROR scheduler.TaskSchedulerImpl: Lost executor 49 on node13.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:40:45 WARN scheduler.TaskSetManager: Lost task 63.0 in stage 63.0 (TID 1682) (node13.cluster.tsc.uc3m.es executor 49): ExecutorLostFailure (executor 49 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:40:45 WARN scheduler.TaskSetManager: Lost task 57.0 in stage 63.0 (TID 1676) (node13.cluster.tsc.uc3m.es executor 49): ExecutorLostFailure (executor 49 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:40:45 WARN scheduler.TaskSetManager: Lost task 48

+-----+-----+--------+-----+
|actID|Title|Abstract|  DOI|
+-----+-----+--------+-----+
|    0|    0|   24322|23471|
+-----+-----+--------+-----+



                                                                                

In [26]:
# Check if there is any duplicated
dup_check = result_df.groupBy(['actID', 'Title', 'Abstract', 'DOI']).agg(F.count("*").alias("count"))
dup_check = dup_check.filter("count > 1")
dup_check.show()

24/01/24 03:48:23 ERROR scheduler.TaskSchedulerImpl: Lost executor 59 on node20.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:48:23 WARN scheduler.TaskSetManager: Lost task 36.0 in stage 77.0 (TID 1891) (node20.cluster.tsc.uc3m.es executor 59): ExecutorLostFailure (executor 59 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:48:23 WARN scheduler.TaskSetManager: Lost task 33.0 in stage 77.0 (TID 1885) (node20.cluster.tsc.uc3m.es executor 59): ExecutorLostFailure (executor 59 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 03:48:23 WARN scheduler.TaskSetManager: Lost task 43

+---------+--------------------+--------------------+--------------------+-----+
|    actID|               Title|            Abstract|                 DOI|count|
+---------+--------------------+--------------------+--------------------+-----+
|act518994|Circus, Play and ...|Physical training...|https://doi.org/1...|    2|
+---------+--------------------+--------------------+--------------------+-----+



                                                                                

In [27]:
# delete this duplicated instance
result_df_no_duplicates = result_df.dropDuplicates(["actID", "Title", "Abstract", "DOI"])

In [29]:
# Save as a parquet the enriched datasets 
result_df.write.parquet('export/usuarios_ml4ds/mbalairon/publications.parquet')
result_df_no_duplicates.write.parquet('export/usuarios_ml4ds/mbalairon/publications_noDup.parquet')

24/01/24 04:09:17 ERROR scheduler.TaskSchedulerImpl: Lost executor 70 on node34.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 04:09:17 WARN scheduler.TaskSetManager: Lost task 40.0 in stage 106.0 (TID 2389) (node34.cluster.tsc.uc3m.es executor 70): ExecutorLostFailure (executor 70 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 04:09:17 WARN scheduler.TaskSetManager: Lost task 37.0 in stage 106.0 (TID 2387) (node34.cluster.tsc.uc3m.es executor 70): ExecutorLostFailure (executor 70 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/01/24 04:09:17 WARN scheduler.TaskSetManager: Lost task 