In [1]:
sc.version

'3.4.0'

## 1. Imports

In [2]:
#from configparser import ConfigParser
from pathlib import Path
import pyspark.sql.functions as F
#import requests
from pyspark.sql.types import ArrayType, StringType, BooleanType, IntegerType


## 2. Define directories

In [3]:
# Define directories
#
# Relevant directories are read from the config file:
# dir_data:    full path to hdfs directory where the raw data .gz files are stored
# dir_parquet: full path to hdfs directory where the parquet tables will be stored
# version:     Version of Semantic Scholar that is being processed
#              for information purposes only

# cf = ConfigParser()
# cf.read("../config.cf")

# dir_data = Path(cf.get("spark", "dir_data"))
# dir_parquet = Path(cf.get("spark", "dir_parquet"))
# version = cf.get("spark", "version")
# dir_pdfs = Path(cf.get("spark", "dir_pdfs"))

dir_data = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20231205/rawdata')
dir_parquet = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20231205/parquet')

## 3. Configuration hdfs

**Files will be read from and saved to NFS. Skip this section entirely !**

It is not possible to listdir() directly using Path as it is a hdfs

In [None]:
"""# Configuration hdfs
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
hdfs_dir_parquet = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
# Create output directories if they do not exist
# !hadoop dfs ...
# !hadoop dfs -put 20220201 /export/ml4ds/IntelComp/Datalake/SemanticScholar/

if not fs.exists(hdfs_dir_parquet):
    fs.mkdirs(hdfs_dir_parquet)
"""

In [None]:
"""
# Configuration hdfs
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
hdfs_dir_data = spark._jvm.org.apache.hadoop.fs.Path(dir_data.as_posix())

print(hdfs_dir_data)

# Get selected version
releases = sorted(
    [
        f.getPath().getName()
        for f in fs.listStatus(hdfs_dir_data)
        if f.isDirectory() and f.getPath().getName().isdigit()
    ]
)
version = version.replace("-", "")
if version == "last":
    version = releases[-1]
if version not in releases:
    print(f"Version {version} not found")
    print(f"Available versions: {releases}")

hdfs_dir_data_files = spark._jvm.org.apache.hadoop.fs.Path(
    dir_data.joinpath(version).as_posix()
)
hdfs_dir_parquet = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
hdfs_dir_version = spark._jvm.org.apache.hadoop.fs.Path(
    dir_parquet.joinpath(version).as_posix()
)

# Create output directories if they do not exist
# !hadoop dfs ...
# !hadoop dfs -put 20220201 /export/ml4ds/IntelComp/Datalake/SemanticScholar/

if not fs.exists(hdfs_dir_parquet):
    fs.mkdirs(hdfs_dir_parquet)

if not fs.exists(hdfs_dir_version):
    fs.mkdirs(hdfs_dir_version)
"""

## 4. Import tables

### 4.1. Table **`papers`**

In [4]:
%%time
dir_papers = dir_data.joinpath('papers')
df_papers = spark.read.json('file:///' + dir_papers.as_posix())

#We drop corrupt records
df_papers = df_papers.cache()
df_papers = df_papers.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of papers available:', df_papers.count())
df_papers.printSchema()
df_papers.show(n=2, truncate=120, vertical=True)

                                                                                

Number of papers available: 215205862
root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authorId: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- citationcount: long (nullable = true)
 |-- corpusid: long (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- influentialcitationcount: long (nullable = true)
 |-- isopenaccess: boolean (nullable = true)
 |-- journal: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- pages: string (nullable = true)
 |    |-- volume: string (nullable = true)
 |-- publicationdate: string (nullable = true)

<span style="background-color:yellow;">The following code is used to count how many papers of each type are there in the dataset. It is very inefficient probably, but it works nevertheless.</span>

In [5]:
tipos = df_papers.where(F.col("publicationtypes").isNotNull()).select('publicationtypes').collect()
tipos = [el[0] for el in tipos]
tipos = [item for sublist in tipos for item in sublist]

from collections import Counter
counts = Counter(tipos)
print(counts)

                                                                                

Counter({'JournalArticle': 43946390, 'Review': 15134139, 'Conference': 4126089, 'Study': 2335850, 'CaseReport': 2225372, 'LettersAndComments': 1541897, 'Editorial': 688273, 'ClinicalTrial': 581142, 'Book': 335060, 'News': 238313, 'MetaAnalysis': 103411, 'Dataset': 690})


In [6]:
%%time

# This function will be used for extracting only the Semantic Scholar FOS in string format
# Semantic Scholar uses several models, but we keep only FOS from s2-fos-model
def extractFOS(x):
    try:
        return [el['category'] for el in x
            if el['source'] == "s2-fos-model"]
    except:
        return None
    
extractFOS_UDF = F.udf(extractFOS, ArrayType(StringType()))

# Adapt columns names and formats for backwards compatibility
dataset = df_papers.select(F.col('corpusid').alias('id'), \
                           'title', \
                           F.col('url').alias('S2Url'), \
                           F.col('year').cast(IntegerType()), \
                           F.col('externalids.DOI').alias('doi'), \
                           F.col('externalids.PubMed').alias('pmid'), \
                           F.col('externalids.MAG').alias('magId'), \
                           'externalids', \
                           extractFOS_UDF(F.col('s2fieldsofstudy')).alias("fieldsOfStudy"), \
                           'publicationtypes', \
                           'publicationdate', \
                           F.col('journal.name').alias('journalName'), \
                           F.col('journal.pages').alias('journalPages'), \
                           F.col('journal.volume').alias('journalVolume'), \
                           'venue', \
                           'publicationvenueid', \
                           'isopenaccess', \
                           F.col('referencecount').cast(IntegerType()), \
                           F.col('citationcount').cast(IntegerType()), \
                           F.col('influentialcitationcount').cast(IntegerType()) \
                          )
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True)

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- S2Url: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- doi: string (nullable = true)
 |-- pmid: string (nullable = true)
 |-- magId: string (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- fieldsOfStudy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationtypes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationdate: string (nullable = true)
 |-- journalName: string (nullable = true)
 |-- journalPages: string (nullable = true)
 |-- journalVolume: string (nullable = 

[Stage 6:>                                                          (0 + 1) / 1]

-RECORD 0--------------------------------------------------------------------------------------------------
 id                       | 22597273                                                                       
 title                    | CLOSED circuit respiration/ventilation system. Phase l.                        
 S2Url                    | https://www.semanticscholar.org/paper/5204f12833ca6844a7ca0e531a3a4c8620053466 
 year                     | 1960                                                                           
 doi                      | null                                                                           
 pmid                     | 13858424                                                                       
 magId                    | null                                                                           
 externalids              | {null, null, 22597273, null, null, null, 13858424, null}                       
 fieldsOfStudy            | 

                                                                                

In [7]:
%%time

dir_abstracts = dir_data.joinpath('abstracts')
df_abstracts = spark.read.json('file:///' + dir_abstracts.as_posix())

#We drop corrupt records
df_abstracts = df_abstracts.cache()
df_abstracts = df_abstracts.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of abstracts available:', df_abstracts.count())
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)



Number of abstracts available: 102585428
root
 |-- abstract: string (nullable = true)
 |-- corpusid: long (nullable = true)
 |-- openaccessinfo: struct (nullable = true)
 |    |-- externalids: struct (nullable = true)
 |    |    |-- ACL: string (nullable = true)
 |    |    |-- ArXiv: string (nullable = true)
 |    |    |-- DOI: string (nullable = true)
 |    |    |-- MAG: string (nullable = true)
 |    |    |-- PubMedCentral: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- url: string (nullable = true)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 abstract       | la Iglesia ha venerado siempre las Sagradas Escrituras al igual que el mismo Cuerpo del Señor, no dejando de tomar de... 
 corpusid       | 193425558                                                                                                              

                                                                                

In [8]:
%%time

# Adapt columns names and formats for backwards compatibility
df_abstracts = df_abstracts.select(F.col('corpusid').alias('id'), \
                           F.col('abstract').alias('paperAbstract'), \
                           'openaccessinfo' \
                          )
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)

root
 |-- id: long (nullable = true)
 |-- paperAbstract: string (nullable = true)
 |-- openaccessinfo: struct (nullable = true)
 |    |-- externalids: struct (nullable = true)
 |    |    |-- ACL: string (nullable = true)
 |    |    |-- ArXiv: string (nullable = true)
 |    |    |-- DOI: string (nullable = true)
 |    |    |-- MAG: string (nullable = true)
 |    |    |-- PubMedCentral: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- url: string (nullable = true)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 id             | 193425558                                                                                                                
 paperAbstract  | la Iglesia ha venerado siempre las Sagradas Escrituras al igual que el mismo Cuerpo del Señor, no dejando de tomar de... 
 openaccessinfo | {{null, null, null, 2

In [9]:
%%time

dataset = (dataset.join(df_abstracts, dataset.id ==  df_abstracts.id, "left")
                      .drop(df_abstracts.id)
                ).cache()

print('Number of documents in dataset:', dataset.count())
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True)

23/12/09 16:31:41 ERROR TaskSchedulerImpl: Lost executor 5 on node18.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:31:41 WARN TaskSetManager: Lost task 54.0 in stage 15.0 (TID 277) (node18.cluster.tsc.uc3m.es executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:31:41 WARN TaskSetManager: Lost task 41.0 in stage 15.0 (TID 264) (node18.cluster.tsc.uc3m.es executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:31:41 WARN TaskSetManager: Lost task 49.0 in stage 15.0 (TID 272) (node18.cluster.tsc.

Number of documents in dataset: 215205862
root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- S2Url: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- doi: string (nullable = true)
 |-- pmid: string (nullable = true)
 |-- magId: string (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- fieldsOfStudy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationtypes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationdate: string (nullable = true)
 |-- journalName: string (nullable = true)
 |-- journalPages: string (nullable = tru

[Stage 22:>                                                         (0 + 1) / 1]

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------
 id                       | 26                                                                                                                       
 title                    | FPGA-based design and implementation of an approximate polynomial matrix EVD algorithm                                   
 S2Url                    | https://www.semanticscholar.org/paper/7011b84b03f1d992962c4a6c87459f7742bc3165                                           
 year                     | 2012                                                                                                                     
 doi                      | 10.1109/FPT.2012.6412125                                                                                                 
 pmid                     | null                                                                    

                                                                                

In [10]:
%%time

df_abstracts = spark.read.json('file:///' + dir_abstracts.as_posix())


dataset.write.parquet(
    'file:///' + dir_parquet.joinpath(f"papers.parquet").as_posix(),
    mode="overwrite",
)

23/12/09 16:48:43 ERROR TaskSchedulerImpl: Lost executor 8 on node76.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:48:43 WARN TaskSetManager: Lost task 65.0 in stage 26.0 (TID 577) (node76.cluster.tsc.uc3m.es executor 8): ExecutorLostFailure (executor 8 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:48:43 WARN TaskSetManager: Lost task 82.0 in stage 26.0 (TID 585) (node76.cluster.tsc.uc3m.es executor 8): ExecutorLostFailure (executor 8 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:48:43 WARN TaskSetManager: Lost task 74.0 in stage 26.0 (TID 582) (node76.cluster.tsc.

CPU times: user 274 ms, sys: 55 ms, total: 329 ms
Wall time: 14min 18s


In [11]:
print("Papers with PMID:", dataset.where(F.col("pmid").isNotNull()).count())
print("Papers with DOI:", dataset.where(F.col("doi").isNotNull()).count())
print("Unique DOIs:", dataset.where(F.col("doi").isNotNull()).select('doi').distinct().count())

23/12/09 16:55:40 ERROR TaskSchedulerImpl: Lost executor 19 on node55.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:55:40 WARN TaskSetManager: Lost task 28.0 in stage 29.0 (TID 774) (node55.cluster.tsc.uc3m.es executor 19): ExecutorLostFailure (executor 19 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:55:40 WARN TaskSetManager: Lost task 44.0 in stage 29.0 (TID 796) (node55.cluster.tsc.uc3m.es executor 19): ExecutorLostFailure (executor 19 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:55:40 WARN TaskSetManager: Lost task 43.0 in stage 29.0 (TID 790) (node55.cluster

Papers with PMID: 36513413


23/12/09 16:58:37 ERROR TaskSchedulerImpl: Lost executor 24 on node35.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:58:37 WARN TaskSetManager: Lost task 50.0 in stage 36.0 (TID 1010) (node35.cluster.tsc.uc3m.es executor 24): ExecutorLostFailure (executor 24 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:58:37 WARN TaskSetManager: Lost task 59.0 in stage 36.0 (TID 1022) (node35.cluster.tsc.uc3m.es executor 24): ExecutorLostFailure (executor 24 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 16:58:37 WARN TaskSetManager: Lost task 62.0 in stage 36.0 (TID 1030) (node35.clus

Papers with DOI: 120511834


23/12/09 17:02:08 ERROR TaskSchedulerImpl: Lost executor 34 on node67.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 17:02:08 WARN TaskSetManager: Lost task 24.0 in stage 43.0 (TID 1246) (node67.cluster.tsc.uc3m.es executor 34): ExecutorLostFailure (executor 34 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 17:02:08 WARN TaskSetManager: Lost task 21.0 in stage 43.0 (TID 1243) (node67.cluster.tsc.uc3m.es executor 34): ExecutorLostFailure (executor 34 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 17:02:08 WARN TaskSetManager: Lost task 23.0 in stage 43.0 (TID 1245) (node67.clus

Unique DOIs: 118947695


                                                                                

### 4.2. Table **`authors`**

In [12]:
%%time

dir_authors = dir_data.joinpath('authors')
df_authors = spark.read.json('file:///' + dir_authors.as_posix())

#We drop corrupt records
df_authors = df_authors.cache()
df_authors = df_authors.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of authors available:', df_authors.count())
df_authors.printSchema()
df_authors.show(n=2, truncate=120, vertical=True)



Number of authors available: 89626559
root
 |-- affiliations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- aliases: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- authorid: string (nullable = true)
 |-- citationcount: long (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- DBLP: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- ORCID: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- hindex: long (nullable = true)
 |-- homepage: string (nullable = true)
 |-- name: string (nullable = true)
 |-- papercount: long (nullable = true)
 |-- url: string (nullable = true)

-RECORD 0----------------------------------------------------------
 affiliations  | null                                              
 aliases       | null                                              
 authorid      | 1396826267                                        
 citati

                                                                                

<span style="background-color:yellow;">Right now, it seems affiliations are missing, and ORCID also missing for most of the authors (<0.35%)? We will continue checking to see if this becomes any better.</span>

In [13]:
# Search for rows where the name column contains "John"
search_term = "J. Arenas-García"
result = df_authors.filter(F.col("name").like(f"%{search_term}%"))

result.show(truncate=120, vertical=True)



-RECORD 0--------------------------------------------------------------------------------------------
 affiliations  | null                                                                                
 aliases       | [J. Arenas-garcia, Jeronimo Arenas-garcia, Jerónimo Arenas-garcía]                  
 authorid      | 1385756446                                                                          
 citationcount | 3019                                                                                
 externalids   | {[Jerónimo Arenas-García], null}                                                    
 hindex        | 29                                                                                  
 homepage      | null                                                                                
 name          | J. Arenas-García                                                                    
 papercount    | 107                                                              

                                                                                

In [14]:
print("Number of authors with affiliations:", df_authors.where(F.col("affiliations").isNotNull()).count())
print("Number of authors with ORCID:", df_authors.where(F.col("externalids.ORCID").isNotNull()).count())

                                                                                

Number of authors with affiliations: 254168




Number of authors with ORCID: 31283


                                                                                

In [15]:
# Adapt columns names and formats for backwards compatibility
df_authors = df_authors.select(F.col('authorid').alias('id'), \
                               "name", \
                               "aliases", \
                               F.col('papercount').cast(IntegerType()), \
                               F.col('citationcount').cast(IntegerType()), \
                               F.col('hindex').cast(IntegerType()) \
                              )
df_authors.printSchema()
df_authors.show(n=2, truncate=120, vertical=True)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- aliases: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- papercount: integer (nullable = true)
 |-- citationcount: integer (nullable = true)
 |-- hindex: integer (nullable = true)

-RECORD 0-----------------------------
 id            | 1396826267           
 name          | ElÃ­as Miguel MuÃ±oz 
 aliases       | null                 
 papercount    | 1                    
 citationcount | 1                    
 hindex        | 1                    
-RECORD 1-----------------------------
 id            | 117877359            
 name          | Pablo García Dussán  
 aliases       | null                 
 papercount    | 6                    
 citationcount | 2                    
 hindex        | 1                    
only showing top 2 rows



In [16]:
%%time

df_authors.write.parquet(
    'file:///' + dir_parquet.joinpath(f"authors.parquet").as_posix(),
    mode="overwrite",
)



CPU times: user 10.5 ms, sys: 4.25 ms, total: 14.7 ms
Wall time: 24 s


                                                                                

### 4.3. Table **`paper_author`**

In [17]:
df_paper_author = df_papers.select(F.col('corpusid').alias('paper_id'), F.explode('authors'))
df_paper_author = df_paper_author.select('paper_id', F.col('col.authorId').alias('author_id'))

#We make sure that authors are in the author table
df_author_aux = df_authors.select("id")
df_paper_author = (
    df_paper_author.join(df_author_aux, \
                      df_paper_author.author_id ==  df_author_aux.id, "left")
                    .drop(df_author_aux.id)
).cache()

print("Number of paper_author entries:", df_paper_author.count())
df_paper_author.printSchema()
print(df_paper_author.show(n=2, vertical=True))

23/12/09 19:08:04 ERROR TaskSchedulerImpl: Lost executor 39 on node10.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 19:08:04 WARN TaskSetManager: Lost task 27.0 in stage 71.0 (TID 1698) (node10.cluster.tsc.uc3m.es executor 39): ExecutorLostFailure (executor 39 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 19:08:04 WARN TaskSetManager: Lost task 21.0 in stage 71.0 (TID 1692) (node10.cluster.tsc.uc3m.es executor 39): ExecutorLostFailure (executor 39 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 19:08:04 WARN TaskSetManager: Lost task 0.0 in stage 71.0 (TID 1671) (node10.clust

Number of paper_author entries: 605528978
root
 |-- paper_id: long (nullable = true)
 |-- author_id: string (nullable = true)

-RECORD 0-------------
 paper_id  | 7487231  
 author_id | 10000172 
-RECORD 1-------------
 paper_id  | 21187561 
 author_id | 10000172 
only showing top 2 rows

None


                                                                                

In [18]:
# Save dataframe as parquet
df_paper_author.write.parquet(
    'file:///' + dir_parquet.joinpath("paper_author.parquet").as_posix(),
    mode="overwrite",
)


                                                                                

### 4.4. Table **`citations`**

In [19]:
%%time 
dir_citations = dir_data.joinpath('citations')
df_citations = spark.read.json('file:///' + dir_citations.as_posix())

#We drop corrupt records
df_citations = df_citations.cache()
df_citations = df_citations.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

#Select and rename some columns
#We skip the context (text surronding the citation) and intention (result, background, methods),
#since these data are most likely not going to be used in IntelComp

df_citations = df_citations.select(F.col('citingcorpusid').alias('source'), \
                                   F.col('citedcorpusid').alias('dest'), \
                                   'isinfluential')

print('Number of citations:', df_citations.count())
df_citations.printSchema()
df_citations.show(n=2, truncate=120, vertical=True)

23/12/09 20:32:38 ERROR TaskSchedulerImpl: Lost executor 30 on node85.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 20:32:38 WARN TaskSetManager: Lost task 5.0 in stage 84.0 (TID 2143) (node85.cluster.tsc.uc3m.es executor 30): ExecutorLostFailure (executor 30 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 20:32:38 WARN TaskSetManager: Lost task 25.0 in stage 84.0 (TID 2163) (node85.cluster.tsc.uc3m.es executor 30): ExecutorLostFailure (executor 30 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/12/09 20:32:38 WARN TaskSetManager: Lost task 15.0 in stage 84.0 (TID 2153) (node85.clust

Number of citations: 2577329376
root
 |-- source: long (nullable = true)
 |-- dest: long (nullable = true)
 |-- isinfluential: boolean (nullable = true)



[Stage 87:>                                                         (0 + 1) / 1]

-RECORD 0------------------
 source        | 257718632 
 dest          | 221178188 
 isinfluential | false     
-RECORD 1------------------
 source        | 11943884  
 dest          | 97427634  
 isinfluential | false     
only showing top 2 rows

CPU times: user 577 ms, sys: 36.6 ms, total: 614 ms
Wall time: 1h 1min 56s


                                                                                

In [20]:
%%time

# Save dataframe as parquet
df_citations.write.parquet(
    'file:///' + dir_parquet.joinpath("citations.parquet").as_posix(),
    mode="overwrite",
)



CPU times: user 173 ms, sys: 29.9 ms, total: 203 ms
Wall time: 18min 34s


----------------------------------------                                        
Exception occurred during processing of request from ('127.0.0.1', 34630)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/opt/spark-3.4.0-bin-3.3.1/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/opt/spark-3.4.0-bin-3.3.1/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/opt/spark-3.4.0-bin-3.3.1/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_in