In [1]:
# Display the Spark version of the running context
# (`sc` is presumably the SparkContext injected by the notebook kernel — TODO confirm).
sc.version

'3.4.0'

## 1. Imports

In [2]:
#from configparser import ConfigParser
from pathlib import Path
import pyspark.sql.functions as F
#import requests
from pyspark.sql.types import ArrayType, StringType, BooleanType, IntegerType


## 2. Define directories

In [3]:
# Input / output locations
#
# dir_data:    full path to the directory where the raw data .gz files are stored
# dir_parquet: full path to the directory where the parquet tables will be stored
#
# Both paths are set by hand for the release being processed. They used to be
# read, together with `version` and `dir_pdfs`, from the [spark] section of
# ../config.cf through ConfigParser; that code path is currently disabled.

release_root = Path('/export/data_ml4ds/AI4U/Datasets/semanticscholar/20240730')
dir_data = release_root / 'rawdata'
dir_parquet = release_root / 'parquet'

## 3. Configuration hdfs

**Files will be read from and saved to NFS. Skip this section entirely!**

It is not possible to call `listdir()` directly on a `Path`, because the path points to an HDFS location.

In [None]:
"""# Configuration hdfs
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
hdfs_dir_parquet = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
# Create output directories if they do not exist
# !hadoop dfs ...
# !hadoop dfs -put 20220201 /export/ml4ds/IntelComp/Datalake/SemanticScholar/

if not fs.exists(hdfs_dir_parquet):
    fs.mkdirs(hdfs_dir_parquet)
"""

In [None]:
"""
# Configuration hdfs
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
hdfs_dir_data = spark._jvm.org.apache.hadoop.fs.Path(dir_data.as_posix())

print(hdfs_dir_data)

# Get selected version
releases = sorted(
    [
        f.getPath().getName()
        for f in fs.listStatus(hdfs_dir_data)
        if f.isDirectory() and f.getPath().getName().isdigit()
    ]
)
version = version.replace("-", "")
if version == "last":
    version = releases[-1]
if version not in releases:
    print(f"Version {version} not found")
    print(f"Available versions: {releases}")

hdfs_dir_data_files = spark._jvm.org.apache.hadoop.fs.Path(
    dir_data.joinpath(version).as_posix()
)
hdfs_dir_parquet = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
hdfs_dir_version = spark._jvm.org.apache.hadoop.fs.Path(
    dir_parquet.joinpath(version).as_posix()
)

# Create output directories if they do not exist
# !hadoop dfs ...
# !hadoop dfs -put 20220201 /export/ml4ds/IntelComp/Datalake/SemanticScholar/

if not fs.exists(hdfs_dir_parquet):
    fs.mkdirs(hdfs_dir_parquet)

if not fs.exists(hdfs_dir_version):
    fs.mkdirs(hdfs_dir_version)
"""

## 4. Import tables

### 4.1. Table **`papers`**

In [4]:
%%time
dir_papers = dir_data.joinpath('papers')
df_papers = spark.read.json('file:///' + dir_papers.as_posix())

#We drop corrupt records
df_papers = df_papers.cache()
df_papers = df_papers.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of papers available:', df_papers.count())
df_papers.printSchema()
df_papers.show(n=2, truncate=120, vertical=True)

                                                                                

Number of papers available: 220006291
root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authorId: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- citationcount: long (nullable = true)
 |-- corpusid: long (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- influentialcitationcount: long (nullable = true)
 |-- isopenaccess: boolean (nullable = true)
 |-- journal: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- pages: string (nullable = true)
 |    |-- volume: string (nullable = true)
 |-- publicationdate: string (nullable = true)

<span style="background-color:yellow;">The following code counts how many papers of each type there are in the dataset. It is probably very inefficient, but it works nevertheless.</span>

In [5]:
from collections import Counter

# Count how many papers carry each publication type.
# The arrays are exploded and aggregated inside Spark, so only one row per type
# reaches the driver instead of every `publicationtypes` array in the table.
# `explode` emits no rows for null arrays, which matches skipping null values.
type_rows = (
    df_papers
    .select(F.explode('publicationtypes').alias('publicationtype'))
    .groupBy('publicationtype')
    .count()
    .collect()
)

# Keep the result as a Counter so it prints sorted by decreasing frequency
counts = Counter({row['publicationtype']: row['count'] for row in type_rows})
print(counts)

                                                                                

Counter({'JournalArticle': 48373755, 'Review': 15641356, 'Conference': 4396499, 'Study': 2339597, 'CaseReport': 2285570, 'LettersAndComments': 1564083, 'Editorial': 714480, 'ClinicalTrial': 581207, 'Book': 385829, 'News': 241762, 'MetaAnalysis': 105257, 'Dataset': 1075})


In [6]:
%%time

# This function will be used for extracting only the Semantic Scholar FOS in string format
# Semantic Scholar uses several models, but we keep only FOS from s2-fos-model
def extractFOS(x):
    """Return the categories assigned by the ``s2-fos-model`` source.

    Parameters
    ----------
    x : iterable or None
        Entries that can be indexed with the keys ``'category'`` and ``'source'``.

    Returns
    -------
    list or None
        The ``'category'`` of every entry whose ``'source'`` equals
        ``"s2-fos-model"``, in input order. ``None`` when ``x`` cannot be
        iterated (e.g. it is ``None``) or an entry cannot be indexed with
        those keys.
    """
    try:
        return [el['category'] for el in x if el['source'] == "s2-fos-model"]
    except Exception:
        # Best effort: missing or malformed input becomes a null value.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently turned into None.
        return None
    
# Expose the extractor to Spark as a UDF returning an array of strings
extractFOS_UDF = F.udf(extractFOS, ArrayType(StringType()))

# Columns of the output table: names and types are adapted so that the
# layout stays backwards compatible
paper_columns = [
    F.col('corpusid').alias('id'),
    'title',
    F.col('url').alias('S2Url'),
    F.col('year').cast(IntegerType()),
    F.col('externalids.DOI').alias('doi'),
    F.col('externalids.PubMed').alias('pmid'),
    F.col('externalids.MAG').alias('magId'),
    'externalids',
    extractFOS_UDF(F.col('s2fieldsofstudy')).alias("fieldsOfStudy"),
    'publicationtypes',
    'publicationdate',
    F.col('journal.name').alias('journalName'),
    F.col('journal.pages').alias('journalPages'),
    F.col('journal.volume').alias('journalVolume'),
    'venue',
    'publicationvenueid',
    'isopenaccess',
    F.col('referencecount').cast(IntegerType()),
    F.col('citationcount').cast(IntegerType()),
    F.col('influentialcitationcount').cast(IntegerType()),
]

dataset = df_papers.select(*paper_columns)
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True)

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- S2Url: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- doi: string (nullable = true)
 |-- pmid: string (nullable = true)
 |-- magId: string (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- fieldsOfStudy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationtypes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- publicationdate: string (nullable = true)
 |-- journalName: string (nullable = true)
 |-- journalPages: string (nullable = true)
 |-- journalVolume: string (nullable = 

[Stage 7:>                                                          (0 + 1) / 1]

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------
 id                       | 80809568                                                                                                     
 title                    | The essentials of identity – differentiating normal from pathological                                        
 S2Url                    | https://www.semanticscholar.org/paper/6d35ab3955b1beb231a276d3fec0914fe903d71a                               
 year                     | 2012                                                                                                         
 doi                      | 10.1016/J.NEURENF.2012.05.027                                                                                
 pmid                     | null                                                                                                         
 magId                    | 233250

                                                                                

In [7]:
%%time

dir_abstracts = dir_data.joinpath('abstracts')
df_abstracts = spark.read.json('file:///' + dir_abstracts.as_posix())

#We drop corrupt records
df_abstracts = df_abstracts.cache()
df_abstracts = df_abstracts.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of abstracts available:', df_abstracts.count())
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)

                                                                                

Number of abstracts available: 105538378
root
 |-- abstract: string (nullable = true)
 |-- corpusid: long (nullable = true)
 |-- openaccessinfo: struct (nullable = true)
 |    |-- externalids: struct (nullable = true)
 |    |    |-- ACL: string (nullable = true)
 |    |    |-- ArXiv: string (nullable = true)
 |    |    |-- DOI: string (nullable = true)
 |    |    |-- MAG: string (nullable = true)
 |    |    |-- PubMedCentral: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- url: string (nullable = true)



[Stage 13:>                                                         (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 abstract       | The invention discloses a preparing method of blonanserin intermediate. The method is processed according to the foll... 
 corpusid       | 102720041                                                                                                                
 openaccessinfo | {{null, null, null, 2747728283, null}, null, null, null}                                                                 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------
 abstract       | The gypsum salt bed is located deep in the Carboniferous system of Tarim Basin, and its temperature is 110-130 ℃. Dur... 
 corpusid       | 130918190                                                                                                                
 openaccessinfo | {{

                                                                                

In [8]:
%%time

# Adapt columns names and formats for backwards compatibility
df_abstracts = df_abstracts.select(F.col('corpusid').alias('id'), \
                           F.col('abstract').alias('paperAbstract'), \
                           'openaccessinfo' \
                          )
df_abstracts.printSchema()
df_abstracts.show(n=2, truncate=120, vertical=True)

root
 |-- id: long (nullable = true)
 |-- paperAbstract: string (nullable = true)
 |-- openaccessinfo: struct (nullable = true)
 |    |-- externalids: struct (nullable = true)
 |    |    |-- ACL: string (nullable = true)
 |    |    |-- ArXiv: string (nullable = true)
 |    |    |-- DOI: string (nullable = true)
 |    |    |-- MAG: string (nullable = true)
 |    |    |-- PubMedCentral: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- url: string (nullable = true)



[Stage 14:>                                                         (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 id             | 102720041                                                                                                                
 paperAbstract  | The invention discloses a preparing method of blonanserin intermediate. The method is processed according to the foll... 
 openaccessinfo | {{null, null, null, 2747728283, null}, null, null, null}                                                                 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------
 id             | 130918190                                                                                                                
 paperAbstract  | The gypsum salt bed is located deep in the Carboniferous system of Tarim Basin, and its temperature is 110-130 ℃. Dur... 
 openaccessinfo | {{

                                                                                

In [21]:
%%time

dataset = (dataset.join(df_abstracts, dataset.id ==  df_abstracts.id, "left")
                      .drop(df_abstracts.id)
                ).cache()

print('Number of documents in dataset:', dataset.count())
dataset.printSchema()
dataset.show(n=2, truncate=120, vertical=True)

AttributeError: 'DataFrame' object has no attribute 'id'

24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 42 with no recent heartbeats: 309268 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 41 with no recent heartbeats: 313560 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 38 with no recent heartbeats: 312240 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 32 with no recent heartbeats: 312937 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 40 with no recent heartbeats: 312137 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 34 with no recent heartbeats: 313811 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 43 with no recent heartbeats: 318435 ms exceeds timeout 300000 ms
24/10/13 05:00:56 WARN HeartbeatReceiver: Removing executor 37 with no recent heartbeats: 315586 ms exceeds timeout 30

In [10]:
%%time

df_abstracts = spark.read.json('file:///' + dir_abstracts.as_posix())


dataset.write.parquet(
    'file:///' + dir_parquet.joinpath(f"papers.parquet").as_posix(),
    mode="overwrite",
)

24/10/12 01:23:07 WARN TaskSetManager: Lost task 101.0 in stage 17.0 (TID 680) (node05.cluster.tsc.uc3m.es executor 2): TaskKilled (Stage cancelled)
24/10/12 01:23:07 WARN TaskSetManager: Lost task 83.2 in stage 17.0 (TID 678) (node05.cluster.tsc.uc3m.es executor 2): TaskKilled (Stage cancelled)
24/10/12 01:23:07 WARN TaskSetManager: Lost task 89.1 in stage 17.0 (TID 688) (node05.cluster.tsc.uc3m.es executor 2): TaskKilled (Stage cancelled)
24/10/12 01:23:07 WARN TaskSetManager: Lost task 104.0 in stage 17.0 (TID 683) (node05.cluster.tsc.uc3m.es executor 2): TaskKilled (Stage cancelled)
24/10/12 01:27:04 ERROR TaskSchedulerImpl: Lost executor 2 on node05.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:27:04 WARN TaskSetManager: Lost task 29.0 in stage 22.0 (TID 822) (node05.cluster.tsc.uc3m.es executor 2): ExecutorLostFailure (executor 2 exited caused by one of the r

CPU times: user 307 ms, sys: 59.4 ms, total: 367 ms
Wall time: 17min 13s


In [11]:
# Compute the three statistics in one aggregation (a single pass over `dataset`)
# instead of launching three separate counting jobs.
# F.count(column) counts only non-null values and F.countDistinct ignores nulls,
# so the figures are the same as filtering with isNotNull() first.
id_stats = dataset.agg(
    F.count('pmid').alias('n_pmid'),
    F.count('doi').alias('n_doi'),
    F.countDistinct('doi').alias('n_unique_doi'),
).first()

print("Papers with PMID:", id_stats['n_pmid'])
print("Papers with DOI:", id_stats['n_doi'])
print("Unique DOIs:", id_stats['n_unique_doi'])

24/10/12 01:40:35 ERROR TaskSchedulerImpl: Lost executor 11 on node92.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:40:35 WARN TaskSetManager: Lost task 51.0 in stage 25.0 (TID 1205) (node92.cluster.tsc.uc3m.es executor 11): ExecutorLostFailure (executor 11 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:40:35 WARN TaskSetManager: Lost task 40.0 in stage 25.0 (TID 1196) (node92.cluster.tsc.uc3m.es executor 11): ExecutorLostFailure (executor 11 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:40:35 WARN TaskSetManager: Lost task 47.0 in stage 25.0 (TID 1201) (node92.clus

Papers with PMID: 37497271


24/10/12 01:43:57 ERROR TaskSchedulerImpl: Lost executor 23 on node36.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:43:57 WARN TaskSetManager: Lost task 66.0 in stage 32.0 (TID 1357) (node36.cluster.tsc.uc3m.es executor 23): ExecutorLostFailure (executor 23 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:43:57 WARN TaskSetManager: Lost task 3.0 in stage 32.0 (TID 1327) (node36.cluster.tsc.uc3m.es executor 23): ExecutorLostFailure (executor 23 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:43:57 WARN TaskSetManager: Lost task 39.0 in stage 32.0 (TID 1347) (node36.clust

Papers with DOI: 125038998


24/10/12 01:49:57 ERROR TaskSchedulerImpl: Lost executor 33 on node92.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:49:57 WARN TaskSetManager: Lost task 32.0 in stage 39.0 (TID 1582) (node92.cluster.tsc.uc3m.es executor 33): ExecutorLostFailure (executor 33 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:49:57 WARN TaskSetManager: Lost task 19.0 in stage 39.0 (TID 1576) (node92.cluster.tsc.uc3m.es executor 33): ExecutorLostFailure (executor 33 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 01:49:57 WARN TaskSetManager: Lost task 37.0 in stage 39.0 (TID 1585) (node92.clus

Unique DOIs: 123595615


                                                                                

### 4.2. Table **`authors`**

In [12]:
%%time

dir_authors = dir_data.joinpath('authors')
df_authors = spark.read.json('file:///' + dir_authors.as_posix())

#We drop corrupt records
df_authors = df_authors.cache()
df_authors = df_authors.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

print('Number of authors available:', df_authors.count())
df_authors.printSchema()
df_authors.show(n=2, truncate=120, vertical=True)

[Stage 52:>                                                         (0 + 1) / 1]

Number of authors available: 98089208
root
 |-- affiliations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- aliases: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- authorid: string (nullable = true)
 |-- citationcount: long (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- DBLP: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- ORCID: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- hindex: long (nullable = true)
 |-- homepage: string (nullable = true)
 |-- name: string (nullable = true)
 |-- papercount: long (nullable = true)
 |-- url: string (nullable = true)

-RECORD 0----------------------------------------------------------
 affiliations  | null                                              
 aliases       | [S A Shohaib]                                     
 authorid      | 2244420168                                        
 citati

                                                                                

<span style="background-color:yellow;">Right now, affiliations and ORCID identifiers seem to be missing for most of the authors (fewer than 0.35% of the authors have them). We will keep checking to see whether this improves.</span>

In [13]:
# Search for the authors whose name contains the search term.
# `contains` performs a literal, case-sensitive substring match, so any `%` or
# `_` in the term is not interpreted as a wildcard (as it would be with `like`).
search_term = "J. Arenas-García"
result = df_authors.filter(F.col("name").contains(search_term))

result.show(truncate=120, vertical=True)



-RECORD 0--------------------------------------------------------------------------------------------
 affiliations  | null                                                                                
 aliases       | [J. Arenas-garcia, Jeronimo Arenas-garcia, Jerónimo Arenas-garcía]                  
 authorid      | 1385756446                                                                          
 citationcount | 3028                                                                                
 externalids   | {[Jerónimo Arenas-García], null}                                                    
 hindex        | 29                                                                                  
 homepage      | null                                                                                
 name          | J. Arenas-García                                                                    
 papercount    | 104                                                              

                                                                                

In [14]:
# Coverage of the optional author metadata: rows with a non-null value
n_with_affiliations = df_authors.where(F.col("affiliations").isNotNull()).count()
print("Number of authors with affiliations:", n_with_affiliations)

n_with_orcid = df_authors.where(F.col("externalids.ORCID").isNotNull()).count()
print("Number of authors with ORCID:", n_with_orcid)

                                                                                

Number of authors with affiliations: 247494




Number of authors with ORCID: 35387


                                                                                

In [15]:
# Keep the backwards-compatible subset of author columns: the key is renamed
# to `id` and the counters are stored as 32-bit integers
author_columns = [
    F.col('authorid').alias('id'),
    "name",
    "aliases",
    F.col('papercount').cast(IntegerType()),
    F.col('citationcount').cast(IntegerType()),
    F.col('hindex').cast(IntegerType()),
]
df_authors = df_authors.select(*author_columns)

df_authors.printSchema()
df_authors.show(n=2, truncate=120, vertical=True)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- aliases: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- papercount: integer (nullable = true)
 |-- citationcount: integer (nullable = true)
 |-- hindex: integer (nullable = true)

-RECORD 0------------------------------------
 id            | 2244420168                  
 name          | S. A. Shohaib               
 aliases       | [S A Shohaib]               
 papercount    | 1                           
 citationcount | 0                           
 hindex        | 0                           
-RECORD 1------------------------------------
 id            | 2093767694                  
 name          | Wolfgang Dipl.-Ing. Schärfl 
 aliases       | null                        
 papercount    | 4                           
 citationcount | 2                           
 hindex        | 1                           
only showing top 2 rows



In [16]:
%%time

df_authors.write.parquet(
    'file:///' + dir_parquet.joinpath(f"authors.parquet").as_posix(),
    mode="overwrite",
)



CPU times: user 20.7 ms, sys: 529 μs, total: 21.3 ms
Wall time: 27 s


                                                                                

### 4.3. Table **`paper_author`**

In [17]:
# One (paper_id, author_id) row per entry of the `authors` array of every paper
df_paper_author = df_papers.select(F.col('corpusid').alias('paper_id'), F.explode('authors'))
df_paper_author = df_paper_author.select('paper_id', F.col('col.authorId').alias('author_id'))

# We make sure that authors are in the author table.
# A left-semi join keeps only the rows whose author_id has a match in
# `df_authors` and returns just the left-hand columns (in their original order).
# A plain left join followed by dropping the right-hand key would keep every
# row, matched or not, and therefore would not filter anything.
df_author_aux = df_authors.select("id")
df_paper_author = df_paper_author.join(
    df_author_aux,
    df_paper_author.author_id == df_author_aux.id,
    "left_semi",
).cache()

print("Number of paper_author entries:", df_paper_author.count())
df_paper_author.printSchema()
# show() prints the rows itself and returns None, so it must not be wrapped in print()
df_paper_author.show(n=2, vertical=True)

[Stage 72:>                                                         (0 + 1) / 1]

Number of paper_author entries: 625447465
root
 |-- paper_id: long (nullable = true)
 |-- author_id: string (nullable = true)

-RECORD 0-------------
 paper_id  | 7487231  
 author_id | 10000172 
-RECORD 1-------------
 paper_id  | 21187561 
 author_id | 10000172 
only showing top 2 rows

None


                                                                                

In [18]:
# Store the paper-author links as parquet, replacing any previous copy
paper_author_target = 'file:///' + (dir_parquet / "paper_author.parquet").as_posix()
df_paper_author.write.mode("overwrite").parquet(paper_author_target)


24/10/12 02:04:14 ERROR TaskSchedulerImpl: Lost executor 30 on node05.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:04:14 WARN TaskSetManager: Lost task 137.0 in stage 78.0 (TID 2414) (node05.cluster.tsc.uc3m.es executor 30): ExecutorLostFailure (executor 30 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:04:14 WARN TaskSetManager: Lost task 143.0 in stage 78.0 (TID 2417) (node05.cluster.tsc.uc3m.es executor 30): ExecutorLostFailure (executor 30 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:04:14 WARN TaskSetManager: Lost task 155.0 in stage 78.0 (TID 2420) (node05.c

### 4.4. Table **`citations`**

In [19]:
%%time 
dir_citations = dir_data.joinpath('citations')
df_citations = spark.read.json('file:///' + dir_citations.as_posix())

#We drop corrupt records
df_citations = df_citations.cache()
df_citations = df_citations.where(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

#Select and rename some columns
#We skip the context (text surronding the citation) and intention (result, background, methods),
#since these data are most likely not going to be used in IntelComp

df_citations = df_citations.select(F.col('citingcorpusid').alias('source'), \
                                   F.col('citedcorpusid').alias('dest'), \
                                   'isinfluential')

print('Number of citations:', df_citations.count())
df_citations.printSchema()
df_citations.show(n=2, truncate=120, vertical=True)

24/10/12 02:10:14 ERROR TaskSchedulerImpl: Lost executor 39 on node18.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:10:14 WARN TaskSetManager: Lost task 53.0 in stage 80.0 (TID 2715) (node18.cluster.tsc.uc3m.es executor 39): ExecutorLostFailure (executor 39 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:10:14 WARN TaskSetManager: Lost task 55.0 in stage 80.0 (TID 2717) (node18.cluster.tsc.uc3m.es executor 39): ExecutorLostFailure (executor 39 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
24/10/12 02:10:14 WARN TaskSetManager: Lost task 58.0 in stage 80.0 (TID 2720) (node18.clus

Number of citations: 2663282456
root
 |-- source: long (nullable = true)
 |-- dest: long (nullable = true)
 |-- isinfluential: boolean (nullable = true)

-RECORD 0-----------------
 source        | 30717035 
 dest          | 19447476 
 isinfluential | false    
-RECORD 1-----------------
 source        | 41671791 
 dest          | 30145272 
 isinfluential | false    
only showing top 2 rows

CPU times: user 592 ms, sys: 48.4 ms, total: 640 ms
Wall time: 36min 17s


In [20]:
%%time

# Save dataframe as parquet
df_citations.write.parquet(
    'file:///' + dir_parquet.joinpath("citations.parquet").as_posix(),
    mode="overwrite",
)

24/10/12 02:47:14 WARN TaskSetManager: Lost task 192.0 in stage 85.0 (TID 3301) (node50.cluster.tsc.uc3m.es executor 43): java.io.FileNotFoundException: /export/workdir/spark/tmp/blockmgr-c1ea327c-21df-458b-81ef-40c2c345e281/0b/rdd_227_192 (No space left on device)
	at java.base/java.io.FileOutputStream.open0(Native Method)
	at java.base/java.io.FileOutputStream.open(FileOutputStream.java:298)
	at java.base/java.io.FileOutputStream.<init>(FileOutputStream.java:237)
	at java.base/java.io.FileOutputStream.<init>(FileOutputStream.java:187)
	at org.apache.spark.storage.DiskStore.openForWrite(DiskStore.scala:156)
	at org.apache.spark.storage.DiskStore.put(DiskStore.scala:84)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1542)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1462)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1526)
	at org.apache.spark.storage.BlockMa

CPU times: user 198 ms, sys: 20.9 ms, total: 219 ms
Wall time: 8min 6s
