In [1]:
# from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("test").getOrCreate()
# sc = spark.sparkContext


In [2]:
# `sc` and `spark` are expected to be provided by the kernel: the explicit
# session creation in the previous cell is commented out.
# The value is not displayed (it is not the last expression of the cell), but
# evaluating it raises NameError early when no SparkContext is defined.
sc.version
# Set the INT96 rebase mode used when writing parquet files to CORRECTED
# (see the Spark SQL configuration reference for the available modes).
spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")

# Imports

In [3]:
import io
import json
import os
import re
import subprocess
import zipfile
from configparser import ConfigParser
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
# import py7zr
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType


# Auxiliary Functions

These functions are used to load, read, identify or save files.

## File processing

In [4]:
def get_only_letter(s):
    """Return `s` reduced to its ASCII letters (a-z, A-Z), in their original order."""
    letters = re.findall(r"[a-zA-Z]", s)
    return "".join(letters)


def get_file_extension(f: Path):
    """
    Identify the type of a file from its content using the `file` utility.

    Runs `file -b <path>` and returns the first word of the description it
    prints (the callers in this notebook compare it against "Zip" and
    "7-zip").

    The path is passed as a single argument and no shell is involved, so
    paths containing spaces or shell metacharacters are handled correctly.

    Parameters
    ----------
    f : Path
        File to inspect.

    Returns
    -------
    str
        First word printed by `file -b`, or an empty string when nothing
        was printed.
    """
    result = subprocess.run(
        ["file", "-b", f.as_posix()],
        stdout=subprocess.PIPE,
        text=True,
        check=False,
    )
    words = result.stdout.split()
    return words[0] if words else ""


def read_excel(f: Path):
    """
    Load an Excel file into a pandas DataFrame.

    Returns a `(name, frame)` tuple where `name` is the file name without
    its suffix and `frame` has whitespace-stripped column names.  Any error
    is reported on stdout instead of being raised; whatever could not be
    obtained is returned as `None`.
    """
    name, frame = None, None
    try:
        name = f.stem
        frame = pd.read_excel(f, index_col=False)
        frame.columns = frame.columns.str.strip()

    except Exception as err:
        print("-- ERROR: file could not be processed")
        print("\t", err)
    return name, frame


def extract_file_info(f: Path):
    """
    Read a file and yield its content as `(name, pd.DataFrame)` pairs.

    This is a generator because a compressed file can contain several
    spreadsheets, so callers must iterate over the result:

    - Anything that is not reported as "Zip" or "7-zip" by
      `get_file_extension` is read as a single Excel file; one pair is
      yielded (either element may be `None` if reading failed).
    - For archives, every member is read as an `.xlsx` workbook.  If all
      member names reduce to the same letters (e.g. `table.xlsx`,
      `table_2.xlsx`) they are treated as parts of one table and a single
      concatenated frame is yielded; otherwise one pair per member is
      yielded.
    - An archive without members yields nothing.

    7-zip support needs the optional `py7zr` package, which is imported
    only when such an archive is found.
    """
    # Get type of file
    extension = get_file_extension(f)
    print(f"-- {extension}")

    if extension not in ["Zip", "7-zip"]:
        # Excel files (hopefully)
        yield read_excel(f)
        return

    # Compressed files: load every member into memory, then release the archive
    if extension == "Zip":
        with zipfile.ZipFile(f.as_posix(), "r") as archive:
            content = {
                name: archive.read(name)
                for name in archive.namelist()
                if not name.endswith("/")  # directory entries hold no data
            }
    else:
        # Imported lazily: the module-level import is commented out
        import py7zr

        with py7zr.SevenZipFile(f, "r") as archive:
            content = archive.readall()

    print("-- Files:")
    if not content:
        print("-- -- (archive is empty)")
        return

    keys = [get_only_letter(k) for k in content]
    key_0 = keys[0]

    # Check if all elements are of the same table:
    all_equal = all(key_0 == x for x in keys)

    # Return info
    df_sub = []
    for k, v in content.items():
        print(f"-- -- {k}")
        # pandas expects a path or a file-like object; raw bytes are deprecated
        buffer = io.BytesIO(v) if isinstance(v, (bytes, bytearray)) else v
        df = pd.read_excel(buffer, engine="openpyxl")
        if all_equal:
            df_sub.append(df)
        else:
            fname = f.stem + "-" + re.sub("xlsx", "", get_only_letter(k))
            yield fname, df

    # Merge all into one dataframe if necessary
    if all_equal:
        yield key_0, pd.concat(df_sub, ignore_index=True)


## Spark conversion

In [5]:
def convert_spark(df: pd.DataFrame):
    """
    Clean a pandas DataFrame and convert it into a Spark DataFrame.

    Cleaning steps, in order: strip column names, keep only rows whose
    "language" column equals "en" (when such a column exists), drop columns
    named "Unnamed:*", replace NaN with None, and parse every column with
    "date" in its name as datetime.

    Side effect: the column names of the frame passed in are stripped
    in place; every later step works on a new frame.

    Returns the Spark DataFrame, or None when the conversion fails with an
    exception other than TypeError/ValueError.  A failure of the fallback
    conversion (see below) is not caught and propagates to the caller.

    Relies on the global `spark` session.
    """
    sparkDF = None
    
    # Clean DF
    # Strip column names (assigning to `df.columns` modifies the caller's frame)
    df.columns = df.columns.str.strip()

    # Filter languages: keep rows where the first column named "language"
    # (case-insensitive) is exactly "en"
    if "language" in df.columns.str.lower():
        df = df[(df.loc[:, df.columns.str.lower() == "language"] == "en").iloc[:, 0]]

    # Remove columns whose name starts with "Unnamed:" (case-sensitive), the
    # label pandas gives to columns without a header
    df = df.drop([c for c in df.columns if c.startswith("Unnamed:")], axis=1)

    # Replace NaN
    df = df.replace({np.nan: None})
    # Columns that are entirely null are filled with "" here and turned back
    # into nulls after the conversion; presumably so that Spark can infer a
    # type for them -- TODO confirm
    df.loc[:, df.isnull().all()] = ""

    # Format dates: every column with "date" in its name is parsed;
    # values that cannot be parsed become NaT (errors="coerce")
    date_cols = [c for c in df.columns if "date" in c.lower()]
    df[date_cols] = df[date_cols].apply(pd.to_datetime, errors="coerce")

    # Convert
    try:
        sparkDF = spark.createDataFrame(df)
        # Every empty string becomes null (not only the placeholders added above)
        sparkDF = sparkDF.replace("", None)
        print("-- Conversion OK")

    except (TypeError, ValueError):
        # Fallback: round-trip through CSV so that pandas re-infers the dtype
        # of every column, then retry.  Note that the date parsing done above
        # is not repeated on the re-read frame.
        print("-- -- Transform")
        df = pd.read_csv(io.StringIO(df.to_csv(index=False)))
        # Replace NaN
        df = df.replace({np.nan: None})
        df.loc[:, df.isnull().all()] = ""

        # An exception raised here is NOT handled by the `except Exception`
        # clause below; it propagates to the caller
        sparkDF = spark.createDataFrame(df)
        sparkDF = sparkDF.replace("", None)
        print("-- Conversion OK")

    except Exception as e:
        # Best effort: report the problem and return None
        print("-- Exception")
        print("\t", e.__class__.__name__, "\n\t", e)

    return sparkDF


def save_parquet(sparkDF, fname):
    """
    Write `sparkDF` as `<fname>.parquet` inside the global `dir_parquet`
    directory, overwriting any previous output with the same name.
    """
    target = dir_parquet.joinpath(f"{fname}.parquet")
    print(f"-- Saving in {target}")
    sparkDF.write.parquet(target.as_posix(), mode="overwrite")
    print("SAVE SUCCESS")


def load_parquet(fname):
    """
    Read `<fname>.parquet` from the global `dir_parquet` directory with the
    global `spark` session and return it as a Spark DataFrame.
    """
    source = dir_parquet.joinpath(f"{fname}.parquet")
    print(f"-- Loading {source}")
    return spark.read.parquet(source.as_posix())


# Define directories

In the script version, the directories are read from a configuration file.

Here, you need to set the following locations directly:

  - dir_raw     : the directory containing the raw data
  - dir_parquet : the directory in HDFS that will contain the processed files

If the join with Semantic Scholar and/or PATSTAT is enabled, you also need to specify the location of those datasets in HDFS.

In [13]:
# Define directories
# In the script version these values are read with ConfigParser instead:
# cf = ConfigParser()
# cf.read("config.cf")

# Data sources: raw CORDIS download. The cells below expect the sub-folders
# HORIZON, H2020, FP7 and cordis-ref inside it.
# Previous snapshot:
# dir_raw = Path("/export/data_ml4ds/IntelComp/Datasets/cordis/20230425 ")
dir_raw = Path("/export/data_ml4ds/IntelComp/Datasets/cordis/20230823/rawdata")

# Auxiliary datasets
# ss_join: when True, publications are linked to Semantic Scholar
# (`papers.parquet` is read from dir_ss)
ss_join = True
# dir_ss = Path(cf.get("aux", "dir_ss"))
dir_ss = Path("/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet")
# pt_join: when True, patents are linked to PATSTAT
# (`patstat_appln.parquet` is read from dir_patstat)
pt_join = True
# dir_patstat = Path(cf.get("aux", "dir_patstat"))
dir_patstat = Path("/export/ml4ds/IntelComp/Datalake/patstat/2023_Spring/parquet")

# Target directory: every table is saved here as <name>.parquet
# dir_parquet = Path(cf.get("cordis", "dir_parquet"))
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet")

In [8]:
# Sub-folders of the raw data directory: one per framework programme,
# plus the `cordis-ref` folder.
dir_raw_horizon = dir_raw / "HORIZON"
dir_raw_h2020 = dir_raw / "H2020"
dir_raw_fp7 = dir_raw / "FP7"
dir_raw_ref = dir_raw / "cordis-ref"

In [9]:
# Configuration hdfs
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
hdfs_dir_parquet = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
# hdfs_dir_raw = spark._jvm.org.apache.hadoop.fs.Path(dir_raw.as_posix())

# Create output directories if they do not exist
if not fs.exists(hdfs_dir_parquet):
    fs.mkdirs(hdfs_dir_parquet)


# Process Files

Save all tables to parquet

## Read SemanticScholar

Load SemanticScholar information. It will be joined with Publications.

In [11]:
# Semantic Scholar papers table. It is only loaded when the join is enabled,
# so `ss` stays undefined otherwise (process_publications reads it as a global).
# A small sample can be loaded instead for quick tests:
# ss = spark.read.parquet("data/sample.parquet")
if ss_join:
    ss = spark.read.parquet(dir_ss.joinpath("papers.parquet").as_posix())


                                                                                

## Read PATSTAT

Load PATSTAT information. It will be joined with Patents.

In [14]:
# PATSTAT applications table. It is only loaded when the join is enabled,
# so `pt` stays undefined otherwise (process_patents reads it as a global).
if pt_join:
    pt = spark.read.parquet(dir_patstat.joinpath("patstat_appln.parquet").as_posix())


                                                                                

## File list

In [15]:
# Files in LOCAL:
# `Path.iterdir()` yields entries in arbitrary order, so each listing is
# sorted to make the processing (and merge) order reproducible between runs.

# Read fp7 files
fp7_file_list = sorted(dir_raw_fp7.iterdir())

# Read h2020 files
h2020_file_list = sorted(dir_raw_h2020.iterdir())

# Read horizon files
horizon_file_list = sorted(dir_raw_horizon.iterdir())

# Keep only the spreadsheets and the archives that contain spreadsheets
file_list = [
    el
    for el in (fp7_file_list + h2020_file_list + horizon_file_list)
    if el.name.endswith(("xlsx.zip", "xlsx.7z", ".xls", ".xlsx"))
]

# Group the files by the kind of table named in the file name
publications = [x for x in file_list if "publications" in x.name.lower()]
reports = [x for x in file_list if "reports" in x.name.lower()]
projects = [x for x in file_list if "projects" in x.name.lower()]
irps = [x for x in file_list if "irps" in x.name.lower()]


## Processing Functions

### Publications

In [16]:
def cleanDOI(doi):
    """
    Normalise a DOI so that the same DOI written in different ways compares equal.

    The value is converted to a lower-case string, all whitespace is removed
    and a leading resolver prefix (`http(s)://doi.org/`,
    `http(s)://dx.doi.org/` or `doi:`) is dropped.

    Parameters
    ----------
    doi : any
        Raw DOI value; non-string values are converted with `str()`.

    Returns
    -------
    str or None
        The normalised DOI, or None when the input is missing (None / NaN)
        or nothing is left after cleaning.
    """
    # Missing values must stay missing instead of becoming "none" / "nan"
    if doi is None or (isinstance(doi, float) and doi != doi):
        return None

    # Remove whitespace before looking for the prefix, so that a prefix
    # preceded by blanks is still recognised
    doi = "".join(str(doi).lower().split())
    doi = re.sub(r"^(?:https?://(?:dx\.)?doi\.org/|doi:)", "", doi)
    return doi or None


# Spark UDF wrapper around `cleanDOI` (declared to return a string), so that
# the same normalisation can be applied to Spark columns.
cleanDOI_udf = F.udf(cleanDOI, StringType())


# Lower-cased column names of the HORIZON / H2020 files -> unified names
_CORDIS_PUBLICATION_COLUMNS = {
    "ispublishedas": "isPublishedAs",
    "journaltitle": "journalTitle",
    "journalnumber": "journalNumber",
    "publishedyear": "publishedYear",
    "publishedpages": "publishedPages",
    "projectid": "projectID",
    "projectacronym": "projectAcronym",
    "contentupdatedate": "contentUpdateDate",
}

# Lower-cased column names of the FP7 files -> unified names
_FP7_PUBLICATION_COLUMNS = {
    "project_id": "projectID",
    "author": "authors",
    "publication_type": "isPublishedAs",
    "repository_url": "repositoryUrl",
    "journal_title": "journalTitle",
    "volume": "journalNumber",
    "pages": "publishedPages",
    "record_id": "id",
}


def _unify_publication_columns(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """Give a publications frame the unified column names, a programme label and clean DOIs."""
    df.columns = df.columns.str.lower()
    source = source_name.lower()

    if "horizon" in source:
        programme, mapping = "HORIZON", _CORDIS_PUBLICATION_COLUMNS
    elif "h2020" in source:
        programme, mapping = "H2020", _CORDIS_PUBLICATION_COLUMNS
    else:
        programme, mapping = "FP7", _FP7_PUBLICATION_COLUMNS
        # FP7 files carry a curated DOI; it replaces the raw one only when it
        # is really there, so the raw DOI is never dropped for nothing
        if "qa_processed_doi" in df.columns:
            df = df.drop(columns="doi", errors="ignore").rename(
                columns={"qa_processed_doi": "doi"}
            )

    df = df.rename(columns=mapping)
    df["frameworkProgramme"] = programme

    # Same DOI normalisation whatever the value of `merge`; missing DOIs
    # stay missing instead of being turned into text
    if "doi" in df.columns:
        df["doi"] = df["doi"].map(lambda d: cleanDOI(d) if pd.notna(d) else None)
    return df


def _join_semantic_scholar(sparkDF):
    """Left-join publications with the global Semantic Scholar table `ss` on the cleaned DOI."""
    ss_ids = ss.select(
        col("id").alias("SSID"),
        F.when(col("doi").isNotNull(), cleanDOI_udf(col("doi")))
        .otherwise(None)
        .alias("doi"),
    )
    cleaned = sparkDF.withColumn(
        "doi",
        F.when(col("doi").isNotNull(), cleanDOI_udf(col("doi"))).otherwise(None),
    )
    # The column list comes from the Spark frame itself: convert_spark may
    # have dropped columns that are still present in the pandas frame.
    # NOTE(review): a DOI that appears several times in `ss` yields one output
    # row per match -- confirm whether de-duplication is wanted.
    return cleaned.join(ss_ids, on="doi", how="left").select(sparkDF.columns + ["SSID"])


def process_publications(publications: List[Path], merge=False, ss_join=False):
    """
    Convert CORDIS publication files to parquet.

    Every file is read with `extract_file_info`, its columns are renamed to
    a common schema, a `frameworkProgramme` column (HORIZON / H2020 / FP7,
    decided from the file name; FP7 is the fallback) is added and the DOIs
    are normalised with `cleanDOI`.

    Parameters
    ----------
    publications : List[Path]
        Files (spreadsheets or archives of spreadsheets) with publications.
    merge : bool
        If True, all tables are concatenated and saved as
        `publications.parquet`; otherwise every table is saved under its
        own name.
    ss_join : bool
        If True, a `SSID` column with the Semantic Scholar id of each
        publication is added through a left join on the DOI.  Requires the
        global `ss` table to be loaded.
    """

    def _convert_and_save(frame, name):
        sparkDF = convert_spark(frame)
        save_parquet(_join_semantic_scholar(sparkDF) if ss_join else sparkDF, name)

    pubs = []

    print("Processing...\n")
    for f in publications:
        for fname, df in extract_file_info(f):
            print(f"-- {fname}")

            if fname is None or df is None:
                continue

            # Unify format
            df = _unify_publication_columns(df, f.name)

            if merge:
                pubs.append(df)
            else:
                _convert_and_save(df, fname)

        print("\n", "-" * 80, "\n")

    if merge:
        if not pubs:
            # pd.concat raises on an empty list
            print("-- No publication files could be read; nothing to save")
            return
        _convert_and_save(pd.concat(pubs, ignore_index=True), "publications")


# Merge all publication files into one table (saved as "publications");
# the Semantic Scholar link is added when `ss_join` is enabled.
process_publications(publications, merge=True, ss_join=ss_join)


Processing...



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Microsoft
-- FP7PC_DM_PROJ_PUBLICATIONS

 -------------------------------------------------------------------------------- 



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
-- Files:
-- -- xlsx/projectPublications.xlsx
-- -- xlsx/projectPublications_2.xlsx
-- -- xlsx/projectPublications_3.xlsx
-- -- xlsx/projectPublications_4.xlsx
-- -- xlsx/projectPublications_5.xlsx
-- -- xlsx/projectPublications_6.xlsx
-- -- xlsx/projectPublications_7.xlsx
-- xlsxprojectPublicationsxlsx

 -------------------------------------------------------------------------------- 

-- Conversion OK
-- Saving in /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/publications.parquet


23/08/26 21:27:30 WARN TaskSetManager: Stage 3 contains a task of very large size (4820 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

SAVE SUCCESS


Some publications statistics:
- Percentage of projects with publications
- Number of publications with DOI
- Number of publications with reference in SemanticScholar

In [17]:
def publications_statistics(ss_join=False):
    """
    Print summary figures of the saved `publications` table.

    Reports the number of rows, how many have a DOI (overall and per
    framework programme) and, when `ss_join` is True, how many have a
    Semantic Scholar id (`SSID`).  All percentages are relative to the
    total number of rows.

    Only the columns that are needed are collected to the driver.
    """
    pubs = load_parquet("publications")
    columns = ["frameworkProgramme", "doi"] + (["SSID"] if ss_join else [])
    df = pubs.select(*columns).toPandas()

    total = len(df)
    print(f"Number of total publications: {total}")
    if total == 0:
        # Nothing to summarise; the percentages below would divide by zero
        return

    # Publications with DOI
    pub_doi = df.dropna(subset=["doi"])
    num_pubs = len(pub_doi)
    print(f"Number of publications with DOI: {num_pubs} ({num_pubs/total*100:.2f}%)")
    for k, v in pub_doi.groupby("frameworkProgramme")["doi"].count().items():
        print(f"\t-- {k}: {v} ({v/total*100:.2f}%)")

    # Publications with ref in SS
    if ss_join:
        has_ssid = df["SSID"].notna()
        pub_withSS = int(has_ssid.sum())
        doi_without_ss = int((df["doi"].notna() & ~has_ssid).sum())
        print(f"Number of publications with SSID: {pub_withSS} ({pub_withSS/total*100:.2f}%)")
        print(f"Number of publications with DOI that don't have SS reference: {doi_without_ss}")


# Reads back the "publications" table saved above.
publications_statistics(ss_join=ss_join)


-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/publications.parquet


                                                                                

Number of total publications: 731356
Number of publications with DOI: 626316 (85.64%)
	-- FP7: 320021 (43.76%)
	-- H2020: 306295 (41.88%)
Number of publications with SSID: 591208 (80.84%)
Number of publications with DOI that don't have SS reference: 35108


In [19]:
# Save XLS file to NFS location
# publication_file = "/export/data_ml4ds/IntelComp/Datasets/cordis/20220823/xlsx/publications.xlsx"
# load_parquet("publications").toPandas().to_excel(publication_file, index=False)

-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20220823/parquet/publications.parquet


                                                                                

### Patents

In [18]:
def _join_patstat(sparkDF):
    """Left-join patents with the global PATSTAT table `pt`, adding its `appln_id` column."""
    patstat_ids = pt.select(col("appln_nr").alias("applicationIdentifier"), "appln_id")
    # The column list comes from the Spark frame itself: convert_spark drops
    # the "Unnamed:" columns, which are still present in the pandas frame.
    return sparkDF.join(patstat_ids, on="applicationIdentifier", how="left").select(
        sparkDF.columns + ["appln_id"]
    )


def process_patents(patents: List[Path], merge=False, pt_join=False):
    """
    Convert CORDIS patent files to parquet.

    Every file is read with `extract_file_info` and labelled with a
    `frameworkProgramme` column (HORIZON / H2020 / FP7) when the programme
    can be recognised in the file name.

    Parameters
    ----------
    patents : List[Path]
        Files (spreadsheets or archives of spreadsheets) with patents.
    merge : bool
        If True, all tables are concatenated and saved as
        `patents.parquet`; otherwise every table is saved under its own name.
    pt_join : bool
        If True, the PATSTAT `appln_id` is added through a left join of
        `applicationIdentifier` with PATSTAT's `appln_nr`.  Requires the
        global `pt` table to be loaded.
    """

    def _convert_and_save(frame, name):
        sparkDF = convert_spark(frame)
        save_parquet(_join_patstat(sparkDF) if pt_join else sparkDF, name)

    pats = []

    print("Processing...\n")
    for f in patents:
        for fname, df in extract_file_info(f):
            print(f"-- {fname}")

            if fname is None or df is None:
                continue

            # Label the rows with their programme, whatever the value of `merge`
            source = f.name.lower()
            for programme in ("HORIZON", "H2020", "FP7"):
                if programme.lower() in source:
                    df["frameworkProgramme"] = programme
                    break

            if merge:
                pats.append(df)
            else:
                _convert_and_save(df, fname)

        print("\n", "-" * 80, "\n")

    if merge:
        if not pats:
            # pd.concat raises on an empty list
            print("-- No patent files could be read; nothing to save")
            return
        _convert_and_save(pd.concat(pats, ignore_index=True), "patents")


# The "irps" files are processed as patents and merged into one table
# (saved as "patents"); the PATSTAT link is added when `pt_join` is enabled.
process_patents(irps, merge=True, pt_join=pt_join)


Processing...

-- Microsoft


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- projectIrps_h2020

 -------------------------------------------------------------------------------- 

-- Microsoft


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- projectIrps_fp7

 -------------------------------------------------------------------------------- 

-- Conversion OK
-- Saving in /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/patents.parquet


                                                                                

SAVE SUCCESS


Some patents statistics:
- Number of patents with reference in PATSTAT

In [19]:
def patents_statistics(pt_join=False):
    """
    Print summary figures of the saved `patents` table.

    Reports the number of rows per framework programme and, when `pt_join`
    is True, how many rows have a PATSTAT `appln_id` (overall and per
    programme).  All percentages are relative to the total number of rows.
    """
    pats = load_parquet("patents")
    columns = ["frameworkProgramme"] + (["appln_id"] if pt_join else [])
    df = pats.select(*columns).toPandas()

    total = len(df)
    print(f"Number of total patents: {total}")
    if total == 0:
        # Nothing to summarise; the percentages below would divide by zero
        return

    for k, v in df.groupby("frameworkProgramme")["frameworkProgramme"].count().items():
        print(f"\t-- {k}: {v} ({v/total*100:.2f}%)")

    # Patents in PATSTAT
    if pt_join:
        pat_PATS = df.dropna(subset=["appln_id"])
        num_pats = len(pat_PATS)
        print(f"Number of patents in PATSTAT: {num_pats} ({num_pats/total*100:.2f}%)")
        for k, v in pat_PATS.groupby("frameworkProgramme")["appln_id"].count().items():
            print(f"\t-- {k}: {v} ({v/total*100:.2f}%)")


# Reads back the "patents" table saved above.
patents_statistics(pt_join=pt_join)


-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/patents.parquet


                                                                                

Number of total patents: 32073
	-- FP7: 23368 (72.86%)
	-- H2020: 8705 (27.14%)
Number of patents in PATSTAT: 32064 (99.97%)
	-- FP7: 23368 (72.86%)
	-- H2020: 8696 (27.11%)


In [23]:
# Count unique patents according to id
# load_parquet("patents").select("appln_id").distinct().count()

# Save XLS file to NFS location
# patent_file = "/export/data_ml4ds/IntelComp/Datasets/cordis/20220908/xlsx/patents.xlsx"
# load_parquet("patents").toPandas().to_excel(patent_file, index=False)

### Summaries

In [20]:
def process_reports(reports: List[Path], merge=False):
    """
    Convert CORDIS report-summary files to parquet.

    Every file is read with `extract_file_info` and labelled with a
    `frameworkProgramme` column (HORIZON / H2020 / FP7) when the programme
    can be recognised in the file name.

    Parameters
    ----------
    reports : List[Path]
        Files (spreadsheets or archives of spreadsheets) with report summaries.
    merge : bool
        If True, all tables are concatenated and saved as
        `reports.parquet`; otherwise every table is saved under its own name.
    """

    reps = []

    print("Processing...\n")
    for f in reports:
        for fname, df in extract_file_info(f):
            print(f"-- {fname}")

            if fname is None or df is None:
                continue

            # Label the rows with their programme, whatever the value of `merge`
            source = f.name.lower()
            for programme in ("HORIZON", "H2020", "FP7"):
                if programme.lower() in source:
                    df["frameworkProgramme"] = programme
                    break

            if merge:
                reps.append(df)
            else:
                # Transform to spark
                save_parquet(convert_spark(df), fname)

        print("\n", "-" * 80, "\n")

    if merge:
        if not reps:
            # pd.concat raises on an empty list
            print("-- No report files could be read; nothing to save")
            return
        # Transform to spark
        save_parquet(convert_spark(pd.concat(reps, ignore_index=True)), "reports")


# Merge all report-summary files into one table (saved as "reports").
process_reports(reports, merge=True)


Processing...



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
-- Files:
-- -- xlsx/reportSummaries.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- xlsxreportSummariesxlsx

 -------------------------------------------------------------------------------- 



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
-- Files:
-- -- xlsx/reportSummaries.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- xlsxreportSummariesxlsx

 -------------------------------------------------------------------------------- 

-- Zip
-- Files:
-- -- xlsx/reportSummaries.xlsx
-- xlsxreportSummariesxlsx

 -------------------------------------------------------------------------------- 



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
  warn("Workbook contains no default style, apply openpyxl's default")

-- Conversion OK
-- Saving in /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/reports.parquet


                                                                                

SAVE SUCCESS


In [21]:
# Load the saved table once and reuse it for both figures
reports_sdf = load_parquet("reports")

total = reports_sdf.count()
print(f"Number of total reports: {total}")

# Rows per framework programme; percentages are relative to `total`.
# With an empty table there are no groups, so nothing is divided by zero.
programme_counts = (
    reports_sdf.select("frameworkProgramme")
    .toPandas()
    .groupby("frameworkProgramme")["frameworkProgramme"]
    .count()
)
for k, v in programme_counts.items():
    print(f"\t-- {k}: {v} ({v/total*100:.2f}%)")


-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/reports.parquet


                                                                                

Number of total reports: 51353
-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/reports.parquet


                                                                                

	-- FP7: 21606 (42.07%)
	-- H2020: 29613 (57.67%)
	-- HORIZON: 134 (0.26%)


### Process EuroSciVoc Codes

This part has been processed manually (at least partially) to get all the possible paths and codes, as no official reference has been found that includes all of them.

**NOTE:**
Some additional files not included in GitHub are required

**NOTE2:**
You do not need to run this cell if you already have the EuroSciVoc files available.

In [30]:
# Manually prepared EuroSciVoc structure (one of the additional files that
# are not included in the repository). JSON text is read explicitly as UTF-8
# so that the result does not depend on the locale of the machine.
with open("SciVoc-data/transform.json", "r", encoding="utf-8") as f:
    taxonomies = json.load(f)

# Get all available titles from SciVoc
def process_keys(obj):
    """
    Process json object to obtain all titles

    Every node is a dict with a "title" and, optionally, a "subtitles" list of
    child nodes. One entry is produced per node: its title prefixed with the
    titles of all its ancestors, joined by "/". The entries of a node's
    descendants come before the entry of the node itself.

    Parameters
    ----------
    obj : dict or list
        A single node, or a (possibly nested) list of nodes.

    Returns
    -------
    list of str
        Flat list with the path of every node found in `obj`.
    """

    def get_keys(node):
        keys = []
        if isinstance(node, list):
            # extend (not append): a list of nodes must contribute its paths
            # as strings, not as a nested sub-list
            for el in node:
                keys.extend(get_keys(el))
        elif isinstance(node, dict):
            # A missing or empty "subtitles" entry means the node is a leaf
            for el in node.get("subtitles") or []:
                keys.extend(f"{node['title']}/{s.strip()}" for s in get_keys(el))
            keys.append(node["title"])
        return keys

    # Convert obj to list
    if not isinstance(obj, list):
        obj = [obj]

    structure = []
    for el in obj:
        structure.extend(get_keys(el))
    return structure


# Every taxonomy path, rooted with a leading "/"
all_keys = [f"/{k}" for k in process_keys(taxonomies)]
# Save, one path per line
with open("SciVoc-data/ScienceVocabulary.txt", "w") as f:
    # writelines instead of a list comprehension used only for its side effect
    f.writelines(f"{el}\n" for el in all_keys)

# Load all EuroSciVoc
# Reads the "xlsx/euroSciVoc.xlsx" member of every archive in `projects`
# and stacks the resulting tables into a single DataFrame `df`.
dfs = []
for f in projects:
    print(f)
    # Get type of file
    extension = get_file_extension(f)
    print(f"-- {extension}")

    if extension == "Zip":
        # The context manager closes the archive handle once the member is read.
        # The bytes are wrapped in BytesIO because passing raw bytes to
        # read_excel is deprecated.
        with zipfile.ZipFile(f.as_posix(), "r") as archive:
            dfs.append(
                pd.read_excel(
                    io.BytesIO(archive.read("xlsx/euroSciVoc.xlsx")),
                    engine="openpyxl",
                )
            )
    elif extension == "7-zip":
        # NOTE: this branch needs `import py7zr`, which is commented out in the
        # imports cell; re-enable it before processing 7-zip archives.
        with py7zr.SevenZipFile(f, "r") as archive:
            dfs.append(
                pd.read_excel(
                    archive.read("xlsx/euroSciVoc.xlsx")["xlsx/euroSciVoc.xlsx"],
                    engine="openpyxl",
                )
            )
# Concatenate the tables of all archives
df = pd.concat(dfs)


# Turn the "/"-separated code and path strings into lists of their non-empty parts
def _split_on_slash(value):
    return [part.strip() for part in value.split("/") if len(part) > 0]


for column in ("euroSciVocCode", "euroSciVocPath"):
    df[column] = df[column].apply(_split_on_slash)
df["euroSciVocTitle"] = df["euroSciVocTitle"].apply(str.strip)
# Number of codes of every row
df["length"] = df["euroSciVocCode"].apply(len)

# Keep one row per title: after sorting by length, the last one is the row with
# the longest code list (as this will be the entire path)
scivoc_columns = ["euroSciVocCode", "euroSciVocPath", "euroSciVocTitle", "length"]
euroSciVoc = (
    df[scivoc_columns]
    .sort_values(by="length")
    .drop_duplicates(subset="euroSciVocTitle", keep="last")
)

# Some elements only show up inside a full path, without a row of their own.
# Where path and code lists have the same length they are paired position by
# position, which gives a (path element, code) pair for each of them.
same_length = df.apply(
    lambda row: len(row["euroSciVocCode"]) == len(row["euroSciVocPath"]), axis=1
)
path_code_pairs = (
    df.loc[same_length]
    .apply(lambda row: list(zip(row["euroSciVocPath"], row["euroSciVocCode"])), axis=1)
    .explode()
    .drop_duplicates()
)
aux = dict(path_code_pairs.tolist())

# Title -> code: the code of a title is the last element of its code list;
# the pairs recovered from the full paths are added on top
title_code_pairs = euroSciVoc[["euroSciVocCode", "euroSciVocTitle"]].apply(
    lambda row: (row["euroSciVocTitle"], row["euroSciVocCode"][-1]), axis=1
)
title2code = dict(title_code_pairs.tolist())
title2code.update(aux)

# Save all SciVoc information
def _scivoc_row(path):
    """Build [full path, title, code, full code] for one taxonomy path."""
    parts = path.split("/")
    leaf = parts[-1]
    # Elements without a known code contribute an empty string to the full code
    code_path = "/".join([title2code.get(part, "") for part in parts])
    return [path, leaf, title2code.get(leaf, None), code_path]


rows = [_scivoc_row(path) for path in all_keys]
scivoccodes = pd.DataFrame(rows, columns=["full_path", "title", "code", "full_code"])
scivoccodes.to_excel("SciVoc-data/SciVocCodes.xlsx", index=None)


data/data_Pr_CORDIS/20220908/FP7/cordis-fp7projects-xlsx.zip
-- Zip


  warn("Workbook contains no default style, apply openpyxl's default")


data/data_Pr_CORDIS/20220908/H2020/cordis-h2020projects-xlsx.zip
-- Zip


  warn("Workbook contains no default style, apply openpyxl's default")


data/data_Pr_CORDIS/20220908/HORIZON/cordis-HORIZONprojects-xlsx.zip
-- Zip


  warn("Workbook contains no default style, apply openpyxl's default")


### Organisations

In [22]:
# Load all organizations
# Reads the "xlsx/organization.xlsx" member of every archive in `projects`
# and stacks the resulting tables into a single DataFrame `df`.
dfs = []
for f in projects:
    print(f)
    # Get type of file
    extension = get_file_extension(f)
    print(f"-- {extension}")

    if extension == "Zip":
        # The context manager closes the archive handle once the member is read.
        # The bytes are wrapped in BytesIO because passing raw bytes to
        # read_excel is deprecated.
        with zipfile.ZipFile(f.as_posix(), "r") as archive:
            dfs.append(
                pd.read_excel(
                    io.BytesIO(archive.read("xlsx/organization.xlsx")),
                    engine="openpyxl",
                )
            )
    elif extension == "7-zip":
        # NOTE: this branch needs `import py7zr`, which is commented out in the
        # imports cell; re-enable it before processing 7-zip archives.
        with py7zr.SevenZipFile(f, "r") as archive:
            dfs.append(
                pd.read_excel(
                    archive.read("xlsx/organization.xlsx")["xlsx/organization.xlsx"],
                    engine="openpyxl",
                )
            )
df = pd.concat(dfs)

/export/data_ml4ds/IntelComp/Datasets/cordis/20230823/rawdata/FP7/cordis-fp7projects-xlsx.zip


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
/export/data_ml4ds/IntelComp/Datasets/cordis/20230823/rawdata/H2020/cordis-h2020projects-xlsx.zip


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
/export/data_ml4ds/IntelComp/Datasets/cordis/20230823/rawdata/HORIZON/cordis-HORIZONprojects-xlsx.zip


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip


  warn("Workbook contains no default style, apply openpyxl's default")


In [23]:
# Descriptive columns kept for every organisation
org_columns = [
    "organisationID",
    "vatNumber",
    "name",
    "shortName",
    "SME",
    "activityType",
    "street",
    "postCode",
    "city",
    "country",
    "nutsCode",
    "geolocation",
    "organizationURL",
    "contentUpdateDate",
]

# One row per organisation: the last one once sorted by contentUpdateDate
latest_orgs = (
    df[org_columns]
    .sort_values(by=["organisationID", "contentUpdateDate"])
    .drop_duplicates(subset="organisationID", keep="last")
)

# List with the IDs of all the projects of every organisation
projects_per_org = df.groupby("organisationID")["projectID"].apply(list)

orgs_info = latest_orgs.join(projects_per_org, on="organisationID")

sparkDF = convert_spark(orgs_info)
save_parquet(sparkDF, "organizations")


-- Conversion OK
-- Saving in /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/organizations.parquet


23/08/26 23:30:32 WARN TaskSetManager: Stage 24 contains a task of very large size (1040 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

SAVE SUCCESS


In [24]:
# Number of distinct organisation IDs in the saved organizations table
organisation_ids = load_parquet("organizations").select("organisationID")
organisation_ids.distinct().count()

# Save XLS file to NFS location
# organizations_file = "/export/data_ml4ds/IntelComp/Datasets/cordis/20220823/xlsx/organizations.xlsx"
# load_parquet("organizations").toPandas().to_excel(organizations_file, index=False)

-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/organizations.parquet


                                                                                

67348

### Projects

In [32]:
def process_projects(projects: List[Path]):
    """
    Process all files given a list of Paths
    Joins original project with topics, organizations, SciVoc, publications and patents

    For every element of `projects` the tables yielded by `extract_file_info`
    are recognised by the last "-"-separated part of their name ("project",
    "topics", "organization", "euroSciVoc"; "legalBasis", "webItem" and
    "webLink" are dropped). The per-project additions are merged into the
    project table, all project tables are concatenated and converted to Spark,
    the publication and patent identifiers of each project are attached, and
    the result is saved as the "projects" parquet table.

    Parameters
    ----------
    projects : List[Path]
        Files to process, one per framework programme.

    Returns
    -------
    The Spark DataFrame with the enriched projects (the one that was saved).
    """

    def first_or_none(values):
        """Return the first element of `values`, or None when it is empty."""
        return values[0] if len(values) > 0 else None

    projs = []

    # Load SciVoc dictionary (normalised title -> integer code)
    _, scivoccodes = next(extract_file_info(Path("SciVoc-data/SciVocCodes.xlsx")))
    title2code = scivoccodes[["title", "code"]].dropna()
    title2code["code"] = title2code["code"].astype(int)
    # Titles are compared in lower case and with letters only
    title2code["title"] = title2code["title"].apply(lambda x: get_only_letter(x.lower()))
    title2code = dict(title2code.values)

    print("Processing...\n")
    for f in projects:

        df_proj = None
        tops = None
        orgs = None
        svcs = None

        for fname, df in extract_file_info(f):
            print(f"-- {fname}")

            if fname is None or df is None:
                continue

            sfname = fname.split("-")[-1]

            # Drop:
            if sfname in ["legalBasis", "webItem", "webLink"]:
                continue

            # Save
            elif sfname == "topics":
                # One row per project: the topic and topic title of its first row
                tops = (
                    df[["topic", "projectID", "title"]]
                    .groupby("projectID")[["topic", "title"]]
                    .apply(
                        lambda x: pd.Series(
                            {
                                "topic": x["topic"].values[0],
                                "topic_title": x["title"].values[0],
                            }
                        )
                    )
                )
            elif sfname == "organization":
                # Get information that is going to be added to projects.
                # Work on a copy: assigning through .loc on a column slice of
                # `df` raises SettingWithCopyWarning.
                orgs = df[
                    ["projectID", "organisationID", "country", "role", "ecContribution"]
                ].copy()
                # Contributions that cannot be parsed as numbers count as 0
                orgs.loc[:, "ecContribution"] = orgs.loc[:, "ecContribution"].apply(
                    pd.to_numeric, errors="coerce"
                )
                orgs.loc[pd.isna(orgs["ecContribution"]), "ecContribution"] = 0.0
                orgs = pd.DataFrame(
                    orgs.groupby(["projectID"]).apply(
                        lambda x: pd.Series(
                            {
                                # "country|contribution" pairs, summed per country
                                "countryContr": " ".join(
                                    [
                                        f"{k}|{v}"
                                        for k, v in x.groupby("country")[
                                            "ecContribution"
                                        ]
                                        .apply(sum)
                                        .items()
                                    ]
                                ),
                                # "organisationID|contribution" pairs, one per row
                                "orgContr": " ".join(
                                    [
                                        f"{i}|{c}"
                                        for i, c in zip(
                                            x["organisationID"].values,
                                            x["ecContribution"].values,
                                        )
                                    ]
                                ),
                                # None (instead of an IndexError) when the
                                # project has no row with role "coordinator"
                                "coordinatorCountry": first_or_none(
                                    x.loc[x["role"] == "coordinator", "country"].values
                                ),
                                "coordinatorOrg": first_or_none(
                                    x.loc[
                                        x["role"] == "coordinator", "organisationID"
                                    ].values
                                ),
                            }
                        )
                    )
                )

            elif sfname == "euroSciVoc":
                # List of SciVoc codes per project; titles without a known code are skipped
                svcs = (
                    df[["euroSciVocTitle", "projectID"]]
                    .apply({"euroSciVocTitle":lambda x: get_only_letter(x.lower()),
                            "projectID":lambda x: x})
                    .groupby("projectID")["euroSciVocTitle"]
                    .apply(lambda x: [title2code[i.strip()] for i in x.values if i.strip() in title2code.keys()])
                    .rename("euroSciVocCode")
                )
            elif sfname == "project":
                df_proj = df.copy()
            else:
                continue

        print("\n", "-" * 80, "\n")

        # Save enriched project
        # tops, orgs and svcs are all indexed by projectID
        additions = pd.concat([tops, orgs, svcs], axis=1).reset_index(drop=False)
        enrich_proj = (
            df_proj.drop(["legalBasis", "topics"], axis=1)
            .merge(additions, left_on="id", right_on="projectID", how="left")
            .drop("projectID", axis=1)
            .rename(columns={"id": "projectID"})
        )
        projs.append(enrich_proj)

    # Convert to spark
    projs_df = pd.concat(projs)
    sparkDF = convert_spark(projs_df)

    # Load publications (list of publication ids per project)
    pubs = (
        load_parquet("publications")
        .select("projectID", "id")
        .groupBy("projectID")
        .agg(F.collect_list("id").alias("publicationID"))
    )
    print("LOADED publications")

    # Load patents (list of patent family identifiers per project)
    pats = (
        load_parquet("patents")
        .select("projectID", "patentFamilyIdentifier")#"appln_id")
        .groupBy("projectID")
        .agg(F.collect_list("patentFamilyIdentifier").alias("patentID"))
    )
    print("LOADED patents")

    # Left joins: projects without publications or patents are kept
    sparkProjects = (
        sparkDF
        .join(pubs, on="projectID", how="left")
        .join(pats, on="projectID", how="left")
    )
    save_parquet(sparkProjects, "projects")

    return sparkProjects


sparkProjects = process_projects(projects)


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libio

-- Microsoft
Processing...

-- Zip
-- Files:
-- -- xlsx/euroSciVoc.xlsx
-- cordis-fp7projects-xlsx-euroSciVoc
-- -- xlsx/legalBasis.xlsx
-- cordis-fp7projects-xlsx-legalBasis
-- -- xlsx/organization.xlsx
-- cordis-fp7projects-xlsx-organization
-- -- xlsx/project.xlsx
-- cordis-fp7projects-xlsx-project
-- -- xlsx/topics.xlsx
-- cordis-fp7projects-xlsx-topics
-- -- xlsx/webItem.xlsx
-- cordis-fp7projects-xlsx-webItem
-- -- xlsx/webLink.xlsx
-- cordis-fp7projects-xlsx-webLink

 -------------------------------------------------------------------------------- 



ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


-- Zip
-- Files:
-- -- xlsx/euroSciVoc.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-h2020projects-xlsx-euroSciVoc
-- -- xlsx/legalBasis.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-h2020projects-xlsx-legalBasis
-- -- xlsx/organization.xlsx
-- cordis-h2020projects-xlsx-organization
-- -- xlsx/project.xlsx
-- cordis-h2020projects-xlsx-project
-- -- xlsx/topics.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-h2020projects-xlsx-topics
-- -- xlsx/webItem.xlsx
-- cordis-h2020projects-xlsx-webItem
-- -- xlsx/webLink.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-h2020projects-xlsx-webLink

 -------------------------------------------------------------------------------- 

-- Zip
-- Files:
-- -- xlsx/project.xlsx


ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/compiler/lib/intel64/libiomp5.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/intel/composerxe/mkl/lib/intel64/libmkl_rt.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
  warn("Workbook contains no default style, apply openpyxl's default")

-- cordis-HORIZONprojects-xlsx-project
-- -- xlsx/organization.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-HORIZONprojects-xlsx-organization
-- -- xlsx/legalBasis.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-HORIZONprojects-xlsx-legalBasis
-- -- xlsx/topics.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-HORIZONprojects-xlsx-topics
-- -- xlsx/euroSciVoc.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-HORIZONprojects-xlsx-euroSciVoc
-- -- xlsx/webLink.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


-- cordis-HORIZONprojects-xlsx-webLink
-- -- xlsx/webItem.xlsx
-- cordis-HORIZONprojects-xlsx-webItem

 -------------------------------------------------------------------------------- 

-- Conversion OK
-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/publications.parquet
LOADED publications
-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/patents.parquet
LOADED patents
-- Saving in /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/projects.parquet


23/08/26 23:51:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/08/26 23:51:56 WARN TaskSetManager: Stage 34 contains a task of very large size (2415 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

SAVE SUCCESS


Some project statistics:
- Percentage of projects with publications
- Percentage of projects with patents

In [33]:
def projects_statistics():
    """
    Print how many projects there are, and how many of them have publications
    and patents, in total and per framework programme.

    Every percentage, including the per-programme ones, is relative to the
    total number of projects.
    """
    stats = (
        load_parquet("projects")
        .select(["frameworkProgramme", "publicationID", "patentID"])
        .toPandas()
    )
    total = len(stats)

    def show_breakdown(counts):
        # One line per framework programme
        for programme, amount in counts.items():
            print(f"\t-- {programme}: {amount} ({amount/total*100:.2f}%)")

    print(f"Number of total projects: {total}")
    show_breakdown(stats.groupby("frameworkProgramme")["frameworkProgramme"].count())

    # Projects with publications, then projects with patents
    for column, label in (("publicationID", "publications"), ("patentID", "patents")):
        with_items = stats[["frameworkProgramme", column]].dropna(subset=[column])
        amount = with_items[column].count()
        print(f"Number of projects with {label}: {amount} ({amount/total*100:.2f}%)")
        show_breakdown(with_items.groupby("frameworkProgramme")[column].count())


projects_statistics()


-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/projects.parquet


                                                                                

Number of total projects: 69613
	-- FP7: 25785 (37.04%)
	-- H2020: 35386 (50.83%)
	-- HORIZON: 8442 (12.13%)
Number of projects with publications: 37279 (53.55%)
	-- FP7: 15554 (22.34%)
	-- H2020: 21725 (31.21%)
Number of projects with patents: 3034 (4.36%)
	-- FP7: 2155 (3.10%)
	-- H2020: 879 (1.26%)


In [35]:
# Save XLS file to NFS location
# projects_file = "/export/data_ml4ds/IntelComp/Datasets/cordis/20220823/xlsx/projects.xlsx"
# load_parquet("projects").toPandas().to_excel(projects_file, index=False)

-- Loading /export/ml4ds/IntelComp/Datalake/CORDIS/20220823/parquet/projects.parquet


                                                                                

In [34]:
%%time
# Show row count, schema and first rows of every parquet table in dir_parquet
hdfs = True
if hdfs:
    # Files in HDFS
    hdfs_dir_raw = spark._jvm.org.apache.hadoop.fs.Path(dir_parquet.as_posix())
    hdfs_file_list = [el.getPath().toUri().toString() for el in fs.listStatus(hdfs_dir_raw)]
    # (label to print, location to read) for every parquet entry
    parquet_tables = [(uri, uri) for uri in hdfs_file_list if uri.endswith(".parquet")]
else:
    # Files in the local file system
    parquet_tables = [
        (path, path.as_posix())
        for path in dir_parquet.iterdir()
        if path.name.endswith(".parquet")
    ]

for label, location in parquet_tables:
    print(f"Processing {label}")
    sparkDF = spark.read.parquet(location)
    print(sparkDF.count())
    sparkDF.printSchema()
    sparkDF.show(5)
    print("\n", "-" * 80, "\n")


Processing hdfs://DTSCHDFSCluster/export/ml4ds/IntelComp/Datalake/CORDIS/20230823/parquet/organizations.parquet


                                                                                

67348
root
 |-- organisationID: double (nullable = true)
 |-- vatNumber: string (nullable = true)
 |-- name: string (nullable = true)
 |-- shortName: string (nullable = true)
 |-- SME: boolean (nullable = true)
 |-- activityType: string (nullable = true)
 |-- street: string (nullable = true)
 |-- postCode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- nutsCode: string (nullable = true)
 |-- geolocation: string (nullable = true)
 |-- organizationURL: string (nullable = true)
 |-- contentUpdateDate: timestamp (nullable = true)
 |-- projectID: array (nullable = true)
 |    |-- element: long (containsNull = true)

+--------------+-----------+--------------------+----------------+-----+------------+--------------------+--------+--------------------+-------+--------+--------------------+---------------+-------------------+--------------------+
|organisationID|  vatNumber|                name|       shortName|  SME|activityType|      

                                                                                

32073
root
 |-- projectID: long (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- organisationID: double (nullable = true)
 |-- applicantName: string (nullable = true)
 |-- applicationDate: timestamp (nullable = true)
 |-- applicationPrefix: string (nullable = true)
 |-- applicationIdentifier: string (nullable = true)
 |-- applicationKind: string (nullable = true)
 |-- epoAppUrl: string (nullable = true)
 |-- priorityDate: timestamp (nullable = true)
 |-- patentType: string (nullable = true)
 |-- awardDate: timestamp (nullable = true)
 |-- awardPrefix: string (nullable = true)
 |-- awardIdentifier: string (nullable = true)
 |-- awardKind: string (nullable = true)
 |-- epoPubUrl: string (nullable = true)
 |-- patentFamilyIdentifier: long (nullable = true)
 |-- frameworkProgramme: string (nullable = true)
 |-- appln_id: string (nullable = true)

+---------+------+--------------------+--------------+--------------------+-------------------+---

                                                                                

69613
root
 |-- projectID: long (nullable = true)
 |-- acronym: string (nullable = true)
 |-- status: string (nullable = true)
 |-- title: string (nullable = true)
 |-- startDate: timestamp (nullable = true)
 |-- endDate: timestamp (nullable = true)
 |-- totalCost: string (nullable = true)
 |-- ecMaxContribution: string (nullable = true)
 |-- ecSignatureDate: timestamp (nullable = true)
 |-- frameworkProgramme: string (nullable = true)
 |-- masterCall: string (nullable = true)
 |-- subCall: string (nullable = true)
 |-- fundingScheme: string (nullable = true)
 |-- nature: string (nullable = true)
 |-- objective: string (nullable = true)
 |-- contentUpdateDate: timestamp (nullable = true)
 |-- rcn: long (nullable = true)
 |-- grantDoi: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- topic_title: string (nullable = true)
 |-- countryContr: string (nullable = true)
 |-- orgContr: string (nullable = true)
 |-- coordinatorCountry: string (nullable = true)
 |-- coordinator

                                                                                

731356
root
 |-- projectID: long (nullable = true)
 |-- title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- isPublishedAs: string (nullable = true)
 |-- repositoryUrl: string (nullable = true)
 |-- journalTitle: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- journalNumber: string (nullable = true)
 |-- publishedPages: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- frameworkProgramme: string (nullable = true)
 |-- publishedYear: double (nullable = true)
 |-- issn: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- projectAcronym: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- contentUpdateDate: timestamp (nullable = true)
 |-- rcn: double (nullable = true)
 |-- SSID: long (nullable = true)

+---------+--------------------+-------------------+--------------------+-------------+--------------------+---------+-------------+--------------+----+---------



51353
root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- projectID: long (nullable = true)
 |-- projectAcronym: string (nullable = true)
 |-- attachment: string (nullable = true)
 |-- contentUpdateDate: timestamp (nullable = true)
 |-- rcn: long (nullable = true)
 |-- frameworkProgramme: string (nullable = true)

+------------+--------------------+---------+--------------+--------------------+-------------------+------+------------------+
|          id|               title|projectID|projectAcronym|          attachment|  contentUpdateDate|   rcn|frameworkProgramme|
+------------+--------------------+---------+--------------+--------------------+-------------------+------+------------------+
|   892354_PS|Periodic Reportin...|   892354|EXCHANGE_inLCs|/docs/results/h20...|2023-04-16 11:54:09|928093|             H2020|
|101002685_PS|Periodic Reportin...|101002685|        ARTIST|/docs/results/h20...|2023-04-16 11:52:51|928059|             H2020|
|101006667_PS|P

                                                                                