In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number, mean, min, max, corr, stddev
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format, asc, desc)
from pyspark.sql.functions import explode, col, element_at, size

In [2]:
# Create (or reuse) a local SparkSession whose application name is "test_123".
spark = (
    SparkSession.builder
    .appName('test_123')
    .master('local[*]')
    # Arrow-based conversion setting for PySpark.
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    # Session time zone is pinned to UTC.
    .config('spark.sql.session.timeZone', 'UTC')
    .config('spark.driver.memory','8g')
    .config('spark.ui.showConsoleProgress', True)
    # Eager evaluation in the REPL: a bare DataFrame expression at the end of
    # a cell is rendered as a table (see the recorded cell outputs).
    .config('spark.sql.repl.eagerEval.enabled', True)
    .getOrCreate()
)

## Loading and Cleaning the Data

In [3]:
# Load the raw JSON export.
# The option key used to be "multuread", which is not a Spark JSON option and
# was silently ignored; "multiLine" is the documented key for reading a file
# that holds a single JSON document (which may span several lines).
# NOTE(review): if the file is actually JSON Lines (one record per line),
# drop the option instead of enabling multiLine.
papers_ = spark.read.option("multiLine", True).json("../data/AL_papers.json")

# Explode the hits.hits array so each array element becomes one row,
# exposed as a single column named "paper".
papers = papers_.select(explode(col("hits.hits")).alias("paper"))

In [5]:
#papers.printSchema()

In [6]:
# Columns kept from each exploded "paper" row. For titles and abstracts only
# the first entry is taken (element_at uses 1-based indexing).
paper_columns = [
    col("paper.created"),
    element_at(col("paper.metadata.titles.title"), 1).alias("title"),
    element_at(col("paper.metadata.abstracts.value"), 1).alias("abstract"),
    col("paper.metadata.citation_count").alias("citation_count"),
    col("paper.metadata.number_of_pages"),
    col("paper.metadata.keywords"),
    col("paper.metadata.authors.full_name").alias("authors"),
    # Number of entries in the references array.
    size(col("paper.metadata.references")).alias("num_refs"),
]

short_papers = papers.select(*paper_columns)
short_papers.printSchema()

root
 |-- created: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- citation_count: long (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_refs: integer (nullable = false)



## Keywords Extraction

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

### Assigned

Extracting assigned keywords

In [34]:
@udf(returnType=ArrayType(StringType()))
def list_keywords(a):
    """Return the "value" field of every keyword entry in ``a``.

    ``a`` is the collection of keyword entries of one paper; each entry is
    indexed with the key "value". When ``a`` is None or empty, the
    single-element list ["None"] (the string, not a null) is returned.
    """
    if not a:
        return ["None"]
    return [entry["value"] for entry in a]

# Add a "keywords_" column produced by applying list_keywords to "keywords".
keyword_values = list_keywords(col("keywords"))
aaa = short_papers.withColumn("keywords_", keyword_values)
aaa

created,title,abstract,citation_count,number_of_pages,keywords,authors,num_refs,keywords_
2021-03-26T00:00:...,Charmonia product...,"In this paper, pr...",0,15,"[{PACS, null, 14....","[Luchinsky, A.V.,...",35,"[14.40.Pq, 12.39...."
2020-07-09T00:00:...,Exclusive decays ...,Exclusive decays ...,4,10,"[{null, publisher...","[Luchinsky, A.V.,...",42,[Strong Interacti...
2019-12-11T00:00:...,Doubly heavy bary...,The theoretical a...,1,8,"[{INSPIRE, null, ...","[Berezhnoy, A.V.,...",40,"[talk, baryon: he..."
2019-05-29T00:00:...,Weak decays of do...,We consider exclu...,13,10,"[{null, publisher...","[Gerasimov, A.S.,...",21,[Electroweak inte...
2019-11-21T00:00:...,$B_c$ excitations...,Status of the Bc ...,0,8,"[{INSPIRE, null, ...","[Berezhnoy, A.V.,...",49,"[talk, B/c: excit..."
2018-12-27T00:00:...,Excited $\rho$ me...,"In this paper, ex...",2,7,"[{null, publisher...","[Luchinsky, A.V.]",22,[Phenomenological...
2018-09-27T00:00:...,Doubly heavy bary...,The theoretical a...,30,14,"[{null, publisher...","[Berezhnoy, A.V.,...",56,[Electroweak inte...
2018-01-30T00:00:...,Charmonia Product...,In the presented ...,1,9,"[{PACS, null, 14....","[Luchinsky, A.V.]",30,"[14.40.Pq, 12.39...."
2019-02-25T00:00:...,Lifetimes of Doub...,The inclusive dec...,1,11,"[{INSPIRE, null, ...","[Likhoded, A.K., ...",49,"[baryon: heavy, b..."
2017-12-11T00:00:...,Double Charmonia ...,This paper is dev...,8,14,"[{PACS, null, 13....","[Likhoded, A.K., ...",36,"[13.38.Dg, 14.40...."


Which keywords are the most popular?

In [35]:
# Count the occurrences of each keyword across all rows, most frequent first.
(
    aaa.select(explode("keywords_").alias("K"))
    .groupBy("K")
    .count()
    .orderBy(desc("count"))
)

K,count
numerical calcula...,31
CERN LHC Coll,21
quantum chromodyn...,17
LHC-B,14
14.40.Pq,13
charmonium,12
electron positron...,10
12.38.Bx,10
13.66.Bc,10
quarkonium: heavy,10


Search for papers with the **charmonium** keyword

In [30]:
from pyspark.sql.functions import array_contains

# Keep only the rows whose "keywords_" array contains "charmonium".
has_charmonium = array_contains(col("keywords_"), "charmonium")
aaa.where(has_charmonium)

created,title,abstract,citation_count,number_of_pages,keywords,authors,num_refs,keywords_
2017-08-24T00:00:...,Production of hea...,Processes of sing...,3,13,"[{INSPIRE, null, ...","[Likhoded, A.K., ...",57,[quarkonium: heav...
2011-04-05T00:00:...,BC_NPI module for...,The module for th...,7,15,"[{INSPIRE, null, ...","[Berezhnoy, A.V.,...",19,[B/c: hadronic de...
2009-10-18T00:00:...,Light hadron prod...,Decays of ground ...,33,10,"[{PACS, null, 13....","[Likhoded, A.K., ...",26,"[13.35.Dx, 13.20...."
2008-10-20T00:00:...,Leading twist dis...,This paper is dev...,33,8,"[{PACS, null, 13....","[Braguta, V.V., L...",19,"[13.25.Gv, 12.38...."
2007-09-04T00:00:...,Lepton pair produ...,Coherent producti...,0,15,"[{PACS, null, 13....","[Berezhnoy, A.V.,...",10,"[13.60.-r, 13.60...."
2007-03-08T00:00:...,Charmonium produc...,The production of...,8,28,"[{INSPIRE, null, ...","[Likhoded, A.K., ...",30,[p p: inclusive r...
2007-06-15T00:00:...,Systematics of he...,"It is shown that,...",5,10,"[{PACS, null, 14....","[Gershtein, S.S.,...",37,"[14.40.Gx, 12.40...."
2006-02-08T00:00:...,The Processes e+ ...,In this paper we ...,43,8,"[{PACS, null, 13....","[Braguta, V.V., L...",13,"[13.25.Gv, 12.38...."
2006-02-08T00:00:...,Systematics of he...,In this paper we ...,54,9,"[{PACS, null, 14....","[Gershtein, S.S.,...",33,"[14.40.Gx, 12.40...."
2005-07-25T00:00:...,Excited charmoniu...,In this paper the...,73,7,"[{PACS, null, 13....","[Braguta, V.V., L...",16,"[13.25.Gv, 13.66...."


### From titles

In [36]:
from pyspark.sql.functions import split

In [40]:
# Print the schema of short_papers again as a reference for the title-based
# keyword extraction in this section.
short_papers.printSchema()

root
 |-- created: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- citation_count: long (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_refs: integer (nullable = false)



In [42]:
# Display short_papers as the cell output; the table rendering relies on the
# spark.sql.repl.eagerEval.enabled setting in the session config.
short_papers

created,title,abstract,citation_count,number_of_pages,keywords,authors,num_refs
2021-03-26T00:00:...,Charmonia product...,"In this paper, pr...",0,15,"[{PACS, null, 14....","[Luchinsky, A.V.,...",35
2020-07-09T00:00:...,Exclusive decays ...,Exclusive decays ...,4,10,"[{null, publisher...","[Luchinsky, A.V.,...",42
2019-12-11T00:00:...,Doubly heavy bary...,The theoretical a...,1,8,"[{INSPIRE, null, ...","[Berezhnoy, A.V.,...",40
2019-05-29T00:00:...,Weak decays of do...,We consider exclu...,13,10,"[{null, publisher...","[Gerasimov, A.S.,...",21
2019-11-21T00:00:...,$B_c$ excitations...,Status of the Bc ...,0,8,"[{INSPIRE, null, ...","[Berezhnoy, A.V.,...",49
2018-12-27T00:00:...,Excited $\rho$ me...,"In this paper, ex...",2,7,"[{null, publisher...","[Luchinsky, A.V.]",22
2018-09-27T00:00:...,Doubly heavy bary...,The theoretical a...,30,14,"[{null, publisher...","[Berezhnoy, A.V.,...",56
2018-01-30T00:00:...,Charmonia Product...,In the presented ...,1,9,"[{PACS, null, 14....","[Luchinsky, A.V.]",30
2019-02-25T00:00:...,Lifetimes of Doub...,The inclusive dec...,1,11,"[{INSPIRE, null, ...","[Likhoded, A.K., ...",49
2017-12-11T00:00:...,Double Charmonia ...,This paper is dev...,8,14,"[{PACS, null, 13....","[Likhoded, A.K., ...",36


In [54]:
# Split every title on single spaces into an array column "W".
title_words = short_papers.select(split(col("title"), " ").alias("W"))

# One row per word, then count the occurrences of each word, most frequent first.
kwds_tit = (
    title_words
    .select(explode("W").alias("W"))
    .groupBy("W")
    .count()
    .orderBy(desc("count"))
)
kwds_tit.printSchema()

root
 |-- W: string (nullable = true)
 |-- count: long (nullable = false)



In [56]:
# Word counts over titles: split each title on single spaces, emit one row
# per word as column "K", and count the occurrences of each word.
# NOTE(review): the result is sorted by ascending count (rarest words first),
# while the other keyword rankings in this notebook sort descending — confirm
# that this ordering is intended.
kws_title = (
    short_papers.select("title")
    .withColumn("title", split(col("title"), " "))
    .select(explode(col("title")).alias("K"))
    .groupBy("K")
    .count()
    .sort(asc("count"))
)


In [67]:
# Print the schema of the "abstract" column.
# The call was missing its closing parenthesis, which made the cell a
# syntax error.
short_papers.limit(3).select(col("abstract")).printSchema()

root
 |-- abstract: string (nullable = true)



In [61]:
# NOTE(review): this rebinds kws_title (assigned the per-word title counts in
# an earlier cell of this file) to a plain one-column selection of titles —
# confirm the overwrite is intended. The recorded output of this cell is an
# AnalysisException, so re-check it on a fresh kernel.
kws_title = short_papers.select(col("title"))


AnalysisException: Invalid call to exprId on unresolved object

AnalysisException: Invalid call to exprId on unresolved object

In [55]:
# Show the first ten rows of kwds_tit (title-word counts sorted by descending count).
# NOTE(review): the recorded output of this cell is an AnalysisException;
# re-run on a fresh kernel to confirm it works in order.
kwds_tit.limit(10)

AnalysisException: Invalid call to nullable on unresolved object

AnalysisException: Invalid call to nullable on unresolved object