In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType
from pyspark.sql.functions import from_json,col,count,when

# Entry point to the PySpark application.
# master() takes the URL of a remote Spark instance, or 'local' to run Spark on this machine.
spark = (
    SparkSession.builder
    .master("local")
    .appName("NoSQLProject")
    .getOrCreate()
)

In [None]:
# Explicit schemas for the five CSV tables. Columns declared nullable=False
# are the ones the original schema marks as required.

# Publications: one row per publication.
schemapub = StructType([
    StructField('id', StringType(), nullable=False),
    StructField('title', StringType(), nullable=False),
    StructField('page_start', IntegerType(), nullable=True),
    StructField('page_end', IntegerType(), nullable=True),
    StructField('year', IntegerType(), nullable=True),
    StructField('citations', IntegerType(), nullable=True),
    StructField('venue', StringType(), nullable=True),
    # Read as a raw string here; parsed into an array when the table is loaded.
    StructField('keywords', StringType(), nullable=True)
])

# References: a pair of publication ids (citing -> cited).
schemaref = StructType([
    StructField('references', StringType(), nullable=False),
    StructField('referenced', StringType(), nullable=False)
])

# Venues: name plus an optional type code.
schemavenue = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('type', StringType(), nullable=True)
])

# Authorship: links an author id to a publication id.
schemawrites = StructType([
    StructField('author', StringType(), nullable=False),
    StructField('publication', StringType(), nullable=False)
])

# Authors: id, name and optional organisation.
schemaauthor = StructType([
    StructField('id', StringType(), nullable=False),
    StructField('name', StringType(), nullable=False),
    StructField('org', StringType(), nullable=True)
])

In [None]:
# Directory that holds the CSV files, one file per table.
DATA_DIR = "dataset"


def read_table(file_name, schema):
    """Read one CSV file from DATA_DIR into a DataFrame.

    The files have a header row and use ';' as the field delimiter.

    Args:
        file_name: name of the CSV file inside DATA_DIR, e.g. "venue.csv".
        schema: StructType applied to the file's columns.

    Returns:
        The DataFrame produced by spark.read for that file.
    """
    return (
        spark.read
        .option("header", True)
        .option("delimiter", ";")
        .schema(schema)
        .csv(DATA_DIR + "/" + file_name)
    )


# 'keywords' is read as a plain string, then parsed as a JSON array of strings.
# Built in a single expression so re-running the cell always starts from the file.
pub = read_table("publication.csv", schemapub) \
    .withColumn('keywords', from_json('keywords', ArrayType(StringType())))

ref = read_table("reference.csv", schemaref)

ven = read_table("venue.csv", schemavenue)

writes = read_table("writes.csv", schemawrites)

author = read_table("author.csv", schemaauthor)

Publications

In [None]:
# Inspect the publications table: column names and types, then the first 20 rows.
pub.printSchema()
pub.show(n=20)

References


In [None]:
# Inspect the references table: column names and types, then the first 20 rows.
ref.printSchema()
ref.show(n=20)

Venues

In [None]:
# Inspect the venues table: column names and types, then the first 20 rows.
ven.printSchema()
ven.show(n=20)


Authors write a publication

In [None]:
# Inspect the authorship table: column names and types, then the first 20 rows.
writes.printSchema()
writes.show(n=20)

Authors

In [None]:
# Inspect the authors table: column names and types, then the first 20 rows.
author.printSchema()
author.show(n=20)

### Queries

Show the type of publications from 1990 to date.

```sql
select type
from publication join venue on publication.venue = venue.name
where publication.year > 1990
```


In [None]:
# Year and venue type of every publication published after 1990.
# The join needs an explicit key (publication.venue = venue.name): without one,
# every publication is paired with every venue and the reported types are meaningless.
# NOTE(review): the task text says "from 1990"; confirm whether 1990 itself
# should be included (>= 1990).
(
    pub.join(ven, pub.venue == ven.name, 'inner')
    .filter(pub.year > 1990)
    .select('year', 'type')
    .show(truncate=False)
)

Find the titles of publications together with the names of the venues they were published in, restricted to venues of type 'C'.

In [None]:
# Title and venue name of every publication whose venue has type 'C'.
# The filter runs before the projection so that 'type' is still a column of
# the DataFrame when it is referenced.
(
    pub.join(ven, pub.venue == ven.name, 'inner')
    .filter(col('type') == 'C')
    .select('title', 'name')
    .show()
)

List the books whose title has "i" as its penultimate letter.

```sql
select P.title
from Publication as P
where P.title like '%i_'
```

In [None]:
# '%i_' matches any title whose second-to-last character is 'i':
# '%' is any prefix, '_' is exactly one trailing character.
penultimate_is_i = col('title').like('%i_')
pub.where(penultimate_is_i).select('title').show(truncate=False)

Identify the most common book in the publications table. \
Then list all the attributes and number of venues it is in.

```sql
select publication.*, cnt
from publication natural join (
select IDP as id,count(*) as cnt
from venue
group by IDP
having count(*) = (select max(A.cnt)
from (select IDP,count(*) as cnt
from venue
group by IDP) A)) B
```

Show authors with at least two publications, ordered by number of publications. \
(GROUP BY, 1 JOIN, AS)

In [None]:
# Authors with at least two publications, fewest publications first.
# The inner join keeps only authorship rows whose author id exists in `author`.
# NOTE(review): the result shows the author id only; confirm whether the
# author's name should be displayed as well.
publications_per_author = (
    writes.join(author, writes.author == author.id, 'inner')
    .groupBy('author')
    .agg(count('publication').alias('number_of_publications'))
)

(
    publications_per_author
    .where(col('number_of_publications') > 1)
    .orderBy('number_of_publications', ascending=True)
    .show()
)


Find publications that are referenced more often than the average, ordered by their number of references in ascending order. \
(NESTED QUERY)


In [None]:
# Average number of incoming references over all referenced publications.
# No sort is applied first: ordering the rows cannot change a global average.
average = (
    ref.groupBy('referenced').count()
    .groupBy()
    .avg('count')
    .collect()[0][0]
)

# Incoming references per publication, keeping those above the average.
# Grouping by id as well as title stops distinct publications that share a
# title from being merged into one row with a combined count.
# Publications never referenced get a count of 0 from the left join
# (count() skips nulls).
(
    pub.join(ref, ref.referenced == pub.id, 'left')
    .groupBy('id', 'title')
    .agg(count('references').alias('number_of_references'))
    .filter(col('number_of_references') > average)
    .sort('number_of_references', ascending=True)
    .show()
)

Add a `pages` column to the publication DataFrame containing the number of pages of each article.

In [None]:
# Number of pages of each article.
# Assumes page_start and page_end are both inclusive (TODO confirm against the
# dataset), so an article on pages 10-15 has 15 - 10 + 1 = 6 pages.
# No explicit null check is needed: if either bound is null the arithmetic
# result is null as well.
# withColumn replaces an existing 'pages' column, so re-running this cell is safe.
pub = pub.withColumn("pages", col("page_end") - col("page_start") + 1)

pub.select('title', 'page_start', 'page_end', 'pages').show()