In [2]:
# Import the basic spark library
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType
from pyspark.sql.functions import from_json, col, count, when, array_contains

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("NoSQLProject") \
      .getOrCreate()
# master contains the URL of your remote spark instance bor 'local'

22/12/05 19:55:46 WARN Utils: Your hostname, linux resolves to a loopback address: 127.0.1.1; using 192.168.1.172 instead (on interface enp6s0)
22/12/05 19:55:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/05 19:55:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
schemapub = StructType([ \
    StructField('id', StringType(), False), \
    StructField('title', StringType(), False), \
    StructField('page_start', IntegerType(), True), \
    StructField('page_end', IntegerType(), True), \
    StructField('year', IntegerType(), True), \
    StructField('citations', IntegerType(), True), \
    StructField('venue', StringType(), True), \
    StructField('keywords', StringType(), True) \
])

schemaref = StructType([ \
    StructField('references', StringType(), False), \
    StructField('referenced', StringType(), False) \
])

schemavenue = StructType([ \
    StructField('name', StringType(), False), \
    StructField('type', StringType(), True) \
])

schemawrites = StructType([ \
    StructField('author', StringType(), False), \
    StructField('publication', StringType(), False) \
])

schemaauthor = StructType([ \
    StructField('id', StringType(), False), \
    StructField('name', StringType(), False), \
    StructField('org', StringType(), True) \
])

In [91]:
pub = spark.read.option("header", True).option("delimiter", ";").schema(schemapub).csv("dataset/publication.csv")
pub = pub.withColumn('keywords', from_json('keywords', ArrayType(StringType())))


ref = spark.read.option("header", True).option("delimiter", ";").schema(schemaref).csv("dataset/reference.csv")


ven = spark.read.option("header", True).option("delimiter", ";").schema(schemavenue).csv("dataset/venue.csv")


writes = spark.read.option("header", True).option("delimiter", ";").schema(schemawrites).csv("dataset/writes.csv")


author = spark.read.option("header", True).option("delimiter", ";").schema(schemaauthor).csv("dataset/author.csv")

Publications

In [92]:
pub.printSchema()
pub.show(5)

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- citations: integer (nullable = true)
 |-- venue: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+--------------------+----------+--------+----+---------+--------------------+--------------------+
|  id|               title|page_start|page_end|year|citations|               venue|            keywords|
+----+--------------------+----------+--------+----+---------+--------------------+--------------------+
|1091|Preliminary Desig...|        89|      93|2013|        1|International Con...|[Telecommunicatio...|
|1388|Further Results o...|      null|    null|2000|        1|    Ars Combinatoria|[Graph, Discrete ...|
|1674|A methodology for...|       137|     144|2011|        1|International Con...|[Statue, Engineer...|
|1688|

References


In [93]:
ref.printSchema()
ref.show(5)

root
 |-- references: string (nullable = true)
 |-- referenced: string (nullable = true)

+----------+----------+
|references|referenced|
+----------+----------+
|   1377244|    844961|
|   1699467|   1766746|
|    988455|   1295707|
|    252754|    396995|
|   1568722|    444631|
+----------+----------+
only showing top 5 rows



Venues

In [94]:
ven.printSchema()
ven.show(5)

root
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)

+--------------------+----+
|                name|type|
+--------------------+----+
|                 MMB|   C|
|IEEE Internationa...|   C|
|Intelligent Infor...|   C|
|International Con...|   C|
|Intelligent Envir...|   C|
+--------------------+----+
only showing top 5 rows



Authors write a publication

In [95]:
writes.printSchema()
writes.show(5)

root
 |-- author: string (nullable = true)
 |-- publication: string (nullable = true)

+----------+-----------+
|    author|publication|
+----------+-----------+
|2063147752|     988004|
|2121858404|     559767|
|2142126378|    1534806|
|2227406975|    1689216|
|2100069631|    1664829|
+----------+-----------+
only showing top 5 rows



Authors

In [96]:
author.printSchema()
author.show(5)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- org: string (nullable = true)

+----------+-------------------+--------------------+
|        id|               name|                 org|
+----------+-------------------+--------------------+
|2394632022|       Jan-Bon Chen|                null|
|2791585711|J. E. Kapusnik-Uner|                null|
|2777059883|        Sadao Obana|                null|
|2618245063|        David Weiss|Computer and Info...|
|2404679103|     Michal Helwich|                null|
+----------+-------------------+--------------------+
only showing top 5 rows



**<h3>Queries</h3>**

1. Show the type of publications from 1990 to date. (WHERE and JOIN)

```sql
SELECT type  
FROM publication JOIN (SELECT * FROM venue)
WHERE publication.year > 1990


In [97]:
pub.join(ven, pub.venue == ven.name, 'inner') \
            .filter(pub.year > 1990) \
            .select('year', 'type') \
            .show(truncate=False)

+----+----+
|year|type|
+----+----+
|2013|C   |
|2000|J   |
|2011|C   |
|2009|C   |
|2009|C   |
|2004|J   |
|2011|C   |
|2003|C   |
|2012|C   |
|2014|C   |
|2013|C   |
|2002|C   |
|2006|C   |
|2003|J   |
|2008|C   |
|2002|C   |
|2013|C   |
|2009|C   |
|2008|C   |
|2008|C   |
+----+----+
only showing top 20 rows



2. Find name of publications related to the venues written for (WHERE and JOIN again)

```sql
SELECT title, name  
FROM publication AS p JOIN venue AS v 
ON p.venue = v.name WHERE v.type = 'C'


In [98]:
pub.join(ven, pub.venue == ven.name, 'inner') \
            .select('title', 'name') \
            .filter(col('type') == 'C') \
            .show(truncate = True)

+--------------------+--------------------+
|               title|                name|
+--------------------+--------------------+
|Preliminary Desig...|International Con...|
|A methodology for...|International Con...|
|Comparison of GAR...|Pattern Recogniti...|
|COMPARING GNG3D A...|International Con...|
|Improved Secret I...|International Sym...|
|A Self-Stabilizin...|Parallel and Dist...|
|Formal agent-orie...|Asian Conference ...|
|Fur Visualisation...|International Con...|
|Identifying Psych...|International Con...|
|Multisymplectic S...|International Con...|
|The Role of the B...|Americas Conferen...|
|Speech training s...|International Joi...|
|Knowledge Enginee...|Joint Conference ...|
|Design of an audi...|Conference of the...|
|A Platform for Di...|International Con...|
|A COMPUTATIONAL S...|International Con...|
|Cleaneval: a Comp...|Language Resource...|
|Leveraging legacy...|Operating Systems...|
|A pedestrian navi...|International Con...|
|Algorithms for th...|European S

3. List books where their title ends with "i" as penultimum letter (WHERE, LIKE and LIMIT)

```sql
SELECT P.title
FROM Publication AS P 
WHERE title LIKE '%i_'
LIMIT 5


In [99]:
pub.filter(col('title').like('%i_')) \
            .select(col('title')) \
            .limit(5) \
            .show(truncate=False)

+------------------------------------------------------------------------------------------------------------------+
|title                                                                                                             |
+------------------------------------------------------------------------------------------------------------------+
|Two notes from experimental study on image steganalysis                                                           |
|Acquiring entailment pairs across languages and domains: a data analysis                                          |
|Automated identification of thoracolumbar vertebrae using orthogonal matching pursuit                             |
|Agricultural Knowledge Management Systems in Practice: The Ability to Support Wereda Knowledge Centers in Ethiopia|
|Genetic Network Programming with Actor-Critic                                                                     |
+---------------------------------------------------------------

4. Find publications written by authors affiliated with a specific organization (WHERE, IN, Nested Query)

```sql
SELECT p.title AS Title
FROM Publication AS p
WHERE p.id IN ( SELECT p2.id
                FROM writes AS w JOIN author AS a ON w.author = a.id
                WHERE a.org = 'University of Illinois at Chicago.'
              )
              

In [100]:
org_pubs = pub.join(writes, writes.publication == pub.id) \
                .join(author, author.id == writes.author) \
                .filter(col('org') == 'University of Illinois at Chicago.') \
                .rdd.map(lambda x: x.publication).collect()

pub.filter(col('id').isin(org_pubs)) \
    .show()

+------+--------------------+----------+--------+----+---------+--------------------+--------------------+
|    id|               title|page_start|page_end|year|citations|               venue|            keywords|
+------+--------------------+----------+--------+----+---------+--------------------+--------------------+
| 34988|Optimization of h...|       303|     324|1997|       31|Annals of Operati...|[Genetic operator...|
|961884|Outlier Detection...|       483|     493|2008|      113|SIAM Internationa...|[Data point, Anom...|
+------+--------------------+----------+--------+----+---------+--------------------+--------------------+



5. Show publications and their number of citations (GROUP BY, JOIN, AS)

11. Find the most cited publication(s) in the publications table, list its (their) name(s), ID(s), venue(s) with type and authors (one row per author and publication). (WHERE, GROUP BY, HAVING, 2 JOINs)

```sql
SELECT p.title AS Title, p.id AS ID, p.venue AS Venue, v.type AS Type, author.name AS author
FROM publication AS p JOIN venue AS v ON p.venue = v.name
        JOIN writes AS w ON w.publication = p.id
        JOIN author AS a ON a.id = w.author
WHERE p.id IN ( SELECT p2.id 
                FROM publication AS p2 JOIN ref AS r ON p2.id = r.referenced
                GROUP BY p2.id
                HAVING COUNT(*) = (SELECT MAX(COUNT(*))
                                   FROM publication AS p3 JOIN ref AS r2 ON p3.id = r2.referenced
                                   GROUP BY p3.id)
              )


In [103]:
max = pub.join(ref, ref.references == pub.id) \
            .groupBy('id') \
            .count() \
            .groupBy() \
            .max('count') \
            .collect()[0][0]

max_pubs = pub.join(ref, ref.references == pub.id) \
            .groupBy('id') \
            .count() \
            .filter(col('count') == max) \
            .rdd.map(lambda x: x.id).collect()

pub.join(ven, ven.name == pub.venue) \
            .join(writes, writes.publication == pub.id) \
            .join(author, writes.author == author.id) \
            .drop(author.id) \
            .drop(ven.name) \
            .filter(col('id').isin(max_pubs)) \
            .select(col('id'), col('title'), col('venue'), col('type'), col('name')) \
            .withColumnRenamed('id', 'ID') \
            .withColumnRenamed('title', 'Title') \
            .withColumnRenamed('venue', 'Venue') \
            .withColumnRenamed('type', 'VType') \
            .withColumnRenamed('type', 'Author') \
            .limit(5) \
            .show(truncate = True)

+-------+--------------------+--------------------+-----+----------------+
|     ID|               Title|               Venue|VType|            name|
+-------+--------------------+--------------------+-----+----------------+
|1235110|Impact of Communi...|International Con...|    C|Dimitar Trajanov|
|1235110|Impact of Communi...|International Con...|    C|Dimitar Trajanov|
|1235110|Impact of Communi...|International Con...|    C| Sonja Filiposka|
|1235110|Impact of Communi...|International Con...|    C| Sonja Filiposka|
+-------+--------------------+--------------------+-----+----------------+

26


Show authors with at least two publications, ordered by number of publications. \
(GROUP BY, 1 JOIN, AS)

In [None]:
writes.join(author,writes.author == author.id,'inner') \
    .groupBy('author') \
    .agg(count('publication').alias('number_of_publications')) \
    .filter(col('number_of_publications') > 1) \
    .sort('number_of_publications',ascending=True) \
    .show()


Find publications that are referenced more than the average, ordered by the number of publications in ascending order. \
(NESTED QUERY)


In [None]:
average = ref.groupBy('referenced').count() \
        .sort('count',ascending=False) \
        .groupBy() \
        .avg('count') \
        .collect()[0][0]

pub.join(ref,ref.referenced == pub.id,'left') \
    .select('title','id','referenced','references') \
    .groupBy('title') \
    .agg(count('references').alias('number_of_references')) \
    .filter(col('number_of_references') > average) \
    .sort('number_of_references',ascending=True) \
    .show()

Update the publication dataframe with the number of pages column for each article.

In [None]:
pub = pub.withColumn("pages",when( col("page_start").isNotNull(), pub['page_end']-pub['page_start']).otherwise( None ).alias("pages")) \

pub.select('title','page_start','page_end','pages').show()