In [1]:
# Import the basic spark library
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType
from pyspark.sql.functions import from_json, col, count, when, array_contains, lower

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("NoSQLProject") \
      .getOrCreate()
# master contains the URL of your remote spark instance bor 'local'

22/12/14 23:50:28 WARN Utils: Your hostname, linux resolves to a loopback address: 127.0.1.1; using 192.168.1.172 instead (on interface enp6s0)
22/12/14 23:50:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/14 23:50:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
schemapub = StructType([ \
    StructField('id', StringType(), False), \
    StructField('title', StringType(), False), \
    StructField('page_start', IntegerType(), True), \
    StructField('page_end', IntegerType(), True), \
    StructField('year', IntegerType(), True), \
    StructField('citations', IntegerType(), True), \
    StructField('venue', StringType(), True), \
    StructField('keywords', StringType(), True) \
])

schemaref = StructType([ \
    StructField('references', StringType(), False), \
    StructField('referenced', StringType(), False) \
])

schemavenue = StructType([ \
    StructField('name', StringType(), False), \
    StructField('type', StringType(), True) \
])

schemawrites = StructType([ \
    StructField('author', StringType(), False), \
    StructField('publication', StringType(), False) \
])

schemaauthor = StructType([ \
    StructField('id', StringType(), False), \
    StructField('name', StringType(), False), \
    StructField('org', StringType(), True) \
])

In [3]:
pub = spark.read.option("header", True).option("delimiter", ";").schema(schemapub).csv("dataset/publication.csv")
pub = pub.withColumn('keywords', from_json('keywords', ArrayType(StringType())))


ref = spark.read.option("header", True).option("delimiter", ";").schema(schemaref).csv("dataset/reference.csv")


ven = spark.read.option("header", True).option("delimiter", ";").schema(schemavenue).csv("dataset/venue.csv")


writes = spark.read.option("header", True).option("delimiter", ";").schema(schemawrites).csv("dataset/writes.csv")


author = spark.read.option("header", True).option("delimiter", ";").schema(schemaauthor).csv("dataset/author.csv")

Publications

In [4]:
pub.printSchema()
pub.show(5)

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- citations: integer (nullable = true)
 |-- venue: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+--------------------+----------+--------+----+---------+--------------------+--------------------+
|  id|               title|page_start|page_end|year|citations|               venue|            keywords|
+----+--------------------+----------+--------+----+---------+--------------------+--------------------+
|1091|Preliminary Desig...|        89|      93|2013|        1|International Con...|[Telecommunicatio...|
|1388|Further Results o...|      null|    null|2000|        1|    Ars Combinatoria|[Graph, Discrete ...|
|1674|A methodology for...|       137|     144|2011|        1|International Con...|[Statue, Engineer...|
|1688|

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

References


In [5]:
ref.printSchema()
ref.show(5)

root
 |-- references: string (nullable = true)
 |-- referenced: string (nullable = true)

+----------+----------+
|references|referenced|
+----------+----------+
|     67582|    140362|
|     27301|   1426774|
|   1144321|    749663|
|   1096736|   1270031|
|    274954|    594905|
+----------+----------+
only showing top 5 rows



Venues

In [6]:
ven.printSchema()
ven.show(5)

root
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)

+--------------------+----+
|                name|type|
+--------------------+----+
|International Con...|   C|
|Symposium on Expe...|   C|
| OTM Conferences (1)|   C|
|AES Candidate Con...|   C|
|Italian Research ...|   C|
+--------------------+----+
only showing top 5 rows



Authors write a publication

In [7]:
writes.printSchema()
writes.show(5)

root
 |-- author: string (nullable = true)
 |-- publication: string (nullable = true)

+----------+-----------+
|    author|publication|
+----------+-----------+
|2063147752|     988004|
|2121858404|     559767|
|2142126378|    1534806|
|2227406975|    1689216|
|2100069631|    1664829|
+----------+-----------+
only showing top 5 rows



Authors

In [8]:
author.printSchema()
author.show(5)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- org: string (nullable = true)

+----------+----------------+--------------------+
|        id|            name|                 org|
+----------+----------------+--------------------+
|2147737326|    Doo-Hwan Bae|  Qingdao University|
|2504038831|Daniel L. Thomas|Chair of Informat...|
|2007383746|   Shyh Wei Teng|CNRS, UMR 7039, C...|
|2306001489|       A. Tuozzi| Ecole Polytechnique|
|2273196592|  Yueh-Min Huang|Bibliotheque Nati...|
+----------+----------------+--------------------+
only showing top 5 rows



**<h3>Read Queries</h3>**

<h4>1. Show the type of publications from 1990 to date. (WHERE and JOIN) </h4>

```sql
SELECT type  
FROM publication JOIN (SELECT * FROM venue)
WHERE publication.year > 1990
```

In [9]:
pub.join(ven, pub.venue == ven.name, 'inner') \
   .filter(pub.year > 1990) \
   .select('year', 'type') \
   .show(truncate = False)

+----+----+
|year|type|
+----+----+
|2013|C   |
|2000|J   |
|2011|C   |
|2009|C   |
|2009|C   |
|2004|J   |
|2011|C   |
|2003|C   |
|2012|C   |
|2014|C   |
|2013|C   |
|2002|C   |
|2006|C   |
|2003|J   |
|2008|C   |
|2002|C   |
|2013|C   |
|2009|C   |
|2008|C   |
|2008|C   |
+----+----+
only showing top 20 rows



<h4>2. Find the names of publications published in conferences(WHERE and JOIN again)</h4>

```sql
SELECT title, name  
FROM publication AS p JOIN venue AS v 
ON p.venue = v.name WHERE v.type = 'C'
```

In [10]:
pub.join(ven, pub.venue == ven.name, 'inner') \
   .select('title', 'name') \
   .filter(col('type') == 'C') \
   .show(truncate = True)

+--------------------+--------------------+
|               title|                name|
+--------------------+--------------------+
|Preliminary Desig...|International Con...|
|A methodology for...|International Con...|
|Comparison of GAR...|Pattern Recogniti...|
|COMPARING GNG3D A...|International Con...|
|Improved Secret I...|International Sym...|
|A Self-Stabilizin...|Parallel and Dist...|
|Formal agent-orie...|Asian Conference ...|
|Fur Visualisation...|International Con...|
|Identifying Psych...|International Con...|
|Multisymplectic S...|International Con...|
|The Role of the B...|Americas Conferen...|
|Speech training s...|International Joi...|
|Knowledge Enginee...|Joint Conference ...|
|Design of an audi...|Conference of the...|
|A Platform for Di...|International Con...|
|A COMPUTATIONAL S...|International Con...|
|Cleaneval: a Comp...|Language Resource...|
|Leveraging legacy...|Operating Systems...|
|A pedestrian navi...|International Con...|
|Algorithms for th...|European S

<h4>3. List books where their title ends with "i" as the penultimate letter (WHERE, LIKE and LIMIT)</h4>

```sql
SELECT p.title
FROM publication AS p
WHERE title LIKE '%i_'
LIMIT 5
```

In [11]:
pub.filter(col('title').like('%i_')) \
   .select(col('title')) \
   .limit(5) \
   .show(truncate = False)

+------------------------------------------------------------------------------------------------------------------+
|title                                                                                                             |
+------------------------------------------------------------------------------------------------------------------+
|Two notes from experimental study on image steganalysis                                                           |
|Acquiring entailment pairs across languages and domains: a data analysis                                          |
|Automated identification of thoracolumbar vertebrae using orthogonal matching pursuit                             |
|Agricultural Knowledge Management Systems in Practice: The Ability to Support Wereda Knowledge Centers in Ethiopia|
|Genetic Network Programming with Actor-Critic                                                                     |
+---------------------------------------------------------------

<h4>4. Find publications written by authors affiliated with a specific organization (WHERE, IN, Nested Query)</h4>

```sql
SELECT p.title AS Title
FROM publication AS p
WHERE p.id IN ( SELECT p2.id
                FROM writes AS w JOIN author AS a ON w.author = a.id
                WHERE a.org = 'University of Illinois at Chicago.'
              )
```              

In [12]:
org_pubs = pub.join(writes, writes.publication == pub.id) \
                .join(author, author.id == writes.author) \
                .filter(col('org') == 'University of Illinois at Chicago.') \
                .rdd.map(lambda x: x.publication).collect()

pub.filter(col('id').isin(org_pubs)).show(truncate = True)

+-------+--------------------+----------+--------+----+---------+--------------------+--------------------+
|     id|               title|page_start|page_end|year|citations|               venue|            keywords|
+-------+--------------------+----------+--------+----+---------+--------------------+--------------------+
| 212088|High dimensional ...|       176|     188|2014|        0|Similarity Search...|[Query optimizati...|
| 273782|Engineering educa...|      null|    null|1992|        0|International Con...|[Computer science...|
| 563542|Prosody for Manda...|      1133|    1136|2008|        2|Conference of the...|[Speech corpus, P...|
| 649244|Width of Points i...|       447|     452|2016|        2|Symposium on Disc...|[Discrete mathema...|
|1327162|Datalog Rewriting...|       209|     220|2014|        0|  Description Logics|[Ontology (inform...|
|1494332|Comprehensive dep...|         1|       1|2006|        3|Hot Topics in Sys...|[Middleware, Deci...|
+-------+-------------------

<h4>5. Show authors that have written at least 2 publications, sorted by number of publications (GROUP BY, JOIN, AS)</h4>

```sql
SELECT w.author AS Author, COUNT(*) AS NumberOfPublications
FROM writes AS w JOIN author AS a ON w.author = a.id
GROUP BY a.id, w.author
HAVING COUNT(*) > 1
```              

In [13]:
writes.join(author, writes.author == author.id,'inner') \
      .groupBy('author') \
      .agg(count('publication').alias('number_of_publications')) \
      .filter(col('number_of_publications') > 1) \
      .sort('number_of_publications',ascending=True) \
      .show(truncate = False)

+----------+----------------------+
|author    |number_of_publications|
+----------+----------------------+
|2100069631|2                     |
|20509179  |2                     |
|225835777 |2                     |
|2164566917|2                     |
|1983837166|2                     |
|2148400816|2                     |
|2075066679|2                     |
|220641829 |2                     |
|2065201609|2                     |
|2138334042|2                     |
|2168582154|2                     |
|2586853120|2                     |
|1966189204|2                     |
|2008578578|2                     |
|2700055908|2                     |
|2125845287|2                     |
|2304091908|2                     |
|1980371435|2                     |
|2069465661|2                     |
|2330105029|2                     |
+----------+----------------------+
only showing top 20 rows



<h4>6. Show publications (title and number of citations) whose title contains the word 'methodology' (GROUP BY and WHERE)</h4>

```sql
SELECT p.title, COUNT(*) AS citations
FROM publication AS p JOIN ref AS r ON p.id = r.referenced
WHERE p.title LIKE '%methodology%'
GROUP BY r.referenced, p.title
```

In [14]:
pub.join(ref, ref.referenced == pub.id) \
   .filter(lower(col('title')).like('%methodology%')) \
   .groupBy('referenced', 'title') \
   .count() \
   .select(col('title').alias('Title'), col('count').alias('Citations')) \
   .show(truncate = True)

+--------------------+---------+
|               Title|Citations|
+--------------------+---------+
|EVALUATION OF IT/...|        1|
|Comparing Estimat...|        9|
|Measuring Informa...|        7|
|Methodology for D...|       11|
|A Methodology and...|        9|
|Towards an Organi...|        6|
|Soft Systems Meth...|       17|
|A methodology for...|        2|
+--------------------+---------+



<h4>7. Show the IDs of publications that have cited at least 25 other publications (GROUP BY, HAVING and AS)</h4>

```sql
SELECT r.references, COUNT(*) AS references
FROM ref AS r 
GROUP BY r.references
HAVING COUNT(*) >= 25
````


In [15]:
ref.groupBy('references') \
   .count() \
   .filter(col('count') >= 25) \
   .withColumnRenamed('references', 'PublicationID') \
   .withColumnRenamed('count', 'References') \
   .show()

+-------------+----------+
|PublicationID|References|
+-------------+----------+
|      1533899|        26|
|      1552804|        28|
|       163357|        25|
+-------------+----------+



<h4>8. Show publications with at least 45 citations that contain the keyword 'Data mining' (GROUP BY, HAVING, WHERE and AS)</h4>

```sql
SELECT p.title AS Title, p.keywords AS Keywords, COUNT(*) AS citations
FROM publication AS p JOIN ref AS r ON p.id = r.referenced
WHERE array_contains(p.keywords, 'Data mining')
GROUP BY p.title, p.keywords, r.referenced
HAVING COUNT(*) >= 45
```

In [16]:
pub.join(ref, ref.referenced == pub.id) \
   .filter(array_contains(pub.keywords, 'Data mining')) \
   .groupBy('referenced', 'title', 'keywords') \
   .count() \
   .filter(col('count') >= 45) \
   .select(col('title').alias('Title'), col('keywords').alias('Keywords'), col('count').alias('Citations')) \
   .show(truncate = True)

+--------------------+--------------------+---------+
|               Title|            Keywords|Citations|
+--------------------+--------------------+---------+
|A Tool for Endosc...|[Computer vision,...|       47|
|Machine learning ...|[Data mining, Web...|       45|
|Research on Model...|[Data mining, Soc...|       47|
|Mining Correlated...|[Data mining, Com...|       45|
|Factorisation of ...|[Data mining, Inf...|       49|
|"Combining Text a...|[Data mining, Tex...|       48|
+--------------------+--------------------+---------+



<h4>9. Find publications that are referenced more than the average that contain the keywords 'Data Mining' and 'Computer Science', ordered by the number of publications in ascending order. (WHERE, Nested query, GROUP BY)</h4>

```sql
SELECT p.title, COUNT(*) AS citations
FROM publication AS p JOIN ref AS r ON p.id = r.referenced
WHERE array_contains(p.keywords, ['Data mining', 'Computer science'])
GROUP BY p.title, r.referenced
HAVING COUNT(*) > ( SELECT AVG(counter)
                    FROM (SELECT COUNT(*) AS counter
                          FROM ref AS r2
                          GROUP BY r2.referenced
                         )
                  )

````

In [17]:
average = ref.groupBy('referenced').count() \
             .sort('count',ascending=False) \
             .groupBy() \
             .avg('count') \
             .collect()[0][0]

pub.join(ref,ref.referenced == pub.id, 'left') \
   .select('title', 'id', 'referenced', 'references', 'keywords') \
   .groupBy('title', 'keywords') \
   .agg(count('references').alias('number_of_references')) \
   .filter((col('number_of_references') > average) & \
            array_contains(pub.keywords, 'Computer science') & \
            array_contains(pub.keywords, 'Data mining')) \
   .sort('number_of_references',ascending = True) \
   .show()

+--------------------+--------------------+--------------------+
|               title|            keywords|number_of_references|
+--------------------+--------------------+--------------------+
|Modularizing Onto...|[Distributed know...|                  13|
|Modeling for Opti...|[Data mining, Com...|                  13|
|TOWARDS AN EXTEND...|[Information syst...|                  15|
|Rough Sets-Based ...|[Data mining, Equ...|                  16|
|Finite model theo...|[Data mining, Fin...|                  17|
|DART: an efficien...|[Query optimizati...|                  18|
|A Minimum Descrip...|[Bottleneck, Grap...|                  18|
|A search-engine c...|[Data mining, Fea...|                  19|
|Association Rules...|[Data mining, Com...|                  19|
|Two Evolution Ind...|[Linear equation,...|                  19|
|Bridging the gaps...|[Semi-structured ...|                  20|
|A Dynamical Syste...|[Data mining, Com...|                  20|
|An algorithm for ...|[An

<h4>10. Find the publications with the average number of citations and the types of conferences they are in (WHERE, GROUP BY, HAVING, 1 JOINs)</h4>

```sql
SELECT p.title AS Title, p.id AS ID, p.venue AS Venue, v.type AS Type
FROM publication AS p JOIN venue AS v ON p.venue = v.name
WHERE p.id IN ( SELECT p2.id 
                FROM publication AS p2 JOIN ref AS r ON p2.id = r.referenced
                GROUP BY p2.id
                HAVING COUNT(*) = ( SELECT AVG(citations)
                                    FROM ( SELECT COUNT(*) AS citations
                                           FROM ref AS r 
                                           GROUP BY r.referenced
                                         )
                                   )
              )
````


In [18]:
avg = pub.join(ref, ref.referenced == pub.id) \
         .groupBy('id') \
         .count() \
         .groupBy() \
         .avg('count') \
         .collect()[0][0]

avg = int(avg)

avg_pubs = pub.join(ref, ref.referenced == pub.id) \
              .groupBy('id') \
              .count() \
              .filter(col('count') == avg) \
              .rdd.map(lambda x: x.id).collect()

pub.join(ven, ven.name == pub.venue) \
   .filter(col('id').isin(avg_pubs)) \
   .select(col('id'), col('title'), col('venue'), col('type')) \
   .withColumnRenamed('id', 'ID') \
   .withColumnRenamed('title', 'Title') \
   .withColumnRenamed('venue', 'Venue') \
   .withColumnRenamed('type', 'VType') \
   .show(truncate = True)

+-------+--------------------+--------------------+-----+
|     ID|               Title|               Venue|VType|
+-------+--------------------+--------------------+-----+
|  43929|Speech recognitio...|Conference of the...|    C|
|  74721|Exploring New Car...|International Con...|    C|
|  65230|Zur Modellierung ...|                 WLP|    C|
|  78484|Collaboration-bas...|               VVEIS|    C|
| 163313|Exponentially Smo...|International Con...|    C|
| 284492|Organizational Su...|Working Conferenc...|    C|
| 268343|An Illustrative D...|                 INC|    C|
| 330141|Evaluating the Mu...|            IP&amp;C|    C|
| 402724|Abstract Task Def...|International Con...|    C|
| 344196|Algebraic, Operat...|Australian Comput...|    C|
| 558092|Cryptanalysis of ...|International Jou...|    J|
| 630671|NLP EAC Recogniti...|Computer Analysis...|    C|
| 755599|Lessons Learned f...| Open Source Systems|    C|
| 806442|Orderings of Fini...|    Ars Combinatoria|    J|
| 730754|Suppo

<h4>11. Find the most cited publication(s) in the publications table, list its (their) name(s), ID(s), venue(s) with type and authors (one row per author and publication). (WHERE, GROUP BY, HAVING, 2 JOINs)</h4>

```sql
SELECT p.title AS Title, p.id AS ID, p.venue AS Venue, v.type AS Type, author.name AS author
FROM publication AS p JOIN venue AS v ON p.venue = v.name
        JOIN writes AS w ON w.publication = p.id
        JOIN author AS a ON a.id = w.author
WHERE p.id IN ( SELECT p2.id 
                FROM publication AS p2 JOIN ref AS r ON p2.id = r.referenced
                GROUP BY p2.id
                HAVING COUNT(*) = ( SELECT MAX(citations)
                                    FROM ( SELECT COUNT(*)
                                           FROM ref AS r
                                           GROUP BY r.referenced
                                        )
                                   
                                   )
               )
```

In [19]:
max = pub.join(ref, ref.referenced == pub.id) \
         .groupBy('id') \
         .count() \
         .groupBy() \
         .max('count') \
         .collect()[0][0]

max_pubs = pub.join(ref, ref.referenced == pub.id) \
              .groupBy('id') \
              .count() \
              .filter(col('count') == max) \
              .rdd.map(lambda x: x.id).collect()

pub.join(ven, ven.name == pub.venue) \
   .filter(col('id').isin(max_pubs)) \
   .join(writes, writes.publication == pub.id) \
   .join(author, writes.author == author.id) \
   .drop(author.id) \
   .drop(ven.name) \
   .select(col('id'), col('title'), col('venue'), col('type'), col('name')) \
   .withColumnRenamed('id', 'ID') \
   .withColumnRenamed('title', 'Title') \
   .withColumnRenamed('venue', 'Venue') \
   .withColumnRenamed('type', 'VType') \
   .withColumnRenamed('name', 'Author') \
   .show(truncate = True)

+-------+--------------------+--------------------+-----+--------------------+
|     ID|               Title|               Venue|VType|              Author|
+-------+--------------------+--------------------+-----+--------------------+
|1005427|Tough-Maximum Gra...|    Ars Combinatoria|    J|            N. Priya|
|1005427|Tough-Maximum Gra...|    Ars Combinatoria|    J|Sheshayya A. Choudum|
|1031492|Modeling reusable...|Software Engineer...|    C| Kendra M. L. Cooper|
|1031492|Modeling reusable...|Software Engineer...|    C|          Lirong Dai|
|1031492|Modeling reusable...|Software Engineer...|    C|        W. Eric Wong|
|1086356|Factorisation of ...|International Con...|    C|     Bruno Van Damme|
|1086356|Factorisation of ...|International Con...|    C|    Viviane Jonckers|
|1086356|Factorisation of ...|International Con...|    C|      Katja Verbeeck|
|1472329|Development of Co...|Proceedings of th...|    C|Johannes Freudenmann|
|1700909|Selective reformu...|International Con...| 

<h4>1. Update the publication attribute with the number of pages column for each article.</h4>

In [20]:
pub = pub.withColumn("pages"
                     , when(col("page_start") \
                            .isNotNull() \
                            , pub['page_end'] - pub['page_start']) \
                     .otherwise(None) \
                     .alias("pages")) \

pub.select('title','page_start','page_end','pages').show()

+--------------------+----------+--------+-----+
|               title|page_start|page_end|pages|
+--------------------+----------+--------+-----+
|Preliminary Desig...|        89|      93|    4|
|Further Results o...|      null|    null| null|
|A methodology for...|       137|     144|    7|
|Comparison of GAR...|       597|     602|    5|
|COMPARING GNG3D A...|        99|     102|    3|
|Vectorial fast co...|      null|    null| null|
|Improved Secret I...|       331|     335|    4|
|A Self-Stabilizin...|      1460|    1463|    3|
|Formal agent-orie...|       498|     508|   10|
|Fur Visualisation...|        41|      48|    7|
|Identifying Psych...|       728|     739|   11|
|Multisymplectic S...|       486|     495|    9|
|The Role of the B...|      null|    null| null|
|Speech training s...|       800|     802|    2|
|Software Evolutio...|         1|       5|    4|
|Knowledge Enginee...|       305|     314|    9|
|Design of an audi...|      null|    null| null|
|A Platform for Di..

<h4>2. Delete every publication that has 'keywords' = <i>null</i> or has got no citation.</h4>

In [21]:
pub = pub.filter(col('keywords').isNotNull()) \
         .filter(col('citations') > 0)

<h4>3. Add the total of the citations for each author in "<i>author</i>" table.</h4>

In [22]:
author = pub.join(writes, writes.publication == pub.id) \
            .join(author, writes.author == author.id) \
            .drop(pub.id) \
            .select(col('id'),col('citations')) \
            .groupBy('id') \
            .sum('citations') \
            .withColumnRenamed('sum(citations)','total_citations') \
            .join(author,author.id == author.id) \
            .drop(author.id) \
            .select('id','name','org','total_citations') \
# writes.withColumn()
author.show()



22/12/14 23:50:40 WARN Column: Constructing trivially true equals predicate, 'id#38 = id#38'. Perhaps you need to use aliases.
+----------+--------------------+--------------------+---------------+
|        id|                name|                 org|total_citations|
+----------+--------------------+--------------------+---------------+
|2120725579|        Lang M. Hung|Intelligent Secur...|              8|
| 324554506|       Erdogan Dogdu|Tokyo Inst. of tech.|              2|
|2112598428|       Nigel Collier|             TU WIEN|              1|
|2059436065|      Mario Piattini|Institut für Ange...|              4|
|2091504235|        Manfred Broy|Universitat Polit...|              4|
|2097392627|     Shalini Chandra|East China Normal...|              3|
|2234823802| Plácido R. Pinheiro|Unconventional Co...|              3|
|2119358681|Henrique M. G. Ma...|Faculty of Scienc...|              2|
|2004230566|     Michael Waidner|Edinburgh Napier ...|             12|
|2073859532|       Da

<h4>4. Change every instance of <i>Ecole Polytecnique</i> in <i>Ecole Polytecnique Federale de Lausanne.</i></h4>

In [23]:
author.filter(col('org') == 'Ecole Polytechnique').show(truncate=False)
author = author.withColumn('org' \
                           , when(col('org') == 'Ecole Polytechnique' \
                                  , 'Ecole Polytecnique Federale de Lausanne') \
                           .otherwise(col('org'))) 
author.filter(col('org') == 'Ecole Polytecnique Federale de Lausanne').show(truncate=False)

+----------+-------------+-------------------+---------------+
|id        |name         |org                |total_citations|
+----------+-------------+-------------------+---------------+
|2204904887|Anuj C. Desai|Ecole Polytechnique|4              |
|2306001489|A. Tuozzi    |Ecole Polytechnique|2              |
|2616136787|Hong Wu      |Ecole Polytechnique|13             |
+----------+-------------+-------------------+---------------+

+----------+-------------+---------------------------------------+---------------+
|id        |name         |org                                    |total_citations|
+----------+-------------+---------------------------------------+---------------+
|2204904887|Anuj C. Desai|Ecole Polytecnique Federale de Lausanne|4              |
|2306001489|A. Tuozzi    |Ecole Polytecnique Federale de Lausanne|2              |
|2616136787|Hong Wu      |Ecole Polytecnique Federale de Lausanne|13             |
+----------+-------------+----------------------------------

<h4>5. Create a table with only the main keyword for each publication (considering the first keyword the main one).</h4>

In [24]:
from pyspark.sql.functions import explode,first

p_exp = pub.select('id','title','page_start' \
                          ,'page_end','pages','year' \
                          ,'citations','venue' \
                          , explode(pub.keywords).alias('keyword'))


p_main_k = p_exp.groupBy('id','title','page_start' \
                         ,'page_end', 'pages' \
                         ,'year','citations','venue') \
                .agg(first('id').alias('_i') \
                     , first('title').alias('_t') \
                     , first('page_start').alias('_ps') \
                     , first('page_end').alias('_pe') \
                     , first('pages').alias('_p') \
                     , first('year').alias('_y') \
                     , first('citations').alias('_c') \
                     , first('venue').alias('_v') \
                     , first('keyword').alias('keyword')) \
                .drop('id','title','page_start','page_end' \
                      , 'pages','year','citations','venue') \
                .withColumnRenamed('_i','id') \
                .withColumnRenamed('_t','title') \
                .withColumnRenamed('_ps','page_start') \
                .withColumnRenamed('_pe','page_end') \
                .withColumnRenamed('_p','pages') \
                .withColumnRenamed('_y','year') \
                .withColumnRenamed('_c','citations') \
                .withColumnRenamed('_v','venue') \
                .sort(col('keyword').asc())

        
p_main_k.select('title','keyword').show(1,truncate=False)

+------------------------------------------------------------------------------------+----------+
|title                                                                               |keyword   |
+------------------------------------------------------------------------------------+----------+
|Towards Automatically Generated Tactile Detail Maps by 3D Printers for Blind Persons|3d printer|
+------------------------------------------------------------------------------------+----------+
only showing top 1 row

