In [1]:
# Location of the input dataset (a gs:// URI); it is read as tab-separated text in the next cell.
filename = 'gs://st446-bucket-lx/data/author-large.txt'


In [2]:
# Import only the type classes the schema uses (instead of a wildcard) so it is
# clear where every name in the notebook namespace comes from.
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Explicit schema for the four tab-separated columns; every column is nullable.
# Declaring it up front makes `year` load as a long rather than as a string.
schema = StructType([
    StructField("author", StringType(), True),
    StructField("journal", StringType(), True),
    StructField("title", StringType(), True),
    StructField("year", LongType(), True),
])

# The file has no header row, so the first line is treated as data.
author_large = spark.read.csv(filename, header=False, schema=schema, sep='\t')

# Register the DataFrame under a view name so it can be queried with spark.sql.
author_large.createOrReplaceTempView("author_large")

In [3]:
# Peek at the first row to check that the columns were parsed as expected.
author_large.head()

                                                                                

Row(author='Jurgen Annevelink', journal='Modern Database Systems', title='Object SQL - A Language for the Design and Implementation of Object Databases.', year=1995)

In [4]:
# Confirm the column names, types and nullability of the loaded DataFrame.
author_large.printSchema()

root
 |-- author: string (nullable = true)
 |-- journal: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [5]:
# Preview the first 10 rows; long string values are truncated in the printed table.
author_large.show(10)

+--------------------+--------------------+--------------------+----+
|              author|             journal|               title|year|
+--------------------+--------------------+--------------------+----+
|   Jurgen Annevelink|Modern Database S...|Object SQL - A La...|1995|
|         Rafiul Ahad|Modern Database S...|Object SQL - A La...|1995|
|      Amelia Carlson|Modern Database S...|Object SQL - A La...|1995|
|   Daniel H. Fishman|Modern Database S...|Object SQL - A La...|1995|
|  Michael L. Heytens|Modern Database S...|Object SQL - A La...|1995|
|        William Kent|Modern Database S...|Object SQL - A La...|1995|
|     Jos A. Blakeley|Modern Database S...|OQL[C++]: Extendi...|1995|
|      Yuri Breitbart|Modern Database S...|Transaction Manag...|1995|
|Hector Garcia-Molina|Modern Database S...|Transaction Manag...|1995|
|Abraham Silberschatz|Modern Database S...|Transaction Manag...|1995|
+--------------------+--------------------+--------------------+----+
only showing top 10 rows

In [15]:
# Top 10 most frequent co-author pairs.
# The view is self-joined on title, so every two authors listed on the same
# title produce a row. The strict ordering `a.author > b.author` keeps each
# unordered pair exactly once and also excludes pairing an author with
# themselves, so no separate `!=` check is needed. Because that predicate
# already rejects NULL authors, COUNT(*) counts the same rows as counting the
# two author columns.
# NOTE(review): rows are matched on title alone, so distinct papers that share
# a title (or duplicated author/title rows) all count towards a pair — confirm
# this is acceptable for this dataset.
top_coauthor_pairs = spark.sql("""
    SELECT a.author AS author_a,
           b.author AS author_b,
           COUNT(*) AS cnt
    FROM author_large a
    JOIN author_large b
      ON a.title = b.title
     AND a.author > b.author
    GROUP BY a.author, b.author
    ORDER BY cnt DESC
    LIMIT 10
""")
top_coauthor_pairs.show()



+--------------------+-------------------+---+
|            author_a|           author_b|cnt|
+--------------------+-------------------+---+
|   Sudhakar M. Reddy|     Irith Pomeranz|249|
|   Divyakant Agrawal|      Amr El Abbadi|161|
|      Tomoya Enokido|    Makoto Takizawa|141|
|         Henri Prade|      Didier Dubois|122|
|    Tharam S. Dillon|    Elizabeth Chang|118|
|       Kee-Young Yoo|      Hyun-Sung Kim|111|
|Narayanan Vijaykr...|    Mary Jane Irwin|107|
|     Mary Jane Irwin| Mahmut T. Kandemir|100|
|           Jiajun Bu|          Chun Chen| 99|
|  Maurizio Lenzerini|Giuseppe De Giacomo| 99|
+--------------------+-------------------+---+



                                                                                