In [3]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("joins").getOrCreate()

In [4]:
spark

In [5]:
valuesP = [('koala',1,'yes'),('caterpillar',2,'yes'),('deer',3,'yes'),('human',4,'yes')]
eats_plants = spark.createDataFrame(valuesP,['name','id','eats_plants'])

valuesM = [('shark',5,'yes'),('lion',6,'yes'),('tiger',7,'yes'),('human',4,'yes')]
eats_meat = spark.createDataFrame(valuesM,['name','id','eats_meat'])

print("Plant eaters (herbivores)")
print(eats_plants.show())
print("Meat eaters (carnivores)")
print(eats_meat.show())

Plant eaters (herbivores)


                                                                                

+-----------+---+-----------+
|       name| id|eats_plants|
+-----------+---+-----------+
|      koala|  1|        yes|
|caterpillar|  2|        yes|
|       deer|  3|        yes|
|      human|  4|        yes|
+-----------+---+-----------+

None
Meat eaters (carnivores)
+-----+---+---------+
| name| id|eats_meat|
+-----+---+---------+
|shark|  5|      yes|
| lion|  6|      yes|
|tiger|  7|      yes|
|human|  4|      yes|
+-----+---+---------+

None


In [6]:
new_df = eats_plants

In [7]:
df_append = eats_plants.union(new_df)

In [8]:
eats_plants.count()

4

In [9]:
df_append.count()

8

In [10]:
# Inner join
inner_join = eats_plants.join(eats_meat, ["name","id"], "inner")
inner_join.toPandas()

Unnamed: 0,name,id,eats_plants,eats_meat
0,human,4,yes,yes


In [11]:
# Left join
left_join = eats_plants.join(eats_meat, ["name","id"], "left")
left_join.toPandas()

Unnamed: 0,name,id,eats_plants,eats_meat
0,caterpillar,2,yes,
1,deer,3,yes,
2,human,4,yes,yes
3,koala,1,yes,


In [12]:
# Right join
right_join = eats_plants.join(eats_meat, ["name","id"], "right")
right_join.toPandas()

Unnamed: 0,name,id,eats_plants,eats_meat
0,human,4,yes,yes
1,lion,6,,yes
2,shark,5,,yes
3,tiger,7,,yes


In [13]:
cond_join = eats_plants.join(eats_meat, ["name","id"], "left").filter(eats_meat.name.isNull())
cond_join.toPandas()

Unnamed: 0,name,id,eats_plants,eats_meat
0,caterpillar,2,yes,
1,deer,3,yes,
2,koala,1,yes,


In [14]:
full_join = eats_plants.join(eats_meat, ["name","id"], how="full")
full_join.toPandas()

Unnamed: 0,name,id,eats_plants,eats_meat
0,caterpillar,2,yes,
1,deer,3,yes,
2,human,4,yes,yes
3,koala,1,yes,
4,lion,6,,yes
5,shark,5,,yes
6,tiger,7,,yes


In [15]:
import os

path = "../datasets/uw-madison-courses/"

In [16]:
df_list = []
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        filename_list = filename.split(".")
        df_name = filename_list[0]
        df = spark.read.csv(path+filename, inferSchema=True, header=True)
        df.name = df_name
        df_list.append(df_name)
        exec(df_name + "= df")

df_list

                                                                                

['rooms',
 'subjects',
 'courses',
 'grade_distributions',
 'schedules',
 'subject_memberships',
 'teachings',
 'sections',
 'instructors',
 'course_offerings']

In [17]:
instructors.toPandas()

Unnamed: 0,id,name
0,761703,JOHN ARCHAMBAULT
1,3677061,STEPHANIE KANN
2,788586,KATHY PREM
3,1600463,KRISTIN KLARKOWSKI
4,693634,DAVID BOHNHOFF
...,...,...
18732,491743,MARY COLLINS
18733,952509,PAUL HUNTER
18734,3644163,MEGAN GUSSICK
18735,5841497,MICHAEL SEMANIK


In [18]:
course_offerings.limit(3).toPandas()

Unnamed: 0,uuid,course_uuid,term_code,name
0,344b3ebe-da7e-314c-83ed-9425269695fd,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1092,Cooperative Education Prog
1,f718e6cd-33f0-3c14-a9a6-834d9c3610a8,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1082,Cooperative Education Prog
2,ea3b717c-d66b-30dc-8b37-964d9688295f,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1172,Cooperative Education Prog


In [19]:
teachings.show(3)

+-------------+--------------------+
|instructor_id|        section_uuid|
+-------------+--------------------+
|       761703|45adf63c-48c9-365...|
|       761703|c6280e23-5e43-385...|
|       761703|9395dc21-15d1-3fa...|
+-------------+--------------------+
only showing top 3 rows



In [20]:
step1 = teachings.join(instructors, teachings.instructor_id == instructors.id, 'left').\
    select(['instructor_id','name','section_uuid'])
step1.show()

+-------------+----------------+--------------------+
|instructor_id|            name|        section_uuid|
+-------------+----------------+--------------------+
|       761703|JOHN ARCHAMBAULT|45adf63c-48c9-365...|
|       761703|JOHN ARCHAMBAULT|c6280e23-5e43-385...|
|       761703|JOHN ARCHAMBAULT|9395dc21-15d1-3fa...|
|      3677061|  STEPHANIE KANN|b99e440b-39db-350...|
|       761703|JOHN ARCHAMBAULT|ca1c841f-41d5-329...|
|      3677061|  STEPHANIE KANN|da41b0aa-2b81-378...|
|       761703|JOHN ARCHAMBAULT|51c4dc00-1fc7-3c7...|
|       761703|JOHN ARCHAMBAULT|53f95c0f-4ea9-374...|
|       761703|JOHN ARCHAMBAULT|574d9b35-9c76-338...|
|       761703|JOHN ARCHAMBAULT|8fc362a6-d94e-3ad...|
|       761703|JOHN ARCHAMBAULT|d50e7478-e12f-363...|
|       761703|JOHN ARCHAMBAULT|b21564d2-2bc4-3b8...|
|       788586|      KATHY PREM|8853d23e-64b1-3cf...|
|       788586|      KATHY PREM|94339199-eaf3-393...|
|       761703|JOHN ARCHAMBAULT|0d72958a-291b-33d...|
|       761703|JOHN ARCHAMBA

In [21]:
step2 = step1.join(sections, step1.section_uuid== sections.uuid, 'left').\
    select(['name','course_offering_uuid'])
step2.limit(4).toPandas()

                                                                                

Unnamed: 0,name,course_offering_uuid
0,JAMES STEELE,dfac15fb-e446-339e-9403-38b270895b6c
1,TERESA CLARK,878d4f26-4e7e-3cec-b2e3-28fd56d6489c
2,JAMES STEELE,3fc6bfe1-7929-3f2e-af13-5185f1cf7383
3,STEPHANIE KANN,ea3b717c-d66b-30dc-8b37-964d9688295f


In [22]:
step3 = step2.withColumnRenamed('name', 'instructor').\
    join(course_offerings, step2.course_offering_uuid == course_offerings.uuid, 'inner').\
        select(['instructor','name','course_offering_uuid'])
step3.limit(4).toPandas()

                                                                                

Unnamed: 0,instructor,name,course_offering_uuid
0,MICHAEL CONNORS,Special Topics,128f24cf-b7bf-3a8b-8f04-136c7b6fa556
1,RICK JENISON,Research,f513b3a7-9fdc-30f2-9f50-666870298ead
2,SUSANNE BARNETT,Advanced Independent Study,9dcee3f1-0909-318b-8a3d-72c931959656
3,THOMAS JAHNS,Master's Research or Thesis,f850ab24-740c-311a-a669-804a3fea7b0b


In [23]:
# Levenshtien Distance
# dog = dogs

from pyspark.sql.functions import levenshtein

df0 = spark.createDataFrame([("Aple","Apple","Microsoft", "IBM")],["Input", "Option1","Option2","Option3"])
df0.show()


+-----+-------+---------+-------+
|Input|Option1|  Option2|Option3|
+-----+-------+---------+-------+
| Aple|  Apple|Microsoft|    IBM|
+-----+-------+---------+-------+



In [24]:
df0.select(levenshtein('Input','Option1').alias("Apple")).show()
df0.select(levenshtein('Input','Option2').alias("Microsoft")).show()
df0.select(levenshtein('Input','Option3').alias("IBM")).show()

+-----+
|Apple|
+-----+
|    1|
+-----+

+---------+
|Microsoft|
+---------+
|        9|
+---------+

+---+
|IBM|
+---+
|  4|
+---+

