In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
data1 = [
    (1, "Attack on Titan", "Action"),
    (2, "Death Note", "Thriller"),
    (3, "One Piece", "Adventure"),
    (4, "Naruto", "Action"),
]

columns1 = ["Anime_ID", "Anime_Name", "Genre"]
df1 = spark.createDataFrame(data1, columns1)
display(df1)

Anime_ID,Anime_Name,Genre
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure
4,Naruto,Action


In [0]:
data2 = [
    (1, 9.0, "Highly Rated"),
    (2, 9.5, "Highly Rated"),
    (3, 8.8, "Popular"),
    (5, 7.5, "Average"),
]

columns2 = ["Anime_ID", "Rating", "Review"]
df2 = spark.createDataFrame(data2, columns2)
display(df2)

Anime_ID,Rating,Review
1,9.0,Highly Rated
2,9.5,Highly Rated
3,8.8,Popular
5,7.5,Average


>Inner Join

In [0]:
inner_join = df1.join(df2, on="Anime_ID", how="inner")
display(inner_join)

Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,9.0,Highly Rated
2,Death Note,Thriller,9.5,Highly Rated
3,One Piece,Adventure,8.8,Popular


>Left Join


In [0]:
left_join = df1.join(df2, on="Anime_ID", how="left")
display(left_join)


Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,9.0,Highly Rated
2,Death Note,Thriller,9.5,Highly Rated
3,One Piece,Adventure,8.8,Popular
4,Naruto,Action,,


>Right Join

In [0]:
right_join = df1.join(df2, on="Anime_ID", how="right")
display(right_join)

Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,9.0,Highly Rated
2,Death Note,Thriller,9.5,Highly Rated
3,One Piece,Adventure,8.8,Popular
5,,,7.5,Average


>Full Outer Join


In [0]:
full_outer_join = df1.join(df2, on="Anime_ID", how="outer")
display(full_outer_join)


Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,9.0,Highly Rated
2,Death Note,Thriller,9.5,Highly Rated
3,One Piece,Adventure,8.8,Popular
4,Naruto,Action,,
5,,,7.5,Average


>Left Anti Join (Only records from left that have no match)

In [0]:
left_anti_join = df1.join(df2, on="Anime_ID", how="left_anti")
display(left_anti_join)

Anime_ID,Anime_Name,Genre
4,Naruto,Action


>Left Semi Join (Only records from left that have a match)

In [0]:
left_semi_join = df1.join(df2, on="Anime_ID", how="left_semi")
display(left_semi_join)


Anime_ID,Anime_Name,Genre
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure


>merged_df = df1.unionByName(df2, allowMissingColumns=True)
merged_df.show()


In [0]:
merged_df = df1.unionByName(df2, allowMissingColumns=True)
display(merged_df)

Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,,
2,Death Note,Thriller,,
3,One Piece,Adventure,,
4,Naruto,Action,,
1,,,9.0,Highly Rated
2,,,9.5,Highly Rated
3,,,8.8,Popular
5,,,7.5,Average


>Cross Join (Cartesian Product)

In [0]:
cross_join = df1.crossJoin(df2)
display(cross_join)


Anime_ID,Anime_Name,Genre,Anime_ID.1,Rating,Review
1,Attack on Titan,Action,1,9.0,Highly Rated
1,Attack on Titan,Action,2,9.5,Highly Rated
1,Attack on Titan,Action,3,8.8,Popular
1,Attack on Titan,Action,5,7.5,Average
2,Death Note,Thriller,1,9.0,Highly Rated
2,Death Note,Thriller,2,9.5,Highly Rated
2,Death Note,Thriller,3,8.8,Popular
2,Death Note,Thriller,5,7.5,Average
3,One Piece,Adventure,1,9.0,Highly Rated
3,One Piece,Adventure,2,9.5,Highly Rated


>Self Join

In [0]:
self_join = df1.alias("A").join(df1.alias("B"), col("A.Genre") == col("B.Genre"), "inner")
display(self_join)

Anime_ID,Anime_Name,Genre,Anime_ID.1,Anime_Name.1,Genre.1
1,Attack on Titan,Action,1,Attack on Titan,Action
1,Attack on Titan,Action,4,Naruto,Action
4,Naruto,Action,1,Attack on Titan,Action
4,Naruto,Action,4,Naruto,Action
3,One Piece,Adventure,3,One Piece,Adventure
2,Death Note,Thriller,2,Death Note,Thriller


>Union (Combining DataFrames)

In [0]:
union_df = df1.union(df1)  
display(union_df)


Anime_ID,Anime_Name,Genre
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure
4,Naruto,Action
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure
4,Naruto,Action


In [0]:
union_df = df1.unionByName(df2, allowMissingColumns=True)
display(union_df)

Anime_ID,Anime_Name,Genre,Rating,Review
1,Attack on Titan,Action,,
2,Death Note,Thriller,,
3,One Piece,Adventure,,
4,Naruto,Action,,
1,,,9.0,Highly Rated
2,,,9.5,Highly Rated
3,,,8.8,Popular
5,,,7.5,Average


In [0]:
union_df = df1.union(df2).distinct()
display(union_df)

Anime_ID,Anime_Name,Genre
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure
4,Naruto,Action
1,9.0,Highly Rated
2,9.5,Highly Rated
3,8.8,Popular
5,7.5,Average


>Set Difference


In [0]:
difference_df1 = df1.subtract(df2)
display(difference_df1)

Anime_ID,Anime_Name,Genre
1,Attack on Titan,Action
2,Death Note,Thriller
3,One Piece,Adventure
4,Naruto,Action
