In [None]:
# Reference (YouTube): https://www.youtube.com/watch?v=SfzbR69LquU&list=PLBTZqjSKn0IeKBQDjLmzisazhqQy4iGkb&index=6

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Spark session used by the cells below (application name "PySparkTables").
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# ---- Friendship: one (PersonID, FriendID) pair per row ----
friendship_schema = (
    StructType()
    .add("PersonID", IntegerType(), True)
    .add("FriendID", IntegerType(), True)
)
friendship_data = [
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 3),
    (3, 5),
    (4, 2),
    (4, 3),
    (4, 5),
]
friendship_df = spark.createDataFrame(friendship_data, schema=friendship_schema)

# ---- Person: id, name, e-mail address and score ----
person_schema = (
    StructType()
    .add("PersonID", IntegerType(), True)
    .add("Name", StringType(), True)
    .add("Email", StringType(), True)
    .add("Score", IntegerType(), True)
)
person_data = [
    (1, "Alice", "alice2018@hotmail.com", 88),
    (2, "Bob", "bob2018@hotmail.com", 11),
    (3, "Davis", "davis2018@hotmail.com", 27),
    (4, "Tara", "tara2018@hotmail.com", 45),
    (5, "John", "john2018@hotmail.com", 63),
]
person_df = spark.createDataFrame(person_data, schema=person_schema)

# Register both DataFrames as temp views so spark.sql() can query them
# under the names "Friendship" and "Person".
friendship_df.createOrReplaceTempView("Friendship")
person_df.createOrReplaceTempView("Person")

print("Friendship and Person tables and views created successfully.")


Friendship and Person tables and views created successfully.


In [19]:
# Spark SQL: for every person, add up the scores of their friends and count
# the friends; keep only people whose friends' total score exceeds 100, then
# attach the person's own name.
#
# Identifiers are spelled exactly as declared in the schemas / temp views
# (FriendID, Person, Name) so the query does not depend on case-insensitive
# name resolution. `p.Name as name` keeps the displayed column header as
# `name`. The final ORDER BY makes the displayed row order deterministic;
# without it the order of the result rows is not guaranteed.
spark.sql("""
with score_details as (
    select f.PersonID,
           sum(p.Score) as total_friend_score,
           count(*) as no_of_friend
    from Friendship f
    join Person p on f.FriendID = p.PersonID
    group by f.PersonID
    having total_friend_score > 100
)

select s.*, p.Name as name
from Person p
inner join score_details s on p.PersonID = s.PersonID
order by s.PersonID
""").show()

+--------+------------------+------------+----+
|PersonID|total_friend_score|no_of_friend|name|
+--------+------------------+------------+----+
|       2|               115|           2| Bob|
|       4|               101|           3|Tara|
+--------+------------------+------------+----+



In [43]:
# DataFrame-API version of the join: match each friendship row to the Person
# row of the friend (p.PersonID == f.FriendID) and keep only the id of the
# person who owns the friendship plus that friend's score.
df = (
    friendship_df.alias("f")
    .join(
        person_df.alias("p"),
        on=col("p.PersonID") == col("f.FriendID"),
        how="inner",
    )
    .select("f.PersonID", "p.score")
)

In [44]:
# Preview the joined rows held in `df` (columns: PersonID and the matched friend's score).
df.show()

+--------+-----+
|PersonID|score|
+--------+-----+
|       2|   88|
|       1|   11|
|       4|   11|
|       1|   27|
|       2|   27|
|       4|   27|
|       3|   63|
|       4|   63|
+--------+-----+



In [45]:
# Collapse the joined rows to one row per PersonID:
#   Total_Score   - sum of the friends' scores
#   Total friends - number of joined rows (i.e. friends) for that person
# `sum` and `count` here are the Spark column functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtins.
(
    df.groupBy("PersonID")
    .agg(
        sum("score").alias("Total_Score"),
        count("*").alias("Total friends"),
    )
    .show()
)

+--------+-----------+-------------+
|PersonID|Total_Score|Total friends|
+--------+-----------+-------------+
|       1|         38|            2|
|       3|         63|            1|
|       4|        101|            3|
|       2|        115|            2|
+--------+-----------+-------------+

