In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()
# Data for users_friends
users_friends_data = [
    (1, 2),
    (1, 3),
    (2, 3),
    (3, 1)
]

users_friends_schema = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("friend_id", IntegerType(), False)
])

users_friends_df = spark.createDataFrame(users_friends_data, users_friends_schema)
users_friends_df.createOrReplaceTempView("users_friends")

# Data for users_pages
users_pages_data = [
    (1, 10),
    (2, 20),
    (2, 30),
    (3, 10),
    (3, 40)
]

users_pages_schema = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("page_id", IntegerType(), False)
])

users_pages_df = spark.createDataFrame(users_pages_data, users_pages_schema)
users_pages_df.createOrReplaceTempView("users_pages")



In [11]:
spark.sql(
    """
        select DISTINCT f.user_id, p.page_id from users_pages p join users_friends as f on p.user_id = f.friend_id
        where NOT EXISTS (
            select 1 from users_pages u where f.user_id = u.user_id AND p.page_id = u.page_id
        )
    """).show()

+-------+-------+
|user_id|page_id|
+-------+-------+
|      2|     40|
|      1|     20|
|      1|     30|
|      2|     10|
|      1|     40|
+-------+-------+



In [None]:
from pyspark.sql import functions as F


friend_pages_df = users_friends_df.alias("f") \
    .join(users_pages_df.alias("p"), F.col("f.friend_id") == F.col("p.user_id")) \
    .select(F.col("f.user_id").alias("user_id"), F.col("p.page_id").alias("page_id"))

user_pages_df = users_pages_df.select("user_id", "page_id")

recommend_df = friend_pages_df.join(
    user_pages_df,
    on=["user_id", "page_id"],
    how="left_anti"  
).distinct()

recommend_df.show()

In [16]:
spark.sql(
"""
SELECT DISTINCT uf.user_id, fp.page_id, up.user_id
FROM users_friends uf
JOIN users_pages fp
  ON uf.friend_id = fp.user_id  
LEFT JOIN users_pages up
    on up.user_id = uf.user_id AND up.page_id = fp.page_id
WHERE up.user_id is NULL
""").show()

+-------+-------+-------+
|user_id|page_id|user_id|
+-------+-------+-------+
|      2|     40|   null|
|      1|     20|   null|
|      1|     30|   null|
|      2|     10|   null|
|      1|     40|   null|
+-------+-------+-------+



In [18]:
spark.sql("""
    SELECT DISTINCT uf.user_id, fp.page_id
    FROM users_friends uf
    JOIN users_pages fp
    ON uf.friend_id = fp.user_id
    WHERE (uf.user_id, fp.page_id) NOT IN (
        select user_id, page_id from users_pages
    )
    """).show()

+-------+-------+
|user_id|page_id|
+-------+-------+
|      2|     40|
|      1|     20|
|      1|     30|
|      2|     10|
|      1|     40|
+-------+-------+

