In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Initialize the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("PySparkTables")
    .getOrCreate()
)

# Helper: turn a "YYYY-MM-DD" string into a date object
def parse_date(date_str):
    """Parse a ``YYYY-MM-DD`` string and return the matching ``datetime.date``.

    Args:
        date_str: Date text in year-month-day order separated by hyphens.

    Returns:
        The ``date`` portion of the parsed timestamp.

    Raises:
        ValueError: If ``date_str`` does not match the ``%Y-%m-%d`` format.
    """
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()

# Users table schema: three columns, all declared nullable
users_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("name", StringType()),
        ("join_date", DateType()),
    )
])

# Users rows: join dates are listed as strings and converted with parse_date
# so that they match the DateType column of the schema.
users_data = [
    (uid, user_name, parse_date(joined_on))
    for uid, user_name, joined_on in (
        (1, 'Jon', '2020-02-14'),
        (2, 'Jane', '2020-02-14'),
        (3, 'Jill', '2020-02-15'),
        (4, 'Josh', '2020-02-15'),
        (5, 'Jean', '2020-02-16'),
        (6, 'Justin', '2020-02-17'),
        (7, 'Jeremy', '2020-02-18'),
    )
]

# Build the DataFrame and register it for Spark SQL under the name "users"
users_df = spark.createDataFrame(users_data, schema=users_schema)
users_df.createOrReplaceTempView("users")

# Events table schema: three columns, all declared nullable
events_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("type", StringType()),
        ("access_date", DateType()),
    )
])

# Events rows: access dates are listed as strings and converted with
# parse_date so that they match the DateType column of the schema.
events_data = [
    (uid, event_type, parse_date(accessed_on))
    for uid, event_type, accessed_on in (
        (1, 'Pay', '2020-03-01'),
        (2, 'Music', '2020-03-02'),
        (2, 'P', '2020-03-12'),
        (3, 'Music', '2020-03-15'),
        (4, 'Music', '2020-03-15'),
        (1, 'P', '2020-03-16'),
        (3, 'P', '2020-03-22'),
    )
]

# Build the DataFrame and register it for Spark SQL under the name "events"
events_df = spark.createDataFrame(events_data, schema=events_schema)
events_df.createOrReplaceTempView("events")

# Sanity check: read each registered temp view back through Spark SQL
for table_name in ("users", "events"):
    spark.sql(f"SELECT * FROM {table_name}").show()


+-------+------+----------+
|user_id|  name| join_date|
+-------+------+----------+
|      1|   Jon|2020-02-14|
|      2|  Jane|2020-02-14|
|      3|  Jill|2020-02-15|
|      4|  Josh|2020-02-15|
|      5|  Jean|2020-02-16|
|      6|Justin|2020-02-17|
|      7|Jeremy|2020-02-18|
+-------+------+----------+

+-------+-----+-----------+
|user_id| type|access_date|
+-------+-----+-----------+
|      1|  Pay| 2020-03-01|
|      2|Music| 2020-03-02|
|      2|    P| 2020-03-12|
|      3|Music| 2020-03-15|
|      4|Music| 2020-03-15|
|      1|    P| 2020-03-16|
|      3|    P| 2020-03-22|
+-------+-----+-----------+



In [7]:
# For every user that has at least one 'Music' event, show the user row next
# to that user's 'P' event(s). The type = 'P' filter sits in the LEFT JOIN
# condition rather than in WHERE, so users without a 'P' event are kept with
# NULLs on the event side.
# Note: `select *` returns two columns named user_id (one from each table).
# The ORDER BY makes the displayed row order deterministic across runs.
spark.sql("""
    select *
    from users u
    left join events e on u.user_id=e.user_id and e.type = 'P'
    where u.user_id in (select user_id from events where type = 'Music')
    order by u.user_id
""").show()



+-------+----+----------+-------+----+-----------+
|user_id|name| join_date|user_id|type|access_date|
+-------+----+----------+-------+----+-----------+
|      2|Jane|2020-02-14|      2|   P| 2020-03-12|
|      3|Jill|2020-02-15|      3|   P| 2020-03-22|
|      4|Josh|2020-02-15|   null|null|       null|
+-------+----+----------+-------+----+-----------+

