# imports

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os
import sys
import numpy as np
import pandas as pd

In [None]:
# Tell PySpark to launch both its workers and its driver with the interpreter
# that is running this session; otherwise Spark may not find a Python executable.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [None]:
# Create the Spark session used by every cell below, or reuse one that is
# already running, with adaptive query execution switched on.
spark = (
    SparkSession.builder
    .appName("InterviewChallenges")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

In [None]:
# Deliberately messy sample data: missing values, an unparseable date, and
# columns that mix strings, numbers and booleans.
data = {
    'user_id': [1, 2, 3, 4, 5, 6],
    'country': ['US', 'US', 'UK', None, 'FR', 'DE'],
    'signup_date': ['2022-01-05', '2022-01-06', np.nan, '2022-01-08', '2022-01-09', 'not_a_date'],
    'purchase': ['100', 200, 'N/A', 300, 400, 500],
    'is_active': [True, False, 'yes', 'no', 1, 0]
}


def _to_text(value):
    """Return *value* as text, mapping missing values (None/NaN) to None.

    Booleans become 'true'/'false', the spelling Spark itself produces when
    it casts a boolean to a string, so downstream string matching is uniform.
    """
    if pd.isna(value):
        return None
    if isinstance(value, (bool, np.bool_)):
        return str(value).lower()
    return str(value)


pandas_df = pd.DataFrame(data)

# Spark's schema inference cannot merge different Python types found in one
# column (str with int, bool or float NaN), so the mixed columns are
# normalised to nullable strings here and cleaned up in Spark afterwards.
for mixed_col in ('signup_date', 'purchase', 'is_active'):
    pandas_df[mixed_col] = pandas_df[mixed_col].map(_to_text)

sales_df = spark.createDataFrame(pandas_df)

In [None]:
# Preview the rows of the Spark DataFrame built from the pandas sample data.
sales_df.show()

In [None]:
# Print the column names and data types Spark assigned to sales_df.
sales_df.printSchema()

In [None]:
# Fallback date for rows whose signup_date is missing or is not a valid
# yyyy-MM-dd value.
default_signup_date = F.lit('2025-01-01').cast('date')

# try_to_timestamp yields NULL instead of raising on unparseable input; the
# result is cast to a date so both coalesce branches share one type rather
# than mixing a timestamp with a string literal.
parsed_signup_date = F.try_to_timestamp(
    F.col('signup_date'), F.lit('yyyy-MM-dd')).cast('date')

sales_df = sales_df.withColumn(
    'signup_date_3',
    F.coalesce(parsed_signup_date, default_signup_date))

sales_df.show()

In [None]:
# Normalise the free-form is_active flag to a real boolean column.
# The raw value is compared as a trimmed, lower-cased string so that spellings
# such as 'True', 'YES' or ' yes ' are recognised as well as '1'/'yes'/'true'.
# Anything else, including NULL, is treated as inactive.
is_active_text = F.lower(F.trim(F.col('is_active').cast('string')))

sales_df = sales_df.withColumn(
    'is_active_2',
    F.when(is_active_text.isin('1', 'yes', 'true'), F.lit(True))
    .otherwise(F.lit(False))
)

sales_df.show()

In [None]:
# Build a numeric purchase column: the 'N/A' placeholder counts as 0 and every
# other value is cast to double. Keeping the column numeric (instead of a
# string that merely looks numeric) means later sums and averages do not
# depend on implicit string-to-number coercion.
sales_df = sales_df.withColumn(
    'purchase_2',
    F.when(F.col('purchase') == 'N/A', F.lit(0.0))
    .otherwise(F.col('purchase').cast('double')))

sales_df.show()

In [None]:
# Running total of purchase_2, accumulated in user_id order: each row sums
# everything from the first row up to and including itself. (An unordered
# frame spanning unboundedPreceding..unboundedFollowing would instead repeat
# the grand total on every row.)
# NOTE: a window without partitionBy moves all rows into a single partition;
# acceptable for this small sample, but it will not scale.
rn_purchase = (
    Window.orderBy('user_id')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    sales_df
    .withColumn('running_purchase', F.sum('purchase_2').over(rn_purchase))
    .show()
)

In [None]:
# Keep only the rows flagged as active by the boolean is_active_2 column.
# pandas equivalent, kept for reference:
#   df['is_active_2'] = df['is_active'].apply(lambda x: True if x in (True, 1, 'yes') else False)
#   active_df = df[df['is_active_2'] == True]
active_df = sales_df.filter(F.col('is_active_2'))

active_df.show()

In [None]:
# Average purchase per country among the active users.
# pandas equivalent, kept for reference:
#   avg_purchase = active_df.groupby('country')['purchase_2'].mean()
#   print(avg_purchase)
purchases_by_country = active_df.groupBy('country')
avg_purchase = purchases_by_country.agg(F.avg('purchase_2').alias('avg_purchase'))

avg_purchase.show()