In [1]:
from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

In [2]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()
# accounts
df_accounts = spark.read \
    .option("header", "true")  \
    .option("inferSchema", "true")  \
    .csv("accounts.csv", sep=";")

df_accounts.count()


500000

In [3]:
from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

# country_abbreviation
df_country_abbreviation = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("country_abbreviation.csv", sep=";") 

df_country_abbreviation.count()

121

In [4]:
from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

# transactions
df_transactions = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("transactions.csv", sep=";")

df_transactions.count()



5000000

In [None]:
spark.stop()

• Calculate how many accounts of each type there are using Spark SQL. The return type is a dataframe [account_type: string, account_type_count: int]

In [5]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("AccountTypeCount").getOrCreate()

df_transactions = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("transactions.csv", sep=";")

df_accounts = spark.read \
    .option("header", "true")  \
    .option("inferSchema", "true")  \
    .csv("accounts.csv", sep=";")

df_accounts.createOrReplaceTempView("accounts")

df_transactions.createOrReplaceTempView("transactions")

df_accounts_result = spark.sql("""
    SELECT t.account_type AS account_type, COUNT(t.account_type) AS account_type_count 
    FROM transactions t
    LEFT JOIN accounts a
    ON a.id = t.id
    GROUP BY t.account_type
    """)

df_accounts_result.show()

+------------+------------------+
|account_type|account_type_count|
+------------+------------------+
|    Personal|           1667072|
|Professional|           1667358|
|    Business|           1665570|
+------------+------------------+



 •	Calculate only the balance and the latest date for each account from transactions.csv. To calculate the balance, summarize all the transactions for  
    each account. The return type is a dataframe [account_id: string, balance: string, latest_date: date].

In [6]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

df_transactions = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("transactions.csv", sep=";")

df_transactions.createOrReplaceTempView("transactions")


df_accounts_last_date_result = spark.sql("""
    SELECT id AS account_id,
           CAST(SUM(amount) AS STRING) AS balance,
           MAX(transaction_date) as latest_date
    FROM transactions 
    GROUP BY id
    """)

df_accounts_last_date_result.show()

+----------+------------------+-----------+
|account_id|           balance|latest_date|
+----------+------------------+-----------+
|    482333|          27174.07| 2020-07-17|
|    222048|          48004.81| 2020-07-20|
|    328078|          36948.25| 2020-02-01|
|    192401|          36736.98| 2020-01-30|
|    273916| 47475.37999999999| 2021-05-30|
|    485103|          62198.93| 2021-05-22|
|    300282|55103.619999999995| 2021-05-01|
|     20683|          56448.72| 2021-10-27|
|     15846| 58671.90999999999| 2020-12-23|
|    446783| 98085.51000000001| 2021-12-11|
|     92182|           42335.3| 2020-08-08|
|    477485|          22114.03| 2020-05-23|
|    171142|40428.899999999994| 2021-04-07|
|    317762|          40025.55| 2021-12-02|
|     65478|           57941.9| 2021-10-06|
|    306768|          26566.93| 2019-12-19|
|    380411|          43652.94| 2020-06-02|
|    304681|          37827.69| 2021-03-26|
|    475638|           44509.1| 2021-11-23|
|     97413|          39611.24| 

2.	Write a function using Spark Python or Spark Scala API to calculate total earnings (sum of transactions above 0) for each user from Switzerland by year as a pivot table. The result dataframe should contain user full names as one field split by whitespace, years, and earning values. 

In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import sum, col, split, concat_ws
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

df_transactions = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("transactions.csv", sep=";")

# country_abbreviation
df_country_abbreviation = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("country_abbreviation.csv", sep=";") 


df_accounts = spark.read \
    .option("header", "true")  \
    .option("inferSchema", "true")  \
    .csv("accounts.csv", sep=";")


def get_total_earnings():
    df_transactions_1 = df_transactions \
        .join(df_country_abbreviation,
             df_transactions.country == df_country_abbreviation.abbreviation,
             "inner") \
        .join(df_accounts.drop("country"), 
             df_transactions.id == df_accounts.id,
             "inner")

    df_filtered_1 = df_transactions_1.filter((col("country") == "CH") & (col("amount") > 0))
    df_filtered_2 = df_filtered_1.withColumn("year", split(col("transaction_date"), "-")[0].cast("int"))
    df_filtered_3 = df_filtered_2.withColumn("full_name", concat_ws(" ", df_filtered_2.first_name, df_filtered_2.last_name))

    df_partitioned = df_filtered_3.repartition("country")
    
    df_result = df_partitioned.groupBy("full_name").pivot("year").sum("amount")
    df_result.show()


get_total_earnings()

+-----------------+------------------+------------------+-------+--------+-------+-------+------------------+--------+-------+-------+-------+
|        full_name|              2011|              2012|   2013|    2014|   2015|   2016|              2017|    2018|   2019|   2020|   2021|
+-----------------+------------------+------------------+-------+--------+-------+-------+------------------+--------+-------+-------+-------+
|      Luke Carter|              null|1585.4099999999999|  93.69|    null|   null|   null|           7029.37| 8340.16|   null|   null|   null|
|       Myra Owens|           7290.28|              null|   null| 7508.42|   null|   null|           19543.1|    null|   null|   null|   null|
|   Kelsey Spencer|              null|           2523.35|   null|    null|   null|   null|              null|    null|   null|   null|   null|
|    Darcy Edwards|              null|              null|   null|    null|   null|   null|           7892.65| 8538.91|2252.85|   null|   null|