In [12]:
import datetime
import fnmatch
import os
import re
from datetime import datetime as dt
from os import path

In [13]:
def _raw_parquet_path(prefix, run_date):
  """Returns the raw-zone parquet path for `prefix`, stamped with run_date's day, month and year."""
  return '/content/drive/MyDrive/data_engineer_assessment/raw_zone/raw_parquet_files/{}_{}_{}_{}.parquet'.format(
      prefix, run_date.day, run_date.month, run_date.year)


def _read_source_csv(csv_path):
  """Reads a comma-separated file with a header row, inferring column types and mapping 'NA' to null."""
  return spark.read.csv(csv_path,
                        sep=',',
                        header=True,
                        inferSchema=True,
                        nullValue='NA')


def extraction_and_loading(customer_file, transaction_file, run_date=None):
  """Loads one customer CSV and one transaction CSV and writes each to the raw zone as parquet.

  Both files are read through the module-level `spark` object, de-duplicated,
  sorted (customers by `customerid`, transactions by `transaction_date`),
  previewed on stdout, and saved under raw_zone/raw_parquet_files as
  `customer_data_<day>_<month>_<year>.parquet` and
  `transaction_data_<day>_<month>_<year>.parquet`.

  Args:
    customer_file: Path of the customer CSV to read.
    transaction_file: Path of the transaction CSV to read.
    run_date: Object exposing `.day`, `.month` and `.year`, used to stamp the
      output file names. When omitted, the module-level `date` variable is
      used, which is how existing two-argument callers behave.

  Raises:
    NameError: If `run_date` is omitted and no module-level `date` exists.
    Whatever `spark` raises on read or write; the writer's save mode is left
    at its default, so an already existing target is not overwritten.
  """
  if run_date is None:
    # Backward-compatible fallback: earlier callers set a global `date` first.
    run_date = date

  print("Reading customer file...")
  customer_data = _read_source_csv(customer_file)

  print("Cleaning customer file data...")
  customer_data = customer_data.dropDuplicates().sort('customerid')

  print("\nFirst 5 rows of Customer data:")
  customer_data.show(5)

  print("Printing table schema:")
  customer_data.printSchema()

  customer_data.write.format('parquet').save(_raw_parquet_path('customer_data', run_date))
  print("\nCustomer file data successfully written to parquet.")
  print("---------------------------------------------")

  print("\nReading transaction file data...")
  transaction_data = _read_source_csv(transaction_file)

  print("Printing table schema:")
  transaction_data.printSchema()

  print("\nCleaning transaction file data...")
  transaction_data = transaction_data.dropDuplicates().sort('transaction_date')
  # `_c0` is presumably the unnamed first column of the CSV; it becomes the transaction id.
  transaction_data = transaction_data.withColumnRenamed("_c0", "transaction_id")

  print("\nFirst 5 rows of transaction file data:")
  transaction_data.show(5)

  transaction_data.write.format('parquet').save(_raw_parquet_path('transaction_data', run_date))
  print("\nTransaction file data successfully written to parquet.")

In [17]:
# Walks the source-data folder and, for every entry whose name is a DD-MM-YYYY date,
# loads that day's customer and transaction CSVs from the landing zone into the raw zone.
directory = "/content/drive/MyDrive/data_engineer_assessment/source_data"
landing_directory = "/content/drive/MyDrive/data_engineer_assessment/landing_zone"
raw_parquet_directory = "/content/drive/MyDrive/data_engineer_assessment/raw_zone/raw_parquet_files"
refined_directory = "/content/drive/MyDrive/data_engineer_assessment/refined_zone/"

for filename in os.listdir(directory):

  try:
    # `date` is deliberately module-level: extraction_and_loading reads it to name its output files.
    date = dt.strptime(filename, '%d-%m-%Y').date()
  except ValueError:
    continue  # Entry name is not a date, so it is not a daily drop.

  # strftime zero-pads day and month, so the paths are right for every date
  # (the earlier hard-coded "-0{}-" only worked for months 1-9 and days 10-31).
  day_stamp = date.strftime('%d-%m-%Y')
  customer_file = "{}/{}/customers_{}.csv".format(landing_directory, day_stamp, day_stamp)
  transaction_file = "{}/{}/transactions_{} 00_00_00.csv".format(landing_directory, day_stamp, date.strftime('%Y-%m-%d'))

  # These are the raw-zone targets extraction_and_loading writes, so their presence
  # means this day has already been loaded.
  customer_file_name = "{}/customer_data_{}_{}_{}.parquet".format(raw_parquet_directory, date.day, date.month, date.year)
  transaction_file_name = "{}/transaction_data_{}_{}_{}.parquet".format(raw_parquet_directory, date.day, date.month, date.year)

  if path.exists(customer_file_name) or path.exists(transaction_file_name):
    continue

  try:
    extraction_and_loading(customer_file, transaction_file)
  except Exception as error:
    # Keep processing the remaining days, but report the failure instead of hiding it.
    print("Failed to load data for {}: {}".format(filename, error))

Reading customer file: /content/drive/MyDrive/data_engineer_assessment/landing_zone/15-01-2022/customers_15-01-2022.csv...
Cleaning customer file data...

First 5 rows of Customer data:
+----------+-------------------+-----------------+------------+-----------------+---------------+
|customerid|          birthdate|bank_account_type|   bank_name|employment_status|education_level|
+----------+-------------------+-----------------+------------+-----------------+---------------+
|      1000|1973-10-10 00:00:00|          Savings|Capitec Bank|             null|           null|
|      1001|1986-01-21 00:00:00|          Savings|Capitec Bank|        Permanent|           null|
|      1002|1987-04-01 00:00:00|          Savings|   Tyme Bank|             null|           null|
|      1003|1991-07-19 00:00:00|          Savings|Capitec Bank|        Permanent|           null|
|      1004|1982-11-22 00:00:00|          Savings|Capitec Bank|        Permanent|           null|
+----------+------------------

In [24]:
# NOTE: this rebinds the name `datetime` from the module to the class for the rest of the
# session; code that expects the module (`datetime.datetime...`) fails if run after this cell.
from datetime import datetime
date = datetime.today() # Retrieves the date today to create history of when pipeline started.

directory = "/content/drive/MyDrive/data_engineer_assessment/raw_zone/raw_parquet_files"

# Every entry in the raw-zone folder (the name is historical: it is not limited to
# transaction files). Sorted because os.listdir returns names in arbitrary order.
transaction_list = sorted(os.listdir(directory))

# Computed once, outside any loop, so both lists exist (possibly empty) even when the folder is empty.
transactions_filtered = fnmatch.filter(transaction_list, 'transaction*') # Finds all files with transaction data
customer_filtered = fnmatch.filter(transaction_list, 'customer*') # Finds all files with customer data

In [28]:
# Combines every raw-zone parquet file of each kind into a single DataFrame and writes it
# to the refined zone, stamped with the pipeline start date held in `date`.
raw_parquet_directory = "/content/drive/MyDrive/data_engineer_assessment/raw_zone/raw_parquet_files"

transaction_paths = ["{}/{}".format(raw_parquet_directory, name) for name in transactions_filtered]
customer_paths = ["{}/{}".format(raw_parquet_directory, name) for name in customer_filtered]

if not transaction_paths or not customer_paths:
  raise FileNotFoundError("No raw transaction and/or customer parquet files found in {}".format(raw_parquet_directory))

# Reading all paths in one call yields the union of every file's rows. The previous
# write.mode('append') loops never read the other files: they wrote the first file's rows
# to relative paths named after them, so the refined output only held the first file.
transaction_data = spark.read.parquet(*transaction_paths)
customer_data = spark.read.parquet(*customer_paths)

# NOTE(review): partitioning by what look like per-row identifiers creates roughly one
# directory per row; confirm this layout is required before the data grows.

# Creates one main parquet dataset holding the rows of all raw transaction files.
transaction_data.write.format('parquet').mode('overwrite').partitionBy("transaction_id").save('/content/drive/MyDrive/data_engineer_assessment/refined_zone/Refined_Parquet_Files/transaction_data_{}_{}_{}.parquet'.format(date.day, date.month, date.year))

# Creates one main parquet dataset holding the rows of all raw customer files.
customer_data.write.format('parquet').mode('overwrite').partitionBy("customerid").save('/content/drive/MyDrive/data_engineer_assessment/refined_zone/Refined_Parquet_Files/customer_data_{}_{}_{}.parquet'.format(date.day, date.month, date.year))