In [2]:
import pandas as pd 
import numpy as np
import sqlalchemy
import pyodbc

import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.window as w
import pyspark.sql.types as t

In [3]:
# Start a Spark session for this ETL notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('TheVoice-ETL')
    .getOrCreate()
)

In [4]:
# Load the source parquet extracts into Spark DataFrames (one *_mrr frame per file).

# Folder holding the parquet extracts. Defined once so the notebook can be
# pointed at another location by editing a single line.
PARQUET_DIR = r'C:\Users\alex\Desktop\TheVoice - PySpark\parquet_files'


def read_mrr_parquet(file_stem):
    """Read one parquet extract from PARQUET_DIR.

    Parameters
    ----------
    file_stem : str
        File name without the ``.parquet`` extension.

    Returns
    -------
    pyspark.sql.DataFrame
        The result of ``spark.read.parquet`` on that file.
    """
    # Plain concatenation with a backslash keeps the path string identical to
    # the literal paths this cell used before.
    return spark.read.parquet(PARQUET_DIR + '\\' + file_stem + '.parquet')


call_type_mrr = read_mrr_parquet('call_type')
countries_mrr = read_mrr_parquet('countries')
customer_mrr = read_mrr_parquet('customer')
customer_invoice_mrr = read_mrr_parquet('customer_invoice')
customer_lines_mrr = read_mrr_parquet('customer_lines')
# NOTE(review): the variable is named "opfileopp" but the file is "pfileopp" -
# confirm which spelling is intended.
opfileopp_mrr = read_mrr_parquet('pfileopp')
package_catalog_mrr = read_mrr_parquet('package_catalog')
usage_main_mrr = read_mrr_parquet('usage_main')
xxCountryType_mrr = read_mrr_parquet('xxCountryType')

In [24]:
# Print the column names and types of the call-type source frame before building DimCallTypes
call_type_mrr.printSchema()

root
 |-- call_type_code: string (nullable = true)
 |-- call_type_desc: string (nullable = true)
 |-- priceperminuter: double (nullable = true)
 |-- call_type: string (nullable = true)



In [48]:
# DimCallTypes staging

# Price-per-minute threshold that separates the two price categories
price_per_minute = 0.5

# Row numbers (and therefore the keys) follow the call_type_code ordering
windowSpec_callType = w.Window.orderBy('call_type_code')

# Final frame: every output column is built in a single projection.
# KeyCallType is the row number offset by 1000, so keys start at 1001.
# NOTE(review): prices ABOVE the threshold are labelled 'Discounted Price' -
# confirm that this direction is intended.
DimCallTypes_stg = call_type_mrr.select(
    (f.row_number().over(windowSpec_callType) + 1000).alias('KeyCallType'),
    f.col('call_type_code').alias('DescCallTypeCode'),
    f.col('call_type_desc').alias('DescCallType'),
    f.concat_ws('-', f.col('call_type_code'), f.col('call_type_desc')).alias('DescFullCallType'),
    f.when(f.col('priceperminuter') > price_per_minute, 'Discounted Price')
     .otherwise('Normal Price')
     .alias('DescCallTypePriceCategory'),
    f.col('call_type').alias('DescCallTypeCategory'),
)


In [None]:
# Exploratory preview of the countries / country-type join (shows the first 5 rows).

# The projected right-hand frame gets its own name so the join condition can
# point at the aliased key column. Referencing xxCountryType_mrr['COUNTRY_CODE']
# here would name a column that this projection no longer outputs.
countryType_preview = xxCountryType_mrr.select(
    f.col('COUNTRY_CODE').alias('COUNTRY_CODE_2'),
    'COUNTRY_PRE',
)

(
    countries_mrr.select('COUNTRY_CODE', 'REGION', 'AREA')
    .join(
        countryType_preview,
        countries_mrr['COUNTRY_CODE'] == countryType_preview['COUNTRY_CODE_2'],
        how='inner')
    .withColumnRenamed('COUNTRY_PRE', 'KeyCountry')
    .withColumnRenamed('COUNTRY_CODE', 'DescCountry')
    .withColumnRenamed('REGION', 'DescRegion')
    .withColumnRenamed('AREA', 'DescArea')
    .select('KeyCountry', 'DescCountry', 'DescRegion', 'DescArea')
    .show(5)
)

In [74]:
# DimCountries staging

# Keep only the join key and COUNTRY_PRE, with the key renamed to COUNTRY_CODE2
# so it does not share a name with countries_mrr's COUNTRY_CODE.
# The rename runs BEFORE the select on purpose: withColumnRenamed does nothing
# when the column is already gone, so re-running this cell (which rebinds
# xxCountryType_mrr) no longer fails on a missing COUNTRY_CODE column.
xxCountryType_mrr = (
    xxCountryType_mrr
    .withColumnRenamed('COUNTRY_CODE', 'COUNTRY_CODE2')
    .select('COUNTRY_CODE2', 'COUNTRY_PRE')
)

# Final frame: inner join on the country code, then rename to the dimension's column names
DimCountries_stg = (
    countries_mrr.select('COUNTRY_CODE', 'REGION', 'AREA')
    .join(
        xxCountryType_mrr,
        countries_mrr['COUNTRY_CODE'] == xxCountryType_mrr['COUNTRY_CODE2'],
        how='inner')
    .withColumnRenamed('COUNTRY_PRE', 'KeyCountry')
    .withColumnRenamed('COUNTRY_CODE', 'DescCountry')
    .withColumnRenamed('REGION', 'DescRegion')
    .withColumnRenamed('AREA', 'DescArea')
    .select('KeyCountry', 'DescCountry', 'DescRegion', 'DescArea')
)

+----------+--------------------+----------+-----------------+
|KeyCountry|         DescCountry|DescRegion|         DescArea|
+----------+--------------------+----------+-----------------+
|        93|         Afghanistan|      Asia|       South Asia|
|       355|             Albania|    Europe|South East Europe|
|       213|             Algeria|    Africa|  Northern Africa|
|       376|             Andorra|    Europe|South West Europe|
|       244|              Angola|    Africa|  Southern Africa|
|        54|           Argentina|  Americas|    South America|
|       374|             Armenia|      Asia|  South West Asia|
|       297|               Aruba|  Americas|      West Indies|
|        61|           Australia|   Oceania|          Pacific|
|        43|             Austria|    Europe|   Central Europe|
|       994|          Azerbaijan|      Asia|  South West Asia|
|       973|             Bahrain|      Asia|  South West Asia|
|       880|          Bangladesh|      Asia|       Sout