In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

# Switch Spark's datetime parsing/formatting to the LEGACY policy, i.e. the
# behaviour Spark had before version 3.0.
# NOTE(review): presumably needed for the date patterns handled by
# auto_date_parser further down — confirm before removing.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [0]:
%run "../Data_Quality_and_profiling/include/configuration"

In [0]:
%run "../Data_Quality_and_profiling/include/common_functions"

In [0]:
# Load the raw media extract from the DQ raw folder as a Spark DataFrame.
# The file is comma-delimited and its first row holds the column names.
omd_sdf = (
    spark.read
    .options(delimiter=',', header=True)
    .csv(f'{dq_raw_folder_path}France_Final_v5.csv')
)


## Driver Code

In [0]:
# Run the shared preprocessing helper on the raw frame, declaring which
# measure columns it should treat as numeric.
df = preprocess_data(
    omd_sdf,
    numeric_cols=[
        'Net Spend (Local)',
        'Impressions',
        'Reach',
        'Clicks',
        'Valued Views',
        'Spot Length (sec)',
        'GRPs',
        'Adjusted GRPs',
        'TRPs',
        'Impacts (000s)',
        'Engagements',
    ],
)

# Bring the three date-like columns to a single yyyy-MM-dd format.
df = auto_date_parser(
    parse_df=df,
    date_cols=['Campaign Start Date', 'Campaign End Date', 'Week'],
    final_date_fmt="yyyy-MM-dd",
)

# NOTE(review): casting spend to an integer type discards any fractional part
# of the amount — confirm that whole-unit spend is what downstream checks expect.
df = df.withColumn(
    "Net Spend (Local)",
    df["Net Spend (Local)"].cast(IntegerType()),
)

# Drop only the rows in which every column is null.
df = df.dropna(how="all")



In [0]:
# Every column whose name contains 'Date' or 'Week' is validated as a date.
date_cols = [name for name in df.columns if 'Date' in name or 'Week' in name]

# yyyy-MM-dd pattern with per-month day limits (31-day months, 30-day months,
# and February capped at day 29 regardless of year).
# NOTE(review): the pattern ends with '$' but has no leading '^'; whether a
# prefix before the year is tolerated depends on how get_mismatch_perc applies it.
date_regex = '[0-9]{4}-(((0[13578]|(10|12))-(0[1-9]|[1-2][0-9]|3[0-1]))|(02-(0[1-9]|[1-2][0-9]))|((0[469]|11)-(0[1-9]|[1-2][0-9]|30)))$'

# Column name -> regex its values are checked against (same pattern for all).
data_quality_cols_regex = {name: date_regex for name in date_cols}


# Constants
numeric_cols = ['Net Spend (Local)', 'Impressions', 'Reach']
aggregate_cols = ['Brand Generation', 'Product', 'Week', 'Year-Week']
result_limit = 100


In [0]:
### 1. NULL Checks
# The null/empty percentage is requested for every column of the cleaned frame.
resultdf = get_null_perc(spark, df, null_cols=df.columns)
print("NULL/Empty Percentage for Columns")
# Show up to result_limit rows; the second argument (False) is passed positionally.
resultdf.show(result_limit, False)

In [0]:
###2. Summary, Average, Standard Deviation, Percentiles for Numeric Columns
# Only the columns listed in numeric_cols are passed to the summary helper.
resultdf = get_summary_numeric(df, numeric_cols)
print("Summary for Numeric Columns")
# Display at most result_limit rows of the helper's result.
resultdf.show(result_limit, False)

In [0]:
###3. Distinct Count
# Distinct counts are requested for the columns listed in aggregate_cols.
print("Distinct Counts for Aggregate Columns")
resultdf = get_distinct_counts(spark, df, aggregate_cols)
# Display at most result_limit rows of the helper's result.
resultdf.show(result_limit, False)

In [0]:
### 4. Distribution Count
print("Distribution Count for Aggregate Columns")
result = get_distribution_counts(spark, df, aggregate_cols)
# The helper's result is iterated frame by frame; the name of each frame's
# first column is used as the label in its banner line.
for dist_df in result:
    print(f"======== Distribution for - {dist_df.columns[0]} ========")
    dist_df.show(result_limit, False)

In [0]:
###5. Data Quality
# Each column in data_quality_cols_regex is checked against its regex
# (here: the date columns against the yyyy-MM-dd pattern).
print("Data Quality Issue Percentage for Columns")
resultdf = get_mismatch_perc(spark, df, data_quality_cols_regex)
# Display at most result_limit rows of the helper's result.
resultdf.show(result_limit, False)

In [0]:
# Columns submitted to the null rule.
null_check_columns = ['CampaignID', 'Sub Campaign', 'Digital Platform']

# toPandas() collects the whole Spark frame onto the driver before the rule runs.
# NOTE(review): threshold_per=10 is presumably a percentage — confirm in the helper.
err_df = err_df_null_rule(
    df=df.toPandas(),
    null_check_columns=null_check_columns,
    threshold_per=10,
)

In [0]:
# Build the DQ report from the full dataset (collected to pandas) plus the
# rule violations, writing it under dbfs_processed_path; the helper's return
# value is reported below as the output location.
dest_path = generate_report(
    raw_df=df.toPandas(),
    err_df=err_df,
    destination_path=dbfs_processed_path,
)
print(f'DQ file is generated in the location {dest_path}')

In [0]:
# Invoke the shared staging helper; no arguments are passed, so it relies on
# its own defaults/configuration.
# NOTE(review): presumably uploads the generated DQ output to S3 — the helper
# is defined outside this notebook, so confirm what it stages and from where.
stage_data_to_s3()

### Reconciliation Tool

In [0]:
# The cleaned media frame under the name used by the reconciliation step.
# NOTE(review): presumably looked up by name through config_string — keep the
# variable names in this cell unchanged unless that configuration is updated too.
media_df = df

# Open and parse the mapping workbook once, pulling both sheets in a single
# read. Passing a list to sheet_name returns a dict keyed by sheet name.
mapping_sheets = pd.read_excel(
    f'{dbfs_mapping_path}ROI_mappings_FR.xlsx',
    sheet_name=['Media_Mappings', 'TV Campaign Mapping'],
)

# Every cell is converted to a string before handing the frame to Spark, so
# mixed-type Excel columns do not break schema inference. Note that empty
# cells therefore become the literal string 'nan'.
media_mapping_df = spark.createDataFrame(
    mapping_sheets['Media_Mappings'].astype(str)
)
media_mapping_campaign_df = spark.createDataFrame(
    mapping_sheets['TV Campaign Mapping'].astype(str)
)

In [0]:
# Run the reconciliation helper for the configuration held in config_string,
# with fuzzy matching switched on and a tolerance value of 1.0.
# NOTE(review): config_string and the helper are defined outside this notebook;
# presumably this compares media_df with media_mapping_df — confirm.
reconcilliation_tool(config_string, get_fuzzy_match = True, tol_val = 1.0)

Product,MEDIA_BRAND,_merge
6SS-CORE,6SS-CORE,both
BENIGN,,Raw Media Data
BENIGN-CORE,BENIGN-CORE,both
BENIGN-CORE,BENIGN-CORE,both
,BENIGN-ESSENTIALS,Media Mapping Data
CORE,,Raw Media Data
DAWNS-CORE,DAWNS-CORE,both
FRACTAL,FRACTAL,both
FRACTAL-CORE,FRACTAL-CORE,both
,FRED-CORE,Media Mapping Data


Product,MEDIA_BRAND
MARVINS EDGE,MARVINS EDGE
TSC-POWER,TSC-POWER
ICHOR-MELONSODA,ICHOR-MELONSODA
WALNUTS CORE,WALNUTS-CORE
WALNUTS CORE,WALNUTS CORE
MULTI-PRODUCT,MULTI-PRODUCT
BENIGN-CORE,BENIGN-CORE
MARVINS-EASTBLUE,MARVINS-EASTBLUE
MARVINS EDGE IIT,MARVINS EDGE IIT
PREFECT,PREFECT


Brand Generation,GROUP_NAME,_merge
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data
,0,Media Mapping Data


Brand Generation,GROUP_NAME
Somras,Somras
SOMRAS,Somras


In [0]:
# Second reconciliation run, driven by config_string1, again with fuzzy
# matching switched on and a tolerance value of 1.0.
# NOTE(review): config_string1 is defined outside this notebook; presumably it
# targets the TV campaign mapping (media_mapping_campaign_df) — confirm.
reconcilliation_tool(config_string1, get_fuzzy_match = True, tol_val = 1.0)

Campaign,Raw TV Campaign,_merge
6S,6S,both
6S,6S,both
6S,6S,both
6S,6S,both
Benign fin d'annÃ©e,,Raw Media Data
Burning Desires,,Raw Media Data
CELEBRE,,Raw Media Data
CHEFSPECIAL,CHEFSPECIAL,both
CITRUS,,Raw Media Data
CORE,CORE,both


Campaign,Raw TV Campaign
Equity Power,Equity Power
Power,Power
,6S
Citrus,Citrus
6S,6S
GRIN HOLEE,GRIN HOLEE
Grin,GREEN
Grin,GRIN
Eastblue,Eastblue
PREFECT,Prefect


In [0]:
# Inspection only: list the mount points currently available to this workspace.
dbutils.fs.mounts()

In [0]:
# Inspection only: list the contents of the mounted DQ_Raw folder.
# NOTE(review): the mount path is hard-coded here rather than taken from the
# shared configuration — confirm it matches dq_raw_folder_path.
dbutils.fs.ls('/mnt/azdatabrickscourse/raw/DQ_Raw/')

In [0]:
%sh
# Recursively delete the mounted DQ_Raw directory and everything under it.
# NOTE(review): this is destructive and cannot be undone. If the notebook's
# input CSV is read from this folder, a subsequent run will no longer find it —
# confirm the cleanup is intended before running this cell.
rm -r /dbfs/mnt/azdatabrickscourse/raw/DQ_Raw/