# 0. Filtering
Before cleaning the data and applying the pre-processing steps, we first check whether any products and services already match an entry in the EP catalogue exactly (ASSUMING THAT THE EP CATALOGUE CONTAINS CLEAN DATA). This reduces the number of records we need to pre-process. Since we still need those records for the final output, we will flag them by adding a link to the EP catalogue.

## 0.1 Import the required libraries


In [11]:
import pandas as pd
import os 

## 0.2 Load in the raw data

In [12]:
# Pick the input locations and the reader that fit the environment: when the
# DATABRICKS_RUNTIME_VERSION variable is absent the CSV files are read from
# relative local paths with pandas, otherwise they are read from abfss storage
# through Spark and converted to pandas dataframes.
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    # Local location of the scraped EuroPages products catalogue
    europages_activities_catalogue_location = "../../data/example_data/input/scraped_data/scraped_EP_products_catalogue.csv"
    # Local location of the unprocessed base products and services
    base_tilt_data_location = "../../data/example_data/input/tilt_base_products_and_services_unprocessed.csv"
    # Local location of the unprocessed Italy products and services
    italy_tilt_data_location = "../../data/example_data/input/tilt_italy_products_and_services_unprocessed.csv"

    # Load the three inputs directly as pandas dataframes
    europages_activities_catalogue = pd.read_csv(europages_activities_catalogue_location)
    base_tilt_data = pd.read_csv(base_tilt_data_location)
    italy_tilt_data = pd.read_csv(italy_tilt_data_location)

else:
    # Storage location of the scraped EuroPages products catalogue
    europages_activities_catalogue_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/input/scraped_data/scraped_EP_products_catalogue.csv"
    # Storage location of the unprocessed base products and services
    base_tilt_data_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/input/tilt_base_products_and_services_unprocessed.csv"
    # Storage location of the unprocessed Italy products and services
    italy_tilt_data_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/input/tilt_italy_products_and_services_unprocessed.csv"

    # NOTE(review): `spark` is not defined in this notebook; presumably it is
    # the session object injected by the Databricks runtime - confirm.
    # Each file is read with its first row as header and converted to pandas.
    europages_activities_catalogue = (
        spark.read.option("header", True)
        .csv(europages_activities_catalogue_location)
        .toPandas()
    )
    base_tilt_data = (
        spark.read.option("header", True)
        .csv(base_tilt_data_location)
        .toPandas()
    )
    italy_tilt_data = (
        spark.read.option("header", True)
        .csv(italy_tilt_data_location)
        .toPandas()
    )

## 0.3 Filtering

In [13]:
def exclude_covered_records(df_1, df_2):
    """Flag which records of ``df_1`` still need to be pre-processed.

    Records of ``df_1`` whose ``products_and_services`` value also occurs in
    ``df_2`` are considered covered and get ``to_process = False``; all other
    records of ``df_1`` get ``to_process = True``.

    Args:
        df_1: Dataframe with at least the columns ``products_id`` and
            ``products_and_services``.
        df_2: Catalogue dataframe with at least the columns ``products_id``
            and ``products_and_services``.

    Returns:
        A new dataframe with a fresh index: the covered records first,
        followed by the remaining records of ``df_1``, with a boolean
        ``to_process`` column.

    Raises:
        KeyError: If the merged dataframe has no ``ID`` or ``products_id_y``
            column to drop.
    """
    # Inner merge keeps only the records whose 'products_and_services' value
    # occurs in both dataframes
    covered = df_1.merge(df_2, on='products_and_services', how='inner')

    # Drop the ID column and the products_id coming from df_2, then restore
    # the original name of the products_id coming from df_1
    covered = covered.drop(['ID', 'products_id_y'], axis=1)
    covered = covered.rename(columns={'products_id_x': 'products_id'})

    # Covered records do not need to be processed
    covered['to_process'] = False

    # Records of df_1 whose products_id is not among the covered ones still
    # need processing. The flag is set explicitly here instead of filling NaN
    # after the concat: that relied on pandas silently downcasting an object
    # column, which is deprecated and would leave a non-boolean column.
    uncovered = df_1[~df_1['products_id'].isin(covered['products_id'])]
    uncovered = uncovered.assign(to_process=True)

    return pd.concat([covered, uncovered], ignore_index=True)

### Base Data

In [14]:
# Flag the base records against the EuroPages catalogue
filtered_base_data = exclude_covered_records(
    df_1=base_tilt_data,
    df_2=europages_activities_catalogue,
)

### tilt Italy Data

In [15]:
# Flag the Italy records against the EuroPages catalogue
filtered_italy_data = exclude_covered_records(
    df_1=italy_tilt_data,
    df_2=europages_activities_catalogue,
)

## 0.4 Export the filtered dataframe

In [16]:
# Export both flagged dataframes as CSV: through Spark to abfss storage when
# DATABRICKS_RUNTIME_VERSION is set, otherwise with pandas to local paths.
if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
    # Define the output path for the flagged base data
    output_filtered_base_data = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/base_data/base_flagged_products.csv"
    # Convert the pandas dataframe to a spark sql dataframe
    filtered_base_data_spark = spark.createDataFrame(filtered_base_data)
    # Write the dataframe with a header row, replacing any previous output
    filtered_base_data_spark.write.csv(output_filtered_base_data, mode="overwrite", header=True)

    # Define the output path for the flagged Italy data
    output_filtered_italy_data = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/italy_data/italy_flagged_products.csv"
    # Convert the pandas dataframe to a spark sql dataframe
    filtered_italy_data_spark = spark.createDataFrame(filtered_italy_data)
    # Write the dataframe with a header row, replacing any previous output
    filtered_italy_data_spark.write.csv(output_filtered_italy_data, mode="overwrite", header=True)
else:
    # Define the output path for the flagged base data
    output_filtered_base_data = "../../data/example_data/output/base_data/base_flagged_products.csv"
    # Define the output path for the flagged Italy data
    output_filtered_italy_data = "../../data/example_data/output/italy_data/italy_flagged_products.csv"

    # to_csv does not create missing parent folders, so make sure the output
    # directories exist before writing
    os.makedirs(os.path.dirname(output_filtered_base_data), exist_ok=True)
    os.makedirs(os.path.dirname(output_filtered_italy_data), exist_ok=True)

    # Write the dataframes without the pandas index column
    filtered_base_data.to_csv(output_filtered_base_data, index=False)
    filtered_italy_data.to_csv(output_filtered_italy_data, index=False)