# 0. Filtering
Before cleaning the data and applying the pre-processing steps, we first check whether any products and services already match an entry in the EP catalogue exactly (assuming that the EP catalogue contains clean data). This reduces the number of records we need to pre-process. Since we still need those records for the final output, we keep them and flag them as already matched to the EP catalogue.

## 0.1 Import the required libraries


In [10]:
import pandas as pd

## 0.2 Load in the raw data

### EuroPages Catalogue

In [11]:
# Path to the CSV holding the scraped EuroPages (EP) products catalogue
europages_activities_catalogue_location = "../../data/example_data/input/scraped_data/scraped_EP_products_catalogue.csv"

# Load the catalogue into a dataframe
europages_activities_catalogue = pd.read_csv(
    filepath_or_buffer=europages_activities_catalogue_location
)

# Show the loaded catalogue
display(europages_activities_catalogue)

Unnamed: 0,products_and_services,products_id,ID
0,a0 size photocopy,d767925b2b31998720be47150c317b8b,1
1,a4 self-adhesive sheets,8b14539d24b4e721f7b0a2f4355e2698,2
2,abamectin,3f9bdd95cac6914274f80462954d0c04,3
3,abattoir with agreed store,e4ddbcf93fb7a754239358b82e0892de,4
4,abattoirs,0c43e8a2b18ffba9573fd91343865dca,5
...,...,...,...
42449,zootechnical feed,31038b5276adc87194b0f647ae36abf8,42450
42450,zootechnical foods,5306a7f02167e79365c7626ac48b4ff5,42451
42451,zootechnical integrators,62f4c57406cd9fb1190b67ea25cb48ce,42452
42452,zootechnics - equipment,3d86fe5ed83d4c4fe2304a311967a39e,42453


### Base Data

In [12]:
# Path to the CSV holding the unprocessed base tilt products and services
base_tilt_data_location = "../../data/example_data/input/tilt_base_products_and_services_unprocessed.csv"

# Load the base data into a dataframe
base_tilt_data = pd.read_csv(filepath_or_buffer=base_tilt_data_location)

# Show the loaded base data
display(base_tilt_data)

Unnamed: 0,products_id,products_and_services
0,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen"
1,b0d3c55743b1b858ec2843c8870116bb8af543fd,drilling and test boring - equipment
2,b14c038972e6a52bfbf3ffbe77def57a62c5b9cf,well-management services
3,abadc2542b4b5c1ecfe41c22afb2347b1d9b65af,electronic data processing - software
4,60c58ad2ef34d96fae028f1039fab03dec9eb9a2,communication
...,...,...
32058,a56bfdd9971ddba76de33e5dd394faab63d2c58c,trading in non-ferrous products
32059,d16685f9db86a7e446d5a4c763a17016ffdfa613,precision weights for scales
32060,37c8e6d302d907a76f49d45a91949c86dd5fcc03,weights and masses - measurement and verificat...
32061,4aa756effa61af41058cf80f475a03b439232cfe,manicure scissors


### tilt Italy Data

In [13]:
# Path to the CSV holding the unprocessed tilt Italy products and services
italy_tilt_data_location = "../../data/example_data/input/tilt_italy_products_and_services_unprocessed.csv"

# Load the Italy data into a dataframe
italy_tilt_data = pd.read_csv(filepath_or_buffer=italy_tilt_data_location)

# Show the loaded Italy data
display(italy_tilt_data)

Unnamed: 0,products_and_services,products_id
0,persian blue salt,50abde66-58b7-4fb3-a007-1077fa41a010
1,organic saffron bio,0ea739bc-c3fb-4476-b8d6-96ba5085aa00
2,rossoro,2b898f73-e699-4b52-926d-476b51dacd42
3,peas,a7849fec-bf97-48b0-b89b-cb910a8d6dc8
4,cauliflowers,e09840c2-df4a-4920-b91b-e23177b5f5b3
...,...,...
3430,enamels,77890bab-e5c2-40d1-bcf7-f11fb9ac8c09
3431,breathable and mould resistant plastic coverings,d675d682-018e-4829-ac66-e61155d7a45e
3432,quartz paints,e5f78c8c-b79e-4a12-8451-867349f09cc6
3433,carpet,1f4814fc-759f-49b6-b2cc-9ba1c77582bd


## 0.3 Filtering

In [14]:
def exclude_covered_records(df_1, df_2):
    """Flag which records of ``df_1`` still need processing.

    A record of ``df_1`` counts as covered when its 'products_and_services'
    value also occurs in ``df_2``. Covered records get ``to_process = False``,
    all other records get ``to_process = True``. Neither input is modified.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Records to flag. Must contain the columns 'products_id' and
        'products_and_services'.
    df_2 : pandas.DataFrame
        Reference catalogue. Must contain the columns 'products_and_services',
        'products_id' and 'ID'.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df_1`` exactly once, with a boolean 'to_process'
        column added. Covered rows come first, followed by the uncovered rows,
        and the index is renumbered. The catalogue's 'ID' and 'products_id'
        are discarded, so the output carries no link to the matched catalogue
        entry. Other non-key columns present in both inputs receive pandas'
        default ``_x``/``_y`` merge suffixes on the covered rows.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    key = 'products_and_services'

    # Keep one catalogue row per key value; otherwise the inner merge would
    # emit one output row per duplicate and multiply the records of df_1.
    catalogue = df_2.drop_duplicates(subset=key)

    # Records of df_1 whose text matches a catalogue entry exactly
    covered = df_1.merge(catalogue, on=key, how='inner')

    # Discard the catalogue identifiers and restore df_1's own id column name
    covered = covered.drop(['ID', 'products_id_y'], axis=1)
    covered = covered.rename(columns={'products_id_x': 'products_id'})
    covered['to_process'] = False

    # Select the remainder on the same key that was used for matching, so an
    # unmatched record is never lost because it shares a products_id with a
    # matched one.
    uncovered = df_1[~df_1[key].isin(catalogue[key])].copy()
    uncovered['to_process'] = True

    flagged = pd.concat([covered, uncovered], ignore_index=True)

    # Guarantee a real boolean column even when one of the two parts is empty
    flagged['to_process'] = flagged['to_process'].astype(bool)

    return flagged

### Base Data

In [15]:
# Flag the base records that already occur in the EuroPages catalogue
filtered_base_data = exclude_covered_records(
    df_1=base_tilt_data,
    df_2=europages_activities_catalogue,
)

### tilt Italy Data

In [16]:
# Flag the Italy records that already occur in the EuroPages catalogue
filtered_italy_data = exclude_covered_records(
    df_1=italy_tilt_data,
    df_2=europages_activities_catalogue,
)

## 0.4 Export the filtered dataframe

### Base Data

In [17]:
# Destination CSV for the flagged base data
output_filtered_base_data = "../../data/example_data/output/base_data/base_flagged_products.csv"

# Save the flagged base data without writing the dataframe index
filtered_base_data.to_csv(path_or_buf=output_filtered_base_data, index=False)

### tilt Italy Data

In [18]:
# Destination CSV for the flagged Italy data
output_filtered_italy_data = "../../data/example_data/output/italy_data/italy_flagged_products.csv"

# Save the flagged Italy data without writing the dataframe index
filtered_italy_data.to_csv(path_or_buf=output_filtered_italy_data, index=False)