In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
import data_transformation as transformer
import crewai_mapper as mapper
import data_ingestion as ingestion

## 1. Data ingestion

In [5]:
INPUT_PATH = Path("./input_data/Openstaande posten crediteuren 300423 SAP.XLSX")
OUTPUT_PATH = Path("./output_data")

In [6]:
VALIDATION_RULES = ingestion.VALIDATION_RULES

In [7]:
df = ingestion.file_ingestion(path_dir=INPUT_PATH)

## 2. Transform data

In [8]:
sub_df = transformer.transform_nonstandard_SAP(df=df, new_column_name="Cust. name")
csv =sub_df.head(10).to_csv(sep=',')

In [9]:
DESTINATION_COLUMNS = VALIDATION_RULES["parametres"]["names"]
SOURCE_COLUMNS = sub_df.columns.to_list()
TO_DISCARD = VALIDATION_RULES["parametres"]["to_discard"]

## 3. CrewAI mapping suggestion

In [10]:
response = mapper.get_mapping(expected=DESTINATION_COLUMNS, input=csv)



[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThere are several columns in the data set that could match to the ['Debiteurnaam', 'Factuurnummer', 'Datum', 'Bedrag_EUR'] columns. 
The closest matches for each column in the target set is as follows:
'Debiteurnaam' --> 'Cust. name'
'Factuurnummer' --> 'Belegnummer'
'Datum' --> 'Belegdatum'
'Bedrag_EUR' --> 'Betrag in Hauswährung'

Final Answer:
'Debiteurnaam' --> 'Cust. name'
'Factuurnummer' --> 'Belegnummer'
'Datum' --> 'Belegdatum'
'Bedrag_EUR' --> 'Betrag in Hauswährung'

The above are the potential matches for the ['Debiteurnaam', 'Factuurnummer', 'Datum', 'Bedrag_EUR'] columns from the input data set. The 'Cust. Name' column in the input data set could match the 'Debiteurnaam' column in the target set. The 'Belegnummer' column in the input data set could match the 'Factuurnummer' column in the target set. The 'Belegdatum' column in the input data set could match the 'Datum' column in the target set. Finally, the 'B

## 4. Map data according to suggestion

In [11]:
source_columns = transformer.extract_substrings(response, SOURCE_COLUMNS)
destination_columns = transformer.extract_substrings(response, DESTINATION_COLUMNS)
mapping = transformer.create_mapping(source_columns=source_columns, 
                                     destination_columns=destination_columns)

In [14]:
mapped_df = transformer.mapper(mapping=mapping, to_discard=TO_DISCARD, df=df)

In [15]:
print(mapped_df)

    Symbol Posten offen/ausg.  Factuurnummer      Datum  Bedrag_EUR
0                         NaN   1.200402e+09 2023-04-30    51671.15
1                         NaN   1.200402e+09 2023-04-30     3919.22
2                 @5C\Qoffen@            NaN        NaT    55590.37
3              Konto 30036349            NaN        NaT    55590.37
4                         NaN   1.200401e+09 2023-04-25     -115.65
..                        ...            ...        ...         ...
683            Konto 30139870            NaN        NaT  -165902.22
684                       NaN   1.100129e+09 2023-04-12      -92.56
685               @5C\Qoffen@            NaN        NaT      -92.56
686            Konto 30140793            NaN        NaT      -92.56
687                       NaN            NaN        NaT -1736919.08

[688 rows x 4 columns]
