# Data Optimization - Johnson & Johnson

In [1]:
import glob as glob
import csv
import pandas as pd
import col_types

# Paths already handled by a previous run, one per line in the processed log.
files_processed = []
if glob.glob('./data/files_processed.txt'):
    with open('./data/files_processed.txt', 'r') as log_file:
        # Read the log as plain lines: it is a newline-separated list of paths,
        # not CSV, so quote characters in a path must not be interpreted.
        files_processed = [line.rstrip('\n') for line in log_file if line.strip()]

# Set copy for O(1) membership checks; `files_processed` itself stays a list.
already_done = set(files_processed)

# glob returns matches in arbitrary order; sort so that the concatenation order
# (and therefore the row ids derived from it later) is reproducible across runs.
files_full = [path for path in sorted(glob.glob('./data/full/*.txt')) if path not in already_done]
files_delta = [path for path in sorted(glob.glob('./data/delta/*.txt')) if path not in already_done]
files = files_full + files_delta

# Raw data

## Leitura dos arquivos

In [2]:
# Read every not-yet-processed full extract (pipe-separated) with the column
# dtypes and date columns declared in `col_types`.
df_vec = [
    pd.read_csv(path, sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates)
    for path in files_full
]

# Always bind `df`: the following cells test `df.empty`, which raises NameError
# on a fresh kernel if there was nothing to read and `df` was never assigned.
# `ignore_index=True` yields the same 0..n-1 index as concat + reset_index(drop=True).
df = pd.concat(df_vec, ignore_index=True) if df_vec else pd.DataFrame()

## Agrupando dados

Requisitos:

Group rows by

- poNumber
- costCenter
- primaryInternalOrder
- profitCenter
- generalLedgerAccount
- needByDate
- poEndDate
- poStartDate
- receivableIndicator
- projectWbs
- matOrSrc
- accountingActivityCode

In [None]:
if not df.empty:
    # Columns whose combined values identify one accrual group.
    # NOTE(review): the requirements list above also names `accountingActivityCode`,
    # which is not part of this grouping — confirm whether it should be added.
    group_keys = [
        'poNumber', 'costCenter', 'primaryInternalOrder',
        'profitCenter', 'generalLedgerAccount', 'needByDate',
        'poEndDate', 'poStartDate', 'receivableIndicator',
        'projectWbs', 'matOrSrc',
    ]

    # Expose the positional index as an `id` column and tag every raw row with delta = -1.
    df = df.reset_index(names='id')
    df['delta'] = -1

    # dropna=False keeps rows whose key columns are missing in groups of their own.
    accrual = df.groupby(group_keys, dropna=False, as_index=False)

    # Give every row the smallest id of its group, so rows of one group share an id.
    # The built-in 'min' avoids calling a Python lambda once per group.
    df['id'] = accrual['id'].transform('min')

In [None]:
# Spot check: raw rows that were assigned group id 6.
# Written as a top-level expression because Jupyter only renders the last
# top-level expression of a cell; an expression inside an `if` body is not shown.
df.loc[df['id'] == 6] if not df.empty else None

No momento de agrupar, é necessário somar os campos que serão mesclados.

`SUM(poValueInGlobalCurrency)`

`SUM(poValueInLocalCurrency)`

`SUM(poValueInDocCurrency)`

`SUM(gdsReceiptValueInGlobalCurrency)`

`SUM(gdsReceiptValueInlocalCurrency)`

`SUM(gdsReceiptValueInDocCurrency)`

`SUM(invoiceReceiptValueInGlobalCurrency)`

`SUM(invoiceReceiptValueInLocalCurrency)`

`SUM(invoiceReceiptValueInDocCurrency)`

`SUM(deliverTo)`

Para persistir as colunas que somem após a operação de soma, declare-as como
`'columnName': 'first'` no dicionário passado ao aggregate `.agg()`.

In [None]:
if not df.empty:
    # One output row per accrual group. Each column appears exactly once:
    # a dict literal silently keeps only the last value of a repeated key, so
    # listing a column as both 'first' and 'sum' hides which one actually applies.
    # Key order is kept as before because it determines the output column order.
    accrued_df = accrual.agg({
        # Group id (smallest row id in the group) and the raw-row delta marker.
        'id': 'min',
        'delta': 'max',

        # Grouping keys, carried over unchanged.
        'poNumber': 'first',
        'costCenter': 'first',
        'primaryInternalOrder': 'first',
        'profitCenter': 'first',
        'generalLedgerAccount': 'first',
        'needByDate': 'first',
        'poEndDate': 'first',
        'poStartDate': 'first',
        'receivableIndicator': 'first',
        'projectWbs': 'first',
        'matOrSrc': 'first',

        # Descriptive columns, carried over with 'first' so they survive the aggregation.
        'poName': 'first',
        'poRequisitionerWwid': 'first',
        'poRequisitionerName': 'first',
        'poPreparerWwid': 'first',
        'poPreparerName': 'first',
        'costCenterDesc': 'first',
        'generalLedgerAccountDesc': 'first',
        'supplierNumber': 'first',
        'supplierName': 'first',
        'supplierEmailAddress': 'first',
        'poType': 'first',
        'poStatus': 'first',
        'poCloseStatus': 'first',
        'poCreationDate': 'first',
        'receiptDates': 'first',
        'invoiceDates': 'first',
        'invoicePaidStatus': 'first',
        'transactionDate': 'first',
        'clearingDocumentRef': 'first',
        'clearingDateReference': 'first',
        'localCurrencyForPoValue': 'first',
        'documentCurrencyForPoValue': 'first',
        'localCurrencyForGoodsReceipt': 'first',
        'documentCurrencyForGoodsReceipt': 'first',
        'localCurrencyForInvoiceReceipt': 'first',
        'docCurrencyForInvoiceReceipt': 'first',

        # Monetary values are summed across the merged rows.
        'poValueInGlobalCurrency': 'sum',
        'poValueInLocalCurrency': 'sum',
        'poValueInDocCurrency': 'sum',
        'gdsReceiptValueInGlobalCurrency': 'sum',
        'gdsReceiptValueInlocalCurrency': 'sum',
        'gdsReceiptValueInDocCurrency': 'sum',
        'invoiceReceiptValueInGlobalCurrency': 'sum',
        'invoiceReceiptValueInLocalCurrency': 'sum',
        'invoiceReceiptValueInDocCurrency': 'sum',

        'aribaBu': 'first',
        'mrc': 'first',
        'companyCode': 'first',
        'legalEntity': 'first',
        'fsid': 'first',
        'region': 'first',
        'businessArea': 'first',
        'shipTo': 'first',
        # NOTE(review): `deliverTo` is summed as the requirements list asks; if it is
        # a text column, 'sum' concatenates the strings — confirm this is intended.
        'deliverTo': 'sum',
        'commodityType': 'first',
        'excludeDownpaymentRequestsForPayments': 'first',
        'sourceSystemApprovableId': 'first',
        'requisitionNumber': 'first',
        'poLineNumber': 'first',
        'splitLineNumber': 'first',
    }).reset_index(drop=True)

    # Raw rows carry delta = -1, so the aggregated rows end up with delta = 0.
    accrued_df['delta'] = accrued_df['delta'] + 1

In [None]:
# Spot check: raw rows of group id 6, for comparison with the aggregated row.
# Written as a top-level expression because Jupyter only renders the last
# top-level expression of a cell; an expression inside an `if` body is not shown.
df.loc[df['id'] == 6] if not df.empty else None

In [None]:
# Spot check: the aggregated row for group id 6.
# Written as a top-level expression because Jupyter only renders the last
# top-level expression of a cell; an expression inside an `if` body is not shown.
accrued_df.loc[accrued_df['id'] == 6] if not df.empty else None

In [None]:
# Stack the raw rows (delta = -1) on top of the aggregated rows (delta = 0).
# `accrued_df` only exists when there was data to aggregate, so guard the concat
# the same way the cells that build it are guarded.
if not df.empty:
    final_df = pd.concat([df, accrued_df], ignore_index=True)
else:
    final_df = df.copy()

## Salvando arquivos

In [None]:
if not df.empty:
    # Persist raw + aggregated rows as a pipe-separated file.
    final_df.to_csv('./data/accruedDataJnJ.csv', index=False, sep='|')

    # Append this batch to the processed log. `files_full` only holds files that
    # were not in the log yet, so rewriting the log with mode "w" would drop the
    # earlier entries and cause those files to be picked up again on a later run.
    with open("./data/files_processed.txt", "a") as txt_file:
        txt_file.writelines(path + "\n" for path in files_full)

## Gerando amostras de dados

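Aqui serão geradas amostras de dados para fins de estudo. Selecionam-se as 10 POs que mais se repetiram e as 10 que menos se repetiram. (Nota: o código abaixo atualmente seleciona 2 grupos de cada extremo, ordenados por `poValueInGlobalCurrency`, e não pelo número de repetições.)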

In [None]:
if not df.empty:
    # Number of aggregated groups taken from each end of the value ranking.
    sample_size = 2

    # Aggregated groups with the highest and the lowest poValueInGlobalCurrency.
    accruedFrames = [
        accrued_df.sort_values(['poValueInGlobalCurrency'], ascending=False).head(sample_size),
        accrued_df.sort_values(['poValueInGlobalCurrency'], ascending=True).head(sample_size),
    ]

    # Raw rows behind each sampled group. Every raw row carries the smallest id of
    # its group in `id`, and the aggregated row keeps that same id, so matching on
    # `id` selects exactly the group's rows. Re-comparing the key columns by hand
    # is error-prone: the previous filter left out `poStartDate`, so it could pull
    # in rows belonging to other groups.
    rawFrames = [
        df.loc[df['id'] == group_id]
        for accruedFrame in accruedFrames
        for group_id in accruedFrame['id']
    ]

    pd.concat(rawFrames).to_csv('./data/sampleData-rawPOs.csv', sep='|', index=False)
    pd.concat(accruedFrames).to_csv('./data/sampleData-accruedPOs.csv', sep='|', index=False)


# Processando Delta

## Leitura de arquivos Delta

In [3]:
# Read every not-yet-processed delta extract (pipe-separated) with the column
# dtypes and date columns declared in `col_types`.
df_vec = [
    pd.read_csv(path, sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates)
    for path in files_delta
]

# Always rebind `df`: without delta files it would otherwise keep whatever an
# earlier cell left in `df` (or be undefined), which is not delta data.
# `ignore_index=True` yields the same 0..n-1 index as concat + reset_index(drop=True).
df = pd.concat(df_vec, ignore_index=True) if df_vec else pd.DataFrame()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()