# Data Optimization - Johnson & Johnson

In [6]:
import glob as glob
import csv
import pandas as pd
import col_types

files_processed = []
if glob.glob('./data/files_processed.txt'):
    with open('./data/files_processed.txt', 'r') as file:
        reader = csv.reader(file, delimiter='\n')
        for row in reader:
            files_processed += row
            
files_full = [file for file in glob.glob('./data/full/*.txt') if file not in files_processed]
files_delta = [file for file in glob.glob('./data/delta/cvtJnJVisionDelta09*.txt') if file not in files_processed]
files = files_full + files_delta

# Raw data

## Leitura dos arquivos

In [7]:
full_df_vec = []

for file in files_full:
    full_df_vec.append(pd.read_csv(file, sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates))

if full_df_vec:
    full_df = pd.concat(full_df_vec)
    full_df = full_df.reset_index(drop=True)

## Agrupando dados

Requisitos:

Group rows by

- poNumber
- costCenter
- primaryInternalOrder
- profitCenter
- generalLedgerAccount
- needByDate
- poEndDate
- poStartDate
- receivableIndicator
- projectWbs
- matOrSrc
- accountingActivityCode

In [8]:
if not full_df.empty:
    full_df = full_df.reset_index(names='id')
    full_df['delta'] = -1

    accrual = full_df.groupby(['poNumber', 'costCenter', 'primaryInternalOrder',
    'profitCenter', 'generalLedgerAccount', 'needByDate',
    'poEndDate', 'poStartDate', 'receivableIndicator',
    'projectWbs', 'matOrSrc'], dropna=False)

    full_df['id'] = accrual['id'].transform('min')

No momento de agrupar, é necessário somar os campos que serão mesclados.

`SUM(poValueInGlobalCurrency)`

`SUM(poValueInLocalCurrency)`

`SUM(poValueInDocCurrency)`

`SUM(gdsReceiptValueInGlobalCurrency)`

`SUM(gdsReceiptValueInlocalCurrency)`

`SUM(gdsReceiptValueInDocCurrency)`

`SUM(invoiceReceiptValueInGlobalCurrency)`

`SUM(invoiceReceiptValueInLocalCurrency)`

`SUM(invoiceReceiptValueInDocCurrency)`

`SUM(deliverTo)`

Fazer persistir colunas que somem após operação de soma, fazer como:
`'columnName': 'first'` na propriedade do aggreggate `.agg()`

In [9]:
if not full_df.empty:
    accrued_df = accrual.agg({
    'id': 'min',
    'delta': 'max',

    'poNumber': 'first',
    'costCenter': 'first',
    'primaryInternalOrder': 'first',
    'profitCenter': 'first',
    'generalLedgerAccount': 'first',
    'needByDate': 'first',
    'poEndDate': 'first',
    'poStartDate': 'first',
    'receivableIndicator': 'first',
    'projectWbs': 'first',
    'matOrSrc': 'first',

    'poName': 'first',
    'poRequisitionerWwid': 'first',
    'poRequisitionerWwid': 'first',
    'poRequisitionerName': 'first',
    'poPreparerWwid': 'first',
    'poPreparerName': 'first',
    'costCenterDesc': 'first',
    'generalLedgerAccountDesc': 'first',
    'projectWbs': 'first',
    'supplierNumber': 'first',
    'supplierName': 'first',
    'supplierEmailAddress': 'first',
    'poType': 'first',
    'poStatus': 'first',
    'poCloseStatus': 'first',
    'poCreationDate': 'first',
    'receiptDates': 'first',
    'invoiceDates': 'first',
    'invoicePaidStatus': 'first',
    'transactionDate': 'first',
    'clearingDocumentRef': 'first',
    'clearingDateReference': 'first',
    'localCurrencyForPoValue': 'first',
    'documentCurrencyForPoValue': 'first',
    'localCurrencyForGoodsReceipt': 'first',
    'documentCurrencyForGoodsReceipt': 'first',
    'localCurrencyForInvoiceReceipt': 'first',
    'docCurrencyForInvoiceReceipt': 'first',
    'poValueInGlobalCurrency': 'first',
    'poValueInLocalCurrency': 'first',
    'poValueInDocCurrency': 'first',
    'gdsReceiptValueInGlobalCurrency': 'first',
    'gdsReceiptValueInlocalCurrency': 'first',
    'gdsReceiptValueInDocCurrency': 'first',
    'invoiceReceiptValueInGlobalCurrency': 'first',
    'invoiceReceiptValueInLocalCurrency': 'first',
    'invoiceReceiptValueInDocCurrency': 'first',
    'aribaBu': 'first',
    'mrc': 'first',
    'companyCode': 'first',
    'legalEntity': 'first',
    'fsid': 'first',
    'region': 'first',
    'businessArea': 'first',
    'shipTo': 'first',
    'deliverTo': 'first',
    'commodityType': 'first',
    'excludeDownpaymentRequestsForPayments': 'first',
    'sourceSystemApprovableId': 'first',
    'requisitionNumber': 'first',
    'receivableIndicator': 'first',
    'poLineNumber': 'first',
    'splitLineNumber': 'first',
    
    'poValueInGlobalCurrency': 'sum',
    'poValueInLocalCurrency': 'sum',
    'poValueInDocCurrency': 'sum',
    'gdsReceiptValueInGlobalCurrency': 'sum',
    'gdsReceiptValueInlocalCurrency': 'sum',
    'gdsReceiptValueInDocCurrency': 'sum',
    'invoiceReceiptValueInGlobalCurrency': 'sum',
    'invoiceReceiptValueInLocalCurrency': 'sum',
    'invoiceReceiptValueInDocCurrency': 'sum',
    'deliverTo': 'sum'
    }).reset_index(drop=True)

    accrued_df['delta'] = accrued_df['delta'] + 1

In [10]:
#Sinaliza qual PO sofreu aggregate e qual é raw (original)
full_df['isRaw'] = True
accrued_df['isRaw'] = False

final_df = pd.concat([full_df, accrued_df])
final_df = final_df.reset_index(drop=True)

## Salvando arquivos

In [11]:
if not final_df.empty:
    final_df.to_csv('./data/accruedDataJnJ.csv', index=False, sep='|')

    with open("./data/files_processed.txt", "w") as txt_file:
        for line in files_full:
            txt_file.write(line + "\n")

## Gerando amostras de dados

Aqui serão geradas amostras de dados para fins de estudo. Seleciona-se as 10 POs que mais se repetiram e as 10 que menos se repetiram.

if not full_df.empty:
    accruedFrames = [
        accrued_df.sort_values(['poValueInGlobalCurrency'], ascending=False).head(2), 
        accrued_df.sort_values(['poValueInGlobalCurrency'], ascending=True).head(2)
    ]

    rawFrames = []

    for accruedFrame in accruedFrames:
        for index, row in accruedFrame.iterrows():
            isNull = row.isnull();
            rawFrames.append(df.loc[
                ((df['poNumber'] == row['poNumber']) | (isNull['poNumber'] & df['poNumber'].isnull())) &
                ((df['costCenter'] == row['costCenter'])  | (isNull['costCenter'] & df['costCenter'].isnull())) &
                ((df['primaryInternalOrder'] == row['primaryInternalOrder'])  | (isNull['primaryInternalOrder'] & df['primaryInternalOrder'].isnull())) &
                ((df['profitCenter'] == row['profitCenter'])  | (isNull['profitCenter'] & df['profitCenter'].isnull())) &
                ((df['generalLedgerAccount'] == row['generalLedgerAccount'])  | (isNull['generalLedgerAccount'] & df['generalLedgerAccount'].isnull())) &
                ((df['needByDate'] == row['needByDate'])  | (isNull['needByDate'] & df['needByDate'].isnull())) &
                ((df['poEndDate'] == row['poEndDate'])  | (isNull['poEndDate'] & df['poEndDate'].isnull())) &
                ((df['receivableIndicator'] == row['receivableIndicator'])  | (isNull['receivableIndicator'] & df['receivableIndicator'].isnull())) &
                ((df['projectWbs'] == row['projectWbs'])  | (isNull['projectWbs'] & df['projectWbs'].isnull())) &
                ((df['matOrSrc'] == row['matOrSrc'])  | (isNull['matOrSrc'] & df['matOrSrc'].isnull()))
            ])

    pd.concat(rawFrames).to_csv('./data/sampleData-rawPOs.csv', sep='|', index=False)
    pd.concat(accruedFrames).to_csv('./data/sampleData-accruedPOs.csv', sep='|', index=False)


# Processando Delta

## Leitura de arquivos Delta

In [12]:
delta_df_vec = []

#Lê arquivos delta e armazena num vetor
for file in files_delta:
    df = pd.read_csv(file, sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates)
    df['delta'] = -2
    delta_df_vec.append(df)


#Lê arquivos já accruados e seleciona as POs raws (que não são frutos de um aggregate)
accruedDataJnJ_df = pd.read_csv('./data/accruedDataJnJ.csv', sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates)
raw_df = accruedDataJnJ_df.loc[accruedDataJnJ_df['isRaw'] == True]

#Preenche as POs do delta com novos IDs (começando do último)
last_id = accruedDataJnJ_df['id'].max()
delta_df = pd.concat(delta_df_vec)
delta_df.insert(0, 'id', range(last_id + 1, last_id + len(delta_df) + 1))

delta_raw_df_vec = [delta_df, raw_df]

#Concatena todos data frames e cria coluna de poId para identificar qual PO foi editada ou se é PO nova
if delta_df_vec:
    delta_raw_df = pd.concat(delta_raw_df_vec)
    delta_raw_df = delta_raw_df.reset_index(drop=True)
    delta_raw_df['poId'] = delta_raw_df['poNumber'] + delta_raw_df['poLineNumber'] + delta_raw_df['splitLineNumber']

## POs novas

In [None]:
if delta_df_vec:

    accrual = delta_raw_df.groupby(['poNumber', 'poLineNumber', 'splitLineNumber'], dropna=False)

    #Conta quantas vezes a PO se repete para ver se é nova ou editada
    delta_raw_df['count'] = accrual['poId'].transform('count')
    new_po_df = delta_raw_df.loc[(delta_raw_df['count'] == 1) & (delta_raw_df['delta'] == -2)]
    new_po_df['delta'] = -1
    new_po_df['isRaw'] = True
    del new_po_df['poId']
    del new_po_df['count']

In [14]:
#POs que já existiam
old_po_df = accruedDataJnJ_df.loc[accruedDataJnJ_df['delta'] == accruedDataJnJ_df['delta'].max()]

#POs que já existiam concatenadas com POs novas
to_accrual_df = pd.concat([old_po_df, new_po_df])

#Agrupando as POs
new_accrual = to_accrual_df.groupby(['poNumber', 'costCenter', 'primaryInternalOrder',
    'profitCenter', 'generalLedgerAccount', 'needByDate',
    'poEndDate', 'poStartDate', 'receivableIndicator',
    'projectWbs', 'matOrSrc'], dropna=False)

#Salva o menor id dos grupos nas novas POs
to_accrual_df['id'] = new_accrual['id'].transform('min')
new_po_df = to_accrual_df.loc[to_accrual_df['delta'] == -1]

Agrega os grupos, selecionando o menor ID de cada grupo e o maior delta (talvez não seja necessário?)

In [15]:
new_accrued_df = new_accrual.agg({
    'id': 'min',
    'isRaw': 'first',
    'delta': 'max',

    'poNumber': 'first',
    'costCenter': 'first',
    'primaryInternalOrder': 'first',
    'profitCenter': 'first',
    'generalLedgerAccount': 'first',
    'needByDate': 'first',
    'poEndDate': 'first',
    'poStartDate': 'first',
    'receivableIndicator': 'first',
    'projectWbs': 'first',
    'matOrSrc': 'first',

    'poName': 'first',
    'poRequisitionerWwid': 'first',
    'poRequisitionerWwid': 'first',
    'poRequisitionerName': 'first',
    'poPreparerWwid': 'first',
    'poPreparerName': 'first',
    'costCenterDesc': 'first',
    'generalLedgerAccountDesc': 'first',
    'projectWbs': 'first',
    'supplierNumber': 'first',
    'supplierName': 'first',
    'supplierEmailAddress': 'first',
    'poType': 'first',
    'poStatus': 'first',
    'poCloseStatus': 'first',
    'poCreationDate': 'first',
    'receiptDates': 'first',
    'invoiceDates': 'first',
    'invoicePaidStatus': 'first',
    'transactionDate': 'first',
    'clearingDocumentRef': 'first',
    'clearingDateReference': 'first',
    'localCurrencyForPoValue': 'first',
    'documentCurrencyForPoValue': 'first',
    'localCurrencyForGoodsReceipt': 'first',
    'documentCurrencyForGoodsReceipt': 'first',
    'localCurrencyForInvoiceReceipt': 'first',
    'docCurrencyForInvoiceReceipt': 'first',
    'poValueInGlobalCurrency': 'first',
    'poValueInLocalCurrency': 'first',
    'poValueInDocCurrency': 'first',
    'gdsReceiptValueInGlobalCurrency': 'first',
    'gdsReceiptValueInlocalCurrency': 'first',
    'gdsReceiptValueInDocCurrency': 'first',
    'invoiceReceiptValueInGlobalCurrency': 'first',
    'invoiceReceiptValueInLocalCurrency': 'first',
    'invoiceReceiptValueInDocCurrency': 'first',
    'aribaBu': 'first',
    'mrc': 'first',
    'companyCode': 'first',
    'legalEntity': 'first',
    'fsid': 'first',
    'region': 'first',
    'businessArea': 'first',
    'shipTo': 'first',
    'deliverTo': 'first',
    'commodityType': 'first',
    'excludeDownpaymentRequestsForPayments': 'first',
    'sourceSystemApprovableId': 'first',
    'requisitionNumber': 'first',
    'receivableIndicator': 'first',
    'poLineNumber': 'first',
    'splitLineNumber': 'first',
    
    'poValueInGlobalCurrency': 'sum',
    'poValueInLocalCurrency': 'sum',
    'poValueInDocCurrency': 'sum',
    'gdsReceiptValueInGlobalCurrency': 'sum',
    'gdsReceiptValueInlocalCurrency': 'sum',
    'gdsReceiptValueInDocCurrency': 'sum',
    'invoiceReceiptValueInGlobalCurrency': 'sum',
    'invoiceReceiptValueInLocalCurrency': 'sum',
    'invoiceReceiptValueInDocCurrency': 'sum',
    'deliverTo': 'sum'
    })

new_accrued_df['delta'] = new_accrued_df['delta'].max() + 1

In [16]:
new_accrued_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,id,isRaw,delta,poNumber,costCenter,primaryInternalOrder,profitCenter,generalLedgerAccount,needByDate,poEndDate,...,region,businessArea,shipTo,deliverTo,commodityType,excludeDownpaymentRequestsForPayments,sourceSystemApprovableId,requisitionNumber,poLineNumber,splitLineNumber
poNumber,costCenter,primaryInternalOrder,profitCenter,generalLedgerAccount,needByDate,poEndDate,poStartDate,receivableIndicator,projectWbs,matOrSrc,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
551245727,5065,,,6251100601,2011-11-01,2011-03-31,2011-03-01,Y,,S,653610,True,1,551245727,5065,,,6251100601,2011-11-01,2011-03-31,...,ARBNE,#1,Calle 15 31146NIT 890.101.815 9 Acopi Yumbo VA,0.0,INDIRECT,N,,PR5709950-V2,1,1
551665419,#,,CONSUMER,5144100108,2012-12-20,NaT,NaT,Y,,M,487461,True,1,551665419,#,,CONSUMER,5144100108,2012-12-20,NaT,...,ARBNE,#1,Calle 15 31146NIT 890.101.815 9 Acopi Yumbo VA,1.0,INDIRECT,N,,PR6709884,1,1
551665423,#,,CONSUMER,5144100108,2012-12-20,NaT,NaT,Y,,M,491755,True,1,551665423,#,,CONSUMER,5144100108,2012-12-20,NaT,...,ARBNE,#1,Calle 15 31146NIT 890.101.815 9 Acopi Yumbo VA,1.0,INDIRECT,N,,PR6709859,1,1
551667297,3621,,,6251100602,2012-12-17,2013-01-11,2012-12-09,Y,,S,653540,True,1,551667297,3621,,,6251100602,2012-12-17,2013-01-11,...,ARBNE,#1,30500572309 Ruta 8 Km 63.500 Fatima BU,0.0,INDIRECT,N,,PR6702310,1,1
551673012,#,,CONSUMER,6553300104,2013-01-18,NaT,NaT,Y,,M,626689,True,1,551673012,#,,CONSUMER,6553300104,2013-01-18,NaT,...,ARBNE,#1,Carrera 36 13100 ACOPI YUMBONIT 860 Acopi Yum...,6302.0,INDIRECT,N,,PR6740956-V2,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P23339691R,,448030TNL768,,635300008,2020-09-08,2020-09-08,NaT,D,,S,505932,True,1,P23339691R,,448030TNL768,,635300008,2020-09-08,2020-09-08,...,ARBEM,1439,Graaf Engelbertlaan 75 Breda #,0.0,BOTH,N,OnDemand,PR4844153,1,1
P23339692R,2161136,,,,2020-09-07,NaT,NaT,D,,M,567061,True,1,P23339692R,2161136,,,,2020-09-07,NaT,...,ARBEM,1246,"The National Technological Park, Plassey Limer...",0.0,INDIRECT,Y,OnDemand,PR4783581,1,1
P23339693R,2161116,,,,2021-05-24,NaT,NaT,D,,M,588231,True,1,P23339693R,2161116,,,,2021-05-24,NaT,...,ARBEM,1246,"The National Technological Park, Plassey Limer...",0.0,INDIRECT,Y,OnDemand,PR4810306,1,1
P23339694R,2030032674,,,65555324,2020-09-18,2020-09-18,NaT,D,,S,501304,True,1,P23339694R,2030032674,,,65555324,2020-09-18,2020-09-18,...,ARBEM,1142,Simpson Parkway Kirkton Campus Livingston #,0.0,INDIRECT,N,OnDemand,PR4843310,1,1


In [17]:
new_po_df['isRaw'] = True
new_accrued_df['isRaw'] = False

final_df = pd.concat([accruedDataJnJ_df, new_po_df, new_accrued_df])
final_df = final_df.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_po_df['isRaw'] = True


## Salvando arquivos

In [20]:
if not final_df.empty:
    final_df.to_csv('./data/accruedDataJnJ.csv', index=False, sep='|')

    #with open("./data/deltas_processed.txt", "w") as txt_file:
    #    for line in files_delta:
    #        txt_file.write(line + "\n")

In [21]:
test = pd.read_csv('./data/accruedDataJnJ.csv', sep='|', dtype=col_types.types_dict, parse_dates=col_types.parse_dates)