## <span style="color: #777777;">Working Data Overview</span>

### <span style="color: #777777;">1. GNFR Invoice</span>

### <span style="color: #777777;">1.1. Data Overview</span>

In [1]:
# Shape and Invoice amount per file
import pandas as pd

dtype = {

    
    'Source.Name': 'str',
    'INVOICE_NBR': 'str',
    'INVOICE_LINE_NBR': 'str',
    'INVOICE_QTY': 'str',
    'PO_NBR': 'str',
    'SUPPLIER_NBR': 'str',
}

df = pd.read_csv('GNFR I2P Data.csv', encoding='UTF-8-SIG', dtype=dtype)

df_shape = df.shape
print("DataFrame shape:", df_shape)

# Calculate sum of INVOICE_LINE_AMOUNT for each Source.Name
df_amount_sum = df.groupby('Source.Name')['INVOICE_LINE_AMOUNT'].sum()

# Export df_amount_sum to a CSV file

#df_amount_sum.to_csv('i2p_invoice_amount_sum.csv', header=True, encoding='UTF-8-SIG')


print("\nSum of INVOICE_LINE_AMOUNT for each Source.Name:")
print(df_amount_sum)


FileNotFoundError: [Errno 2] No such file or directory: 'GNFR I2P Data.csv'

In [3]:
# Count rows for file
df_row_count = df['Source.Name'].value_counts()

# Sort df_row_count by the order of df_amount_sum's Source.Name column
df_row_count = df_row_count.reindex(df_amount_sum.index)

# Print row count for each Source.Name
print("Row count for each Source.Name:")
print(df_row_count)

df_row_count.to_csv('nonpo_row_count.csv', header=True, encoding='UTF-8-SIG')

# Calculate total row count
total_row_count = df_row_count.sum()
print("Total Row count:")
print(total_row_count)




Row count for each Source.Name:
Source.Name
TBC_GNFRPO_01012022-01312022.CSV    19542
TBC_GNFRPO_01012023-01312023.CSV    17789
TBC_GNFRPO_02012022-02282022.CSV    17106
TBC_GNFRPO_02012023-02282023.CSV    17378
TBC_GNFRPO_03012022-03312022.CSV    19951
TBC_GNFRPO_03012023-03312023.CSV    17774
TBC_GNFRPO_05012021-05312021.CSV     5980
TBC_GNFRPO_06012021-06302021.CSV     4639
TBC_GNFRPO_06012022-06302022.CSV    20257
TBC_GNFRPO_07012021-07312021.CSV     5765
TBC_GNFRPO_07012022-07312022.CSV    17824
TBC_GNFRPO_08012021-08312021.CSV     4234
TBC_GNFRPO_09012021-09302021.CSV     6598
TBC_GNFRPO_09012022-09302022.CSV    21022
TBC_GNFRPO_10012021-10312021.CSV     5565
TBC_GNFRPO_10012022-10312022.CSV    19568
TBC_GNFRPO_11012021-11302021.CSV     6381
TBC_GNFRPO_11012022-11302022.CSV    16221
TBC_GNFRPO_12012021-12312021.CSV    15508
TBC_GNFRPO_12012022-12312022.CSV    20429
Name: Source.Name, dtype: int64
Total Row count:
279531


In [3]:
#data types

df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323412 entries, 0 to 323411
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   INVOICE_DATE       323412 non-null  object 
 1   Source.Name        323412 non-null  object 
 2   INVOICE_LINE_NBR   323412 non-null  int64  
 3   INVOICE_NBR        323412 non-null  object 
 4   ReferenceField     323412 non-null  object 
 5   INVOICE_PAID_DATE  323412 non-null  object 
 6   INVOICE_SOURCE     323412 non-null  object 
 7   INVOICE_TYPE       323412 non-null  object 
 8   INVOICE_QTY        0 non-null       float64
 9   PO_NBR             323412 non-null  object 
 10  PO_ORDER_DATE      323412 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 27.1+ MB


### <span style="color: #777777;">1.2. Transformation / Reports</span>

In [4]:
#1. There are Multiple 'INVOICE_NBR' per 'ReferenceField': 

#groups the data by 'ReferenceField', calculates the count of distinct 'INVOICE_NBR' values within each group.

ref_result = df.groupby('ReferenceField')['INVOICE_NBR'].nunique().reset_index()
ref_result = ref_result.rename(columns={'INVOICE_NBR': 'Count of Distinct INVOICE_NBR'})
ref_result = ref_result.sort_values(by='Count of Distinct INVOICE_NBR', ascending=False)

print(ref_result)

#ref_result.to_csv('Multiple_INVOICE_NBR_per_ReferenceField.csv', encoding='UTF-8-SIG')

# Count of ReferenceField with one count of Distinct INVOICE_NBR
one_count = ref_result[ref_result['Count of Distinct INVOICE_NBR'] == 1]['ReferenceField'].count()
print("Count of ReferenceField with one count of Distinct INVOICE_NBR:", one_count)

# Count of ReferenceField with more than one count of Distinct INVOICE_NBR
multiple_count = ref_result[ref_result['Count of Distinct INVOICE_NBR'] > 1]['ReferenceField'].count()
print("Count of ReferenceField with more than one count of Distinct INVOICE_NBR:", multiple_count)


          ReferenceField  Count of Distinct INVOICE_NBR
32711                  2                             38
8580                   1                             33
101377      912-673-6184                             25
97651   8535114591276833                             24
6491           063132808                             24
...                  ...                            ...
49417       235084368001                              1
49416       235070299001                              1
49415       235066666001                              1
49414       235064531001                              1
146499           Z347150                              1

[146500 rows x 2 columns]
Count of ReferenceField with one count of Distinct INVOICE_NBR: 144487
Count of ReferenceField with more than one count of Distinct INVOICE_NBR: 2013


In [53]:
# Table export gnfr Reference Field Inconsistencies

import numpy as np

# Get the list of ReferenceField values that have more than one distinct INVOICE_NBR
multiple_invoice_ref_fields = result[result['Count of Distinct INVOICE_NBR'] > 1]['ReferenceField']

# Filter the original dataframe with this list
df_multiple_invoice = df[df['ReferenceField'].isin(multiple_invoice_ref_fields)]

# Check the count of distinct ReferenceField in the filtered dataframe
print("Total of distinct exported reference fields:", df_multiple_invoice['ReferenceField'].nunique())

dtype = {
    
'Source.Name': 'str',
'ACCOUNT_CDE': 'str',
'INVOICE_NBR': 'str',
'INVOICE_LINE_NBR': 'str',
'SUPPLIER_NBR': 'str',
'INVOICE_LINE_AMOUNT': 'float',
}

df_multiple_invoice = df_multiple_invoice.astype(dtype)
df_multiple_invoice.replace('', np.nan, inplace=True)
df_multiple_invoice.to_csv('gnfr_multiple_invoice_records.csv', index=False, encoding='UTF-8-SIG')

df_shape = df_multiple_invoice.shape
print("DataFrame shape:", df_shape)


#df_multiple_invoice.to_csv('gnfr_multiple_invoice_records.csv', index=False, encoding='UTF-8-SIG')
df_multiple_invoice.to_csv('C:\\Users\\LOGICSOUERCE02\\Desktop\\gnfr_multiple_invoice_records.csv', index=False, encoding='UTF-8-SIG',na_rep='')

Total of distinct exported reference fields: 2013
DataFrame shape: (8430, 12)


In [54]:
#Test to make sure the multiple invoices are being exported correctly

# Convert the series to sets
set_multiple_invoice_ref_fields = set(multiple_invoice_ref_fields)
set_df_multiple_invoice = set(df_multiple_invoice['ReferenceField'].unique())

# Get the difference
diff_ref_fields = set_multiple_invoice_ref_fields - set_df_multiple_invoice

# Print the count of reference fields present in 'result' but not in 'df_multiple_invoice'
print("Count of reference fields present in 'result' but not in 'df_multiple_invoice':", len(diff_ref_fields))

df_multiple_invoice_dedup = df_multiple_invoice.drop_duplicates(subset='ReferenceField')

print("Total of distinct exported reference fields after de-duplication:", df_multiple_invoice_dedup['ReferenceField'].nunique())

Count of reference fields present in 'result' but not in 'df_multiple_invoice': 0
Total of distinct exported reference fields after de-duplication: 2013


In [6]:
#2. There are Multiple 'ReferenceField' per 'INVOICE_NBR'
inv_result = df.groupby('INVOICE_NBR')['ReferenceField'].nunique().reset_index()
inv_result = inv_result.rename(columns={'ReferenceField': 'Count of Distinct ReferenceField'})
inv_result = inv_result.sort_values(by='Count of Distinct ReferenceField', ascending=False)

print(inv_result.sort_values)

# Count of ReferenceField with one count of Distinct INVOICE_NBR
one_count = inv_result[inv_result['Count of Distinct ReferenceField'] == 1]['INVOICE_NBR'].count()
print("Count of INVOICE_NBR with one count of Distinct ReferenceField :", one_count)

# Count of ReferenceField with more than one count of Distinct INVOICE_NBR
multiple_count = inv_result[inv_result['Count of Distinct ReferenceField'] > 1]['INVOICE_NBR'].count()
print("Count of INVOICE_NBR with more than one count of Distinct ReferenceField:", multiple_count)

#inv_result.to_csv('Multiple_ReferenceField_per_INVOICE_NBR_.csv', encoding='UTF-8-SIG')

<bound method DataFrame.sort_values of        INVOICE_NBR  Count of Distinct ReferenceField
71399   5107203768                                 2
0       5105600111                                 1
100142  5108637035                                 1
100136  5108636743                                 1
100137  5108636745                                 1
...            ...                               ...
50071   5106757048                                 1
50072   5106757049                                 1
50073   5106757050                                 1
50074   5106757061                                 1
150210  5109999553                                 1

[150211 rows x 2 columns]>
Count of INVOICE_NBR with one count of Distinct ReferenceField : 150210
Count of INVOICE_NBR with more than one count of Distinct ReferenceField: 1


In [8]:
#gnfr update table final check and export
import numpy as np

#table export nonpo Reference Field Update List 

#filtered DataFrame with rows where a single INVOICE_NBR has more than one distinct reference field
update_data = df[df['ReferenceField'].isin(ref_result[ref_result['Count of Distinct INVOICE_NBR'] == 1]['ReferenceField'])]

#remove duplicates in 'ReferenceField' column
update_data.drop_duplicates(subset='ReferenceField', inplace=True)

update_data = update_data[update_data['INVOICE_NBR'] != '5107203768']
 
update_data = update_data[['INVOICE_NBR', 'ReferenceField']]


# Print the filtered DataFrame
print(update_data)

dtype = {
    'INVOICE_NBR': 'str',
    'ReferenceField': 'str',
}

update_data = update_data .astype(dtype)
update_data .replace('', np.nan, inplace=True)
update_data .to_csv('gnfr_invoice_update.csv', index=False, encoding='UTF-8-SIG')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  update_data.drop_duplicates(subset='ReferenceField', inplace=True)


       INVOICE_NBR ReferenceField
0       5109902481        2945886
13      5109851350        QP801HN
14      5109851364        QP800HN
15      5109851365        QP815HN
16      5109851366        QP810HN
...            ...            ...
323405  5107066802    093022-7088
323407  5107305448      202204710
323409  5107205343         323493
323410  5107372171        000301C
323411  5107242787           3360

[144486 rows x 2 columns]


### <span style="color: #777777;">2. NONPO Invoice</span>

### <span style="color: #777777;">2.1. Data Overview</span>

In [3]:
# Data shape and row count
import pandas as pd
import os

# Specify the data types for the columns
dtype = {
'Source.Name': 'str',
'ACCOUNT_CDE': 'str',
'INVOICE_NBR': 'str',
'INVOICE_LINE_NBR': 'str',
'SUPPLIER_NBR': 'str',
}

# Specify the directory where the csv files are
directory_path = 'C:\\Users\\LOGICSOUERCE02\\Downloads\\TBC RELATED\\TBC Invoice Update\\1 - NONPO\\Monthly I2P Data\\TO CONCAT'

# List to hold dataframes
dfs = []

# Iterate over the files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):  # Make sure we're working with CSV files
        file_path = os.path.join(directory_path, filename)
        df = pd.read_csv(file_path, encoding='UTF-8-SIG', encoding_errors='ignore', dtype=dtype)
        dfs.append(df)

# Concatenate all the dataframes into one
all_data = pd.concat(dfs, ignore_index=True)

print(all_data.shape)

df3 = all_data.groupby(['Source.Name']).size().reset_index(name='Row_Count')
df3.to_csv('nonpo_row_count.csv', header=True, index=False)



print(df3)


(2679583, 17)
                        Source.Name  Row_Count
0   TBC_NONPO_01012022-01312022.CSV     104888
1   TBC_NONPO_01012023-01312023.CSV      55799
2   TBC_NONPO_02012022-02282022.CSV     102315
3   TBC_NONPO_02012023-02282023.CSV      40367
4   TBC_NONPO_03012022-03312022.CSV     136918
5   TBC_NONPO_03012023-03312023.CSV      48604
6   TBC_NONPO_05012021-05312021.CSV     143361
7   TBC_NONPO_06012021-06302021.CSV     121229
8   TBC_NONPO_06012022-06302022.CSV     134072
9   TBC_NONPO_07012021-07312021.CSV     168277
10  TBC_NONPO_07012022-07312022.CSV     119392
11  TBC_NONPO_08012021-08312021.CSV     122428
12  TBC_NONPO_08012022-08312022.CSV     115422
13  TBC_NONPO_09012021-09302021.CSV     255268
14  TBC_NONPO_09012022-09302022.CSV     134228
15  TBC_NONPO_10012021-10312021.CSV     155738
16  TBC_NONPO_10012022-10312022.CSV     121357
17  TBC_NONPO_11012021-11302021.CSV     158050
18  TBC_NONPO_11012022-11302022.CSV     125537
19  TBC_NONPO_12012021-12312021.CSV     159966

In [3]:
# Row count after the filtering

all_data['SUPPLIER_NBR'] = all_data['SUPPLIER_NBR'].astype(str)
all_data['ACCOUNT_CDE'] = all_data['ACCOUNT_CDE'].astype(str)

filtered_df = all_data[
    ~(
        (all_data['SUPPLIER_NBR'].str.startswith('E')) |
        ((all_data['ACCOUNT_CDE'] == '121020') & (all_data['SUPPLIER_NBR'] == '176294'))
    )
]

filtered_row_count = filtered_df.groupby(['Source.Name']).size().reset_index(name='Row_Count')
filtered_row_count.to_csv('filtered_datase1.csv', header=True, index=False)


print(filtered_row_count)

                        Source.Name  Row_Count
0   TBC_NONPO_01012022-01312022.CSV     100787
1   TBC_NONPO_01012023-01312023.CSV      55309
2   TBC_NONPO_02012022-02282022.CSV      97400
3   TBC_NONPO_02012023-02282023.CSV      39871
4   TBC_NONPO_03012022-03312022.CSV     131456
5   TBC_NONPO_03012023-03312023.CSV      47827
6   TBC_NONPO_05012021-05312021.CSV     138719
7   TBC_NONPO_06012021-06302021.CSV     115821
8   TBC_NONPO_06012022-06302022.CSV     127503
9   TBC_NONPO_07012021-07312021.CSV     160553
10  TBC_NONPO_07012022-07312022.CSV     111132
11  TBC_NONPO_08012021-08312021.CSV     116382
12  TBC_NONPO_08012022-08312022.CSV     108649
13  TBC_NONPO_09012021-09302021.CSV     250670
14  TBC_NONPO_09012022-09302022.CSV     127045
15  TBC_NONPO_10012021-10312021.CSV     148534
16  TBC_NONPO_10012022-10312022.CSV     119957
17  TBC_NONPO_11012021-11302021.CSV     152526
18  TBC_NONPO_11012022-11302022.CSV     124107
19  TBC_NONPO_12012021-12312021.CSV     153553
20  TBC_NONPO

In [4]:
# Calculate sum of INVOICE_LINE_AMOUNT for each Source.Name
df3_amount_sum = all_data.groupby('Source.Name')['INVOICE_LINE_AMOUNT'].sum()

df3_amount_sum.to_csv('database1_amount_sum.csv', header=True, encoding='UTF-8-SIG')

print("\nSum of INVOICE_LINE_AMOUNT for each Source.Name:")
print(df3_amount_sum)



Sum of INVOICE_LINE_AMOUNT for each Source.Name:
Source.Name
TBC_NONPO_01012022-01312022.CSV    6.497143e+07
TBC_NONPO_01012023-01312023.CSV    1.278833e+08
TBC_NONPO_02012022-02282022.CSV    4.726563e+07
TBC_NONPO_02012023-02282023.CSV    1.570666e+08
TBC_NONPO_03012022-03312022.CSV    8.401463e+07
TBC_NONPO_03012023-03312023.CSV    1.711548e+08
TBC_NONPO_05012021-05312021.CSV    4.304693e+07
TBC_NONPO_06012021-06302021.CSV    1.407635e+08
TBC_NONPO_06012022-06302022.CSV    1.337452e+08
TBC_NONPO_07012021-07312021.CSV    5.987502e+07
TBC_NONPO_07012022-07312022.CSV    1.992246e+08
TBC_NONPO_08012021-08312021.CSV    3.848289e+07
TBC_NONPO_08012022-08312022.CSV    2.178124e+08
TBC_NONPO_09012021-09302021.CSV    6.804769e+07
TBC_NONPO_09012022-09302022.CSV    2.114267e+08
TBC_NONPO_10012021-10312021.CSV    6.367520e+07
TBC_NONPO_10012022-10312022.CSV    2.931100e+08
TBC_NONPO_11012021-11302021.CSV    4.962345e+07
TBC_NONPO_11012022-11302022.CSV    1.996113e+08
TBC_NONPO_12012021-1231202

In [5]:
# Calculate filtered sum of INVOICE_LINE_AMOUNT for each Source.Name
filtered_amount_sum = filtered_df.groupby('Source.Name')['INVOICE_LINE_AMOUNT'].sum()

filtered_amount_sum.to_csv('filtered_database1_amount_sum.csv', header=True, encoding='UTF-8-SIG')

print("\nSum of INVOICE_LINE_AMOUNT for each Source.Name:")
print(filtered_amount_sum)


Sum of INVOICE_LINE_AMOUNT for each Source.Name:
Source.Name
TBC_NONPO_01012022-01312022.CSV    6.449962e+07
TBC_NONPO_01012023-01312023.CSV    1.277524e+08
TBC_NONPO_02012022-02282022.CSV    4.672874e+07
TBC_NONPO_02012023-02282023.CSV    1.568240e+08
TBC_NONPO_03012022-03312022.CSV    8.341433e+07
TBC_NONPO_03012023-03312023.CSV    1.709531e+08
TBC_NONPO_05012021-05312021.CSV    4.248243e+07
TBC_NONPO_06012021-06302021.CSV    1.401176e+08
TBC_NONPO_06012022-06302022.CSV    1.331826e+08
TBC_NONPO_07012021-07312021.CSV    5.900345e+07
TBC_NONPO_07012022-07312022.CSV    1.984132e+08
TBC_NONPO_08012021-08312021.CSV    3.779398e+07
TBC_NONPO_08012022-08312022.CSV    2.170890e+08
TBC_NONPO_09012021-09302021.CSV    6.751828e+07
TBC_NONPO_09012022-09302022.CSV    2.106672e+08
TBC_NONPO_10012021-10312021.CSV    6.300804e+07
TBC_NONPO_10012022-10312022.CSV    2.926953e+08
TBC_NONPO_11012021-11302021.CSV    4.907541e+07
TBC_NONPO_11012022-11302022.CSV    1.988727e+08
TBC_NONPO_12012021-1231202

In [9]:
all_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5035947 entries, 0 to 5035946
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Source.Name        object 
 1   INVOICE_DATE       object 
 2   INVOICE_NBR        object 
 3   ReferenceField     object 
 4   INVOICE_PAID_DATE  object 
 5   INVOICE_LINE_NBR   float64
 6   INVOICE_QTY        float64
 7   INVOICE_SOURCE     object 
 8   INVOICE_TYPE       object 
 9   PO_NBR             object 
 10  PO_ORDER_DATE      float64
 11  SUPPLIER_NBR       object 
dtypes: float64(3), object(9)
memory usage: 461.1+ MB


### <span style="color: #777777;">2.2. Transformation / Reports</span>