### Project and Department Master

In [None]:
import pandas as pd

# Column -> dtype maps for the two yearly masterfiles; every listed column is
# read as a string.
dtype_2024 = dict.fromkeys(
    [
        'PROJECT _ Department',
        'PROJECT _ Project Without Year',
        'Project - Great Plains Description',
        'PROJECT _ Project Type',
        'PROJECT _ Corporate Project Name',
    ],
    'str',
)

dtype_2023 = dict.fromkeys(
    [
        'PROJECT _ Department',
        'PROJECT _ Sub Department',
        'PROJECT _ Department - Lowest Level',
        'Project - Great Plains Description',
        'PROJECT _ Project Type',
        'PROJECT _ Corporate Project Name',
        'PROJECT _ Project Without Year',
    ],
    'str',
)

# 2024 masterfile: one sheet of the Excel workbook, skipping the 4 rows that
# precede the header row.
pro_dep_master_2024 = pd.read_excel(
    'Procurement Project & Org Data.xlsx',
    sheet_name='Projects & Department2',
    skiprows=4,
    dtype=dtype_2024,
)

# 2023 masterfile: CSV decoded as UTF-8-SIG, skipping the first row.
pro_dep_master_2023 = pd.read_csv(
    'Project and Department Master.csv',
    skiprows=1,
    encoding='UTF-8-SIG',
    dtype=dtype_2023,
)

# Record and report the (rows, columns) shape of each frame.
pro_dep_master_shape_2024 = pro_dep_master_2024.shape
pro_dep_master_shape_2023 = pro_dep_master_2023.shape
print(f"Project Department Masterfile 2024 Shape: {pro_dep_master_shape_2024}")
print(f"Project Department Masterfile 2023 Shape: {pro_dep_master_shape_2023}")


Project Department Masterfile 2024 Shape: (1570, 5)
Project Department Masterfile 2023 Shape: (1949, 7)


In [None]:
# Print both column lists so the names can be compared by eye
# (nothing is asserted here).
for label, frame in (
    ("2024 Columns:", pro_dep_master_2024),
    ("2023 Columns:", pro_dep_master_2023),
):
    print(label, frame.columns.tolist())


2024 Columns: ['PROJECT _ Department', 'PROJECT _ Project Without Year', 'Project - Great Plains Description', 'PROJECT _ Project Type', 'PROJECT _ Corporate Project Name']
2023 Columns: ['PROJECT _ Department', 'PROJECT _ Sub Department', 'PROJECT _ Department - Lowest Level', 'Project - Great Plains Description', 'PROJECT _ Project Type', 'PROJECT _ Corporate Project Name', 'PROJECT _ Project Without Year']


In [None]:
# Columns present in both yearly masterfiles. This list is informational only:
# the join below uses a single key column, not all common columns.
common_columns = pro_dep_master_2024.columns.intersection(pro_dep_master_2023.columns).tolist()
print(f"Common columns: {common_columns}")

# Inner-join 2023 (left) with 2024 (right) on the project key. Non-key columns
# that exist in both frames receive pandas' default '_x' / '_y' suffixes.
matched_entries = pd.merge(pro_dep_master_2023, pro_dep_master_2024, on='PROJECT _ Project Without Year', how='inner')

# Count the 2023 rows whose key appears in the 2024 file. The row count of the
# joined frame is not used for this: a key repeated in the 2024 file would
# produce one joined row per repeat and inflate the figure, whereas a
# membership test counts each 2023 row at most once.
num_matched_entries = int(
    pro_dep_master_2023['PROJECT _ Project Without Year']
    .isin(pro_dep_master_2024['PROJECT _ Project Without Year'])
    .sum()
)
print(f"Number of entries from 2023 report that exist in 2024 report: {num_matched_entries}")

Common columns: ['PROJECT _ Department', 'PROJECT _ Project Without Year', 'Project - Great Plains Description', 'PROJECT _ Project Type', 'PROJECT _ Corporate Project Name']
Number of entries from 2023 report that exist in 2024 report: 1360


In [None]:
# Rows found only in the 2023 file: left-join 2023 onto 2024 with the merge
# indicator switched on, then keep the rows flagged 'left_only'.
exclusive_to_2023 = (
    pro_dep_master_2023
    .merge(
        pro_dep_master_2024,
        how='left',
        on='PROJECT _ Project Without Year',
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Rows found only in the 2024 file: same procedure with the frames swapped.
exclusive_to_2024 = (
    pro_dep_master_2024
    .merge(
        pro_dep_master_2023,
        how='left',
        on='PROJECT _ Project Without Year',
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Report how many rows each year has that the other year lacks.
num_exclusive_2023 = len(exclusive_to_2023)
print(f"Number of entries exclusive to the 2023 report: {num_exclusive_2023}")

num_exclusive_2024 = len(exclusive_to_2024)
print(f"Number of entries exclusive to the 2024 report: {num_exclusive_2024}")

Number of entries exclusive to the 2023 report: 589
Number of entries exclusive to the 2024 report: 210


In [None]:
# Inner-join 2023 and 2024 on 'PROJECT _ Project Without Year' (pandas' default
# join type). Columns present in both frames are suffixed '_2023' / '_2024'.
matching_entries = pd.merge(pro_dep_master_2023, pro_dep_master_2024, 
                            on='PROJECT _ Project Without Year', 
                            suffixes=('_2023', '_2024'))

# Add boolean flag columns telling whether 'PROJECT _ Project Type' and
# 'Project - Great Plains Description' are unchanged between the two years.
# A plain `==` evaluates to False when the value is missing on both sides
# (NaN never equals NaN), which would report an untouched blank field as a
# difference. Rows where both years are missing are therefore treated as equal.
matching_entries['project_type_same'] = (
    matching_entries['PROJECT _ Project Type_2023'].eq(matching_entries['PROJECT _ Project Type_2024'])
    | (
        matching_entries['PROJECT _ Project Type_2023'].isna()
        & matching_entries['PROJECT _ Project Type_2024'].isna()
    )
)
matching_entries['project_description_same'] = (
    matching_entries['Project - Great Plains Description_2023'].eq(matching_entries['Project - Great Plains Description_2024'])
    | (
        matching_entries['Project - Great Plains Description_2023'].isna()
        & matching_entries['Project - Great Plains Description_2024'].isna()
    )
)

# Count the number of matching and different entries for 'PROJECT _ Project Type'
project_type_same_count = matching_entries['project_type_same'].sum()
project_type_different_count = (~matching_entries['project_type_same']).sum()

# Count the number of matching and different entries for 'Project - Great Plains Description'
project_description_same_count = matching_entries['project_description_same'].sum()
project_description_different_count = (~matching_entries['project_description_same']).sum()

print(f"Number of 'PROJECT _ Project Type' entries the same: {project_type_same_count}")
print(f"Number of 'PROJECT _ Project Type' entries different: {project_type_different_count}")
print(f"Number of 'Project - Great Plains Description' entries the same: {project_description_same_count}")
print(f"Number of 'Project - Great Plains Description' entries different: {project_description_different_count}")




Number of 'PROJECT _ Project Type' entries the same: 1358
Number of 'PROJECT _ Project Type' entries different: 2
Number of 'Project - Great Plains Description' entries the same: 1322
Number of 'Project - Great Plains Description' entries different: 38


In [None]:
# Write the year-over-year comparison frame to CSV, encoded as UTF-8-SIG and
# without the DataFrame index column.
matching_entries.to_csv(
    'project_depatment_matching_entries.csv',
    index=False,
    encoding='UTF-8-SIG',
)

### Org Unit Master

In [None]:
import pandas as pd

# Column -> dtype map shared by both org-unit masterfiles; every listed column
# is read as a string.
dtype = dict.fromkeys(
    [
        'Org Unit - Code',
        'Org Unit - Description',
        'ORG UNIT _ Business Unit Rollup _ Description',
        'ORG UNIT _ Region Rollup _ Description',
    ],
    'str',
)

# 2024 org units: one sheet of the Excel workbook, skipping the first 3 rows.
org_unit_master_2024 = pd.read_excel(
    'Procurement Project & Org Data.xlsx',
    sheet_name='Org Units',
    skiprows=3,
    dtype=dtype,
)

# 2023 org units: CSV decoded as UTF-8-SIG, skipping the first row.
org_unit_master_2023 = pd.read_csv(
    'Org Unit Master.csv',
    encoding='UTF-8-SIG',
    skiprows=1,
    dtype=dtype,
)

# Record and report the (rows, columns) shape of each year's frame.
org_unt_master_shape24 = org_unit_master_2024.shape
org_unt_master_shape23 = org_unit_master_2023.shape
print(f"Org Unit Masterfile Shape 2024: {org_unt_master_shape24}")
print(f"Org Unit Masterfile Shape 2023: {org_unt_master_shape23}")

Org Unit Masterfile Shape 2024: (202, 4)
Org Unit Masterfile Shape 2023: (182, 4)


In [None]:
# Print both column lists so the names can be compared by eye
# (nothing is asserted here).
for label, frame in (
    ("2024 Columns:", org_unit_master_2024),
    ("2023 Columns:", org_unit_master_2023),
):
    print(label, frame.columns.tolist())

2024 Columns: ['Org Unit - Code', 'Org Unit - Description', 'ORG UNIT _ Business Unit Rollup _ Description', 'ORG UNIT _ Region Rollup _ Description']
2023 Columns: ['ORG UNIT _ Region Rollup _ Description', 'ORG UNIT _ Business Unit Rollup _ Description', 'Org Unit - Description', 'Org Unit - Code']


In [None]:
# Columns present in both yearly org-unit files. This list is informational
# only: the join below uses a single key column, not all common columns.
common_columns = org_unit_master_2024.columns.intersection(org_unit_master_2023.columns).tolist()
print(f"Common columns: {common_columns}")

# Inner-join 2023 (left) with 2024 (right) on the org-unit code. Non-key
# columns that exist in both frames receive pandas' default '_x' / '_y' suffixes.
matched_entries = pd.merge(org_unit_master_2023, org_unit_master_2024, on='Org Unit - Code', how='inner')

# Count the 2023 rows whose code appears in the 2024 file. The row count of the
# joined frame is not used for this: a code repeated in the 2024 file would
# produce one joined row per repeat and inflate the figure, whereas a
# membership test counts each 2023 row at most once.
num_matched_entries = int(
    org_unit_master_2023['Org Unit - Code']
    .isin(org_unit_master_2024['Org Unit - Code'])
    .sum()
)
print(f"Number of entries from 2023 report that exist in 2024 report: {num_matched_entries}")

Common columns: ['Org Unit - Code', 'Org Unit - Description', 'ORG UNIT _ Business Unit Rollup _ Description', 'ORG UNIT _ Region Rollup _ Description']
Number of entries from 2023 report that exist in 2024 report: 182


In [None]:
# Rows found only in the 2023 file: left-join 2023 onto 2024 with the merge
# indicator switched on, then keep the rows flagged 'left_only'.
exclusive_to_2023 = (
    org_unit_master_2023
    .merge(
        org_unit_master_2024,
        how='left',
        on='Org Unit - Code',
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Rows found only in the 2024 file: same procedure with the frames swapped.
exclusive_to_2024 = (
    org_unit_master_2024
    .merge(
        org_unit_master_2023,
        how='left',
        on='Org Unit - Code',
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Report how many rows each year has that the other year lacks.
num_exclusive_2023 = len(exclusive_to_2023)
print(f"Number of entries exclusive to the 2023 report: {num_exclusive_2023}")

num_exclusive_2024 = len(exclusive_to_2024)
print(f"Number of entries exclusive to the 2024 report: {num_exclusive_2024}")

Number of entries exclusive to the 2023 report: 0
Number of entries exclusive to the 2024 report: 20


In [None]:
# Inner-join 2023 and 2024 on 'Org Unit - Code' (pandas' default join type).
# Columns present in both frames are suffixed '_2023' / '_2024'.
matching_entries = pd.merge(org_unit_master_2023, org_unit_master_2024, 
                            on='Org Unit - Code', 
                            suffixes=('_2023', '_2024'))

# Add boolean flag columns telling whether the Business Unit Rollup and Region
# Rollup descriptions are unchanged between the two years.
# A plain `==` evaluates to False when the value is missing on both sides
# (NaN never equals NaN), which would report an untouched blank field as a
# difference. Rows where both years are missing are therefore treated as equal.
matching_entries['bu_rollup_same'] = (
    matching_entries['ORG UNIT _ Business Unit Rollup _ Description_2023'].eq(matching_entries['ORG UNIT _ Business Unit Rollup _ Description_2024'])
    | (
        matching_entries['ORG UNIT _ Business Unit Rollup _ Description_2023'].isna()
        & matching_entries['ORG UNIT _ Business Unit Rollup _ Description_2024'].isna()
    )
)
matching_entries['region_rollup_same'] = (
    matching_entries['ORG UNIT _ Region Rollup _ Description_2023'].eq(matching_entries['ORG UNIT _ Region Rollup _ Description_2024'])
    | (
        matching_entries['ORG UNIT _ Region Rollup _ Description_2023'].isna()
        & matching_entries['ORG UNIT _ Region Rollup _ Description_2024'].isna()
    )
)

# Count the number of matching and different entries for the Business Unit Rollup
bu_same_count = matching_entries['bu_rollup_same'].sum()
bu_different_count = (~matching_entries['bu_rollup_same']).sum()

# Count the number of matching and different entries for the Region Rollup
region_same_count = matching_entries['region_rollup_same'].sum()
region_different_count = (~matching_entries['region_rollup_same']).sum()

print(f"Number of Business Unit Rollup entries the same: {bu_same_count}")
print(f"Number of Business Unit Rollup entries different: {bu_different_count}")
print(f"Number of Region Rollup entries the same: {region_same_count}")
print(f"Number of Region Rollup entries different: {region_different_count}")

Number of Business Unit Rollup entries the same: 174
Number of Business Unit Rollup entries different: 8
Number of Region Rollup entries the same: 182
Number of Region Rollup entries different: 0


In [None]:
# Write the year-over-year comparison frame to CSV, encoded as UTF-8-SIG and
# without the DataFrame index column.
# NOTE(review): every row of matching_entries is written, not only rows whose
# flags are False, although the file name suggests "different" — confirm intent.
matching_entries.to_csv(
    'org_unit_diferent_23-24.csv',
    index=False,
    encoding='UTF-8-SIG',
)