### Task: Load workbook and display sheet names
Description: This code loads an Excel workbook using pandas and displays all the sheet names.

In [1]:
#Import Libraries
import pandas as pd

In [2]:
# Open the source workbook; the ExcelFile handle exposes its worksheet names
xls = pd.ExcelFile('GHANA ORIGIN REP.xlsx')
# Print the list of worksheet names found in the workbook
available_sheets = xls.sheet_names
print(available_sheets)

['SUMMARY', '2025-QUALITY REP', 'FUNDING', 'SAMPA-PURCHASES', 'SAMPA-DRYING', 'SAMPA-DISPATCHES ', 'SAMPA W.H', 'NKRANKWANTA-PURCHASES', 'NKRANKWANTA-DRYING', 'NKRANKWANTA-DISPATCHES', 'NKRANKWANTA W.H', 'DROBO-PURCHASES', 'DROBO-DRYING', 'DROBO-DISPATCHES', 'DROBO W.H', 'WENCHI-PURCHASES', 'WENCHI-DRYING', 'WENCHI-DISPATCHES', 'WENCHI W.H', 'TECHIMAN-PURCHASES', 'TECHIMAN-DRYING', 'TECHIMAN-DISPATCHES', 'TECHIMAN W.H', 'SAWLA-PURCHASES', 'SAWLA-DRYING', 'SAWLA-DISPATCHES', 'SAWLA W.H ', 'LUC DISP', 'EX-TEMA', 'TOTALS', 'Comparisons']


In [3]:
# ========================================
# Project: Workbook Processing
# Task: Select all Purchases sheets using pandas
# Description: This code selects all sheets related to "Purchases"
# from the loaded workbook using pandas and stores them in a dictionary.
# ========================================


In [4]:
# Re-open the workbook so this cell does not depend on an earlier handle
xls = pd.ExcelFile('GHANA ORIGIN REP.xlsx')

# Worksheets that hold purchase records
purchases_sheets = [
    'SAMPA-PURCHASES',
    'NKRANKWANTA-PURCHASES',
    'DROBO-PURCHASES',
    'WENCHI-PURCHASES',
    'TECHIMAN-PURCHASES',
    'SAWLA-PURCHASES'
]

# Read every purchases worksheet into a DataFrame, keyed by worksheet name
selected_sheets = {}
for sheet_name in purchases_sheets:
    selected_sheets[sheet_name] = pd.read_excel(xls, sheet_name)

# Report which worksheets were loaded, one name per line
print("Selected Purchases Sheets:")
print(*selected_sheets, sep="\n")


Selected Purchases Sheets:
SAMPA-PURCHASES
NKRANKWANTA-PURCHASES
DROBO-PURCHASES
WENCHI-PURCHASES
TECHIMAN-PURCHASES
SAWLA-PURCHASES


### Task: Process Purchases sheets into separate DataFrames with specified columns
Description: This code processes each "Purchases" sheet by reading the column headers from the second row (skipping the first row), keeping only the specified columns, and storing each sheet in its own DataFrame named after the station (the sheet name without "-PURCHASES", in lower case). The "Station" column is added in a later step.


In [5]:
# Columns to retain from every Purchases sheet
columns_to_keep = [
    'Date', 'Name', 'Grn #', 'KOR', 'Moisture %', '# of Bags', 'Cum bags',
    'wgt/bag', 'Net wgt', 'Cum wgt', 'Rate', 'Value', 'Cum value',
    'Cheque No.', 'Receipt No.', 'Funding', 'SAP PO ID', 'Tra/Org', 'Wk',
    'Month'
]

# Alternative header spellings mapped onto the canonical 'Tra/Org' name.
# 'Trc/Organic' is the spelling the original code handled; 'Tra/Organic' is the
# spelling its comment described, so both are normalised here.
column_aliases = {
    'Trc/Organic': 'Tra/Org',
    'Tra/Organic': 'Tra/Org',
}

# Load the workbook
xls = pd.ExcelFile('GHANA ORIGIN REP.xlsx')

# List of all sheets related to "Purchases"
purchases_sheets = [
    'SAMPA-PURCHASES',
    'NKRANKWANTA-PURCHASES',
    'DROBO-PURCHASES',
    'WENCHI-PURCHASES',
    'TECHIMAN-PURCHASES',
    'SAWLA-PURCHASES'
]

# Process each sheet
for sheet in purchases_sheets:
    # header=1: take the column names from the second row, skipping the first
    df = pd.read_excel(xls, sheet_name=sheet, header=1)

    # Normalise known header variations to the canonical column name
    for alias, canonical in column_aliases.items():
        df.columns = df.columns.str.replace(alias, canonical, regex=False)

    # Keep only the requested columns that this sheet actually provides,
    # and report the ones it lacks instead of dropping them silently
    existing_columns = [col for col in columns_to_keep if col in df.columns]
    missing_columns = [col for col in columns_to_keep if col not in df.columns]
    if missing_columns:
        print(f"{sheet}: missing columns {missing_columns}")

    # .copy() gives an independent frame, so later cells can add columns or
    # drop rows in place without pandas' SettingWithCopyWarning
    df = df[existing_columns].copy()

    # Expose the frame as a global named '<station>_df' (e.g. 'sampa_df');
    # later cells look the frames up by these names
    globals()[f'{sheet.replace("-PURCHASES", "").lower()}_df'] = df

### Add the Organic / Traceability column for Sawla and list the created DataFrames

In [6]:
# Give sawla_df a 'Tra/Org' column so it has the same columns as the other
# station frames. Only add it when it is absent: assigning unconditionally
# would overwrite any values already loaded from the sheet.
if 'Tra/Org' not in sawla_df.columns:
    sawla_df['Tra/Org'] = None  # Or set a specific value if needed

# List every global variable whose name ends in '_df'.
# NOTE: this matches on the name only, so any other variable ending in '_df'
# that exists when the cell runs is listed as well.
new_dfs = [var for var in globals() if var.endswith('_df')]
print(new_dfs)

# Show the column layout of one station frame for reference
techiman_df.columns

['sampa_df', 'nkrankwanta_df', 'drobo_df', 'wenchi_df', 'techiman_df', 'sawla_df']


Index(['Date', 'Name', 'Grn #', 'KOR', 'Moisture %', '# of Bags', 'Cum bags',
       'wgt/bag', 'Net wgt', 'Cum wgt', 'Rate', 'Value', 'Cum value',
       'Cheque No.', 'Receipt No.', 'Funding', 'SAP PO ID', 'Tra/Org', 'Wk',
       'Month'],
      dtype='object')

### Remove rows with a missing Date or Month

In [7]:
# Names of the per-station DataFrames (looked up as global variables below)
dfs = ['sampa_df', 'nkrankwanta_df', 'drobo_df', 'wenchi_df', 'techiman_df', 'sawla_df']

# A row is kept only when every one of these columns is populated
date_columns = ['Date', 'Month']

# Drop incomplete rows from each frame in place, so the existing global
# variables keep referring to the cleaned data
for df_name in dfs:
    df = globals().get(df_name)

    if df is None:
        # No global with this name exists; nothing to clean
        continue

    df.dropna(subset=date_columns, how='any', inplace=True)


### Add station columns to each df

In [8]:
# Station label to record for the rows of each purchases worksheet
station_mapping = {
    'SAMPA-PURCHASES': 'Sampa',
    'NKRANKWANTA-PURCHASES': 'Nkrankwanta',
    'DROBO-PURCHASES': 'Drobo',
    'WENCHI-PURCHASES': 'Wenchi',
    'TECHIMAN-PURCHASES': 'Techiman',
    'SAWLA-PURCHASES': 'Sawla'
}

# Stamp every station frame with its station name
for sheet, station in station_mapping.items():
    # Global variable name for this worksheet, e.g. 'sampa_df'
    frame_key = f'{sheet.replace("-PURCHASES", "").lower()}_df'
    df = globals().get(frame_key)

    if df is None:
        # No frame was created under this name; skip it
        continue

    # A scalar assignment fills the whole column with the same value
    df['Station'] = station


### DF SHAPES

In [9]:
# Report the (rows, columns) shape of every station frame that exists
for df_name in dfs:
    df = globals().get(df_name)

    if df is None:
        # Name not defined as a global; nothing to report
        continue

    print(f"Shape of {df_name}: {df.shape}")

Shape of sampa_df: (1, 21)
Shape of nkrankwanta_df: (0, 21)
Shape of drobo_df: (13, 21)
Shape of wenchi_df: (20, 21)
Shape of techiman_df: (26, 21)
Shape of sawla_df: (0, 21)


In [12]:
# Preview the first five rows of the Drobo frame
drobo_df.head(n=5)

Unnamed: 0,Date,Name,Grn #,KOR,Moisture %,# of Bags,Cum bags,wgt/bag,Net wgt,Cum wgt,...,Value,Cum value,Cheque No.,Receipt No.,Funding,SAP PO ID,Tra/Org,Wk,Month,Station
0,2025-01-15,Gabriel Kwabena Fosu,51.0,50.78,12.1,11.0,11,86.181818,948.0,948,...,20856,20856,352.0,101.0,C&C,,,3.0,Jan,Drobo
1,2025-01-16,Gabriel Kwabena Fosu,52.0,52.09,11.9,19.0,30,84.736842,1610.0,2558,...,35420,56276,353.0,102.0,C&C,,,3.0,Jan,Drobo
2,2025-01-22,Gabriel Kwabena Fosu,53.0,50.16,11.7,7.0,37,90.428571,633.0,3191,...,13926,70202,355.0,104.0,PF,,,4.0,Jan,Drobo
3,2025-01-23,Evans Kwabena Ankamah,54.0,48.75,13.8,107.0,144,92.934579,9944.0,13135,...,218768,288970,356.0,105.0,PF,,,4.0,Jan,Drobo
4,2025-01-24,Musah Seidu,3541.0,51.39,13.9,8.0,152,85.875,687.0,13822,...,15114,304084,354.0,103.0,PF,,,4.0,Jan,Drobo


In [13]:
# Names of the per-station DataFrames to combine (looked up as globals)
dfs = ['sampa_df', 'nkrankwanta_df', 'drobo_df', 'wenchi_df', 'techiman_df', 'sawla_df']

# Collect the frames that exist
df_list = []
for df_name in dfs:
    df = globals().get(df_name)

    if df is not None:
        df_list.append(df)

# Leave zero-row frames out of the concatenation: they contribute no rows, and
# pandas warns (FutureWarning) that empty entries will start to influence the
# result dtypes in a future version. If every frame is empty, fall back to the
# full list so the combined frame still carries the column headers.
frames_to_concat = [frame for frame in df_list if not frame.empty]

# Concatenate the frames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(frames_to_concat or df_list, ignore_index=True)

# Display the shape of the combined DataFrame
print(f"Shape of the combined DataFrame: {combined_df.shape}")

# Save the combined DataFrame to an Excel file (without the index column)
combined_df.to_excel('combined_data.xlsx', index=False)

# Confirm that the file was saved
print("Combined DataFrame has been saved as 'combined_data.xlsx'")
print(combined_df.shape)
combined_df.head()

Shape of the combined DataFrame: (60, 21)
Combined DataFrame has been saved as 'combined_data.xlsx'
(60, 21)


Unnamed: 0,Date,Name,Grn #,KOR,Moisture %,# of Bags,Cum bags,wgt/bag,Net wgt,Cum wgt,...,Value,Cum value,Cheque No.,Receipt No.,Funding,SAP PO ID,Tra/Org,Wk,Month,Station
0,2025-01-30 00:00:00,Peter Kofi Sah,701.0,,,65.0,65.0,96.615385,6280.0,6280.0,...,141300.0,141300.0,456.0,3151.0,PF,,,4.0,Jan,Sampa
1,2025-01-15 00:00:00,Gabriel Kwabena Fosu,51.0,50.78,12.1,11.0,11.0,86.181818,948.0,948.0,...,20856.0,20856.0,352.0,101.0,C&C,,,3.0,Jan,Drobo
2,2025-01-16 00:00:00,Gabriel Kwabena Fosu,52.0,52.09,11.9,19.0,30.0,84.736842,1610.0,2558.0,...,35420.0,56276.0,353.0,102.0,C&C,,,3.0,Jan,Drobo
3,2025-01-22 00:00:00,Gabriel Kwabena Fosu,53.0,50.16,11.7,7.0,37.0,90.428571,633.0,3191.0,...,13926.0,70202.0,355.0,104.0,PF,,,4.0,Jan,Drobo
4,2025-01-23 00:00:00,Evans Kwabena Ankamah,54.0,48.75,13.8,107.0,144.0,92.934579,9944.0,13135.0,...,218768.0,288970.0,356.0,105.0,PF,,,4.0,Jan,Drobo


In [14]:
# Parse the 'Date' column into datetime values; with errors='coerce', entries
# that cannot be parsed become NaT instead of raising
parsed_dates = pd.to_datetime(combined_df['Date'], errors='coerce')
combined_df['Date'] = parsed_dates

# Print the first rows to check the conversion
print(combined_df.head())


        Date                   Name  Grn #    KOR  Moisture %  # of Bags  \
0 2025-01-30         Peter Kofi Sah  701.0    NaN         NaN       65.0   
1 2025-01-15   Gabriel Kwabena Fosu   51.0  50.78        12.1       11.0   
2 2025-01-16   Gabriel Kwabena Fosu   52.0  52.09        11.9       19.0   
3 2025-01-22   Gabriel Kwabena Fosu   53.0  50.16        11.7        7.0   
4 2025-01-23  Evans Kwabena Ankamah   54.0  48.75        13.8      107.0   

   Cum bags    wgt/bag  Net wgt  Cum wgt  ...     Value  Cum value  \
0      65.0  96.615385   6280.0   6280.0  ...  141300.0   141300.0   
1      11.0  86.181818    948.0    948.0  ...   20856.0    20856.0   
2      30.0  84.736842   1610.0   2558.0  ...   35420.0    56276.0   
3      37.0  90.428571    633.0   3191.0  ...   13926.0    70202.0   
4     144.0  92.934579   9944.0  13135.0  ...  218768.0   288970.0   

   Cheque No. Receipt No. Funding SAP PO ID  Tra/Org   Wk  Month Station  
0       456.0      3151.0      PF       NaN    

In [16]:
# Write combined_df to 'combined_data.xlsx' again (the same file as the
# earlier export), leaving out the index column
combined_df.to_excel(excel_writer='combined_data.xlsx', index=False)