In [1]:
#Import Libraries
import pandas as pd
import numpy as np
from tabulate import tabulate

In [2]:
xls_dispatches = pd.ExcelFile('GHANA ORIGIN REP.xlsx')
print(xls_dispatches.sheet_names)

['SUMMARY', '2025-QUALITY REP', 'FUNDING', 'SAMPA-PURCHASES', 'SAMPA-DRYING', 'SAMPA-DISPATCHES ', 'SAMPA W.H', 'NKRANKWANTA-PURCHASES', 'NKRANKWANTA-DRYING', 'NKRANKWANTA-DISPATCHES', 'NKRANKWANTA W.H', 'DROBO-PURCHASES', 'DROBO-DRYING', 'DROBO-DISPATCHES', 'DROBO W.H', 'WENCHI-PURCHASES', 'WENCHI-DRYING', 'WENCHI-DISPATCHES', 'WENCHI W.H', 'TECHIMAN-PURCHASES', 'TECHIMAN-DRYING', 'TECHIMAN-DISPATCHES', 'TECHIMAN W.H', 'SAWLA-PURCHASES', 'SAWLA-DRYING', 'SAWLA-DISPATCHES', 'SAWLA W.H ', 'LUC DISP', 'EX-TEMA', 'TOTALS', 'BL & CTNR-WISE REPORTCASHEW', 'Comparisons']


In [3]:
# Corrected list of sheets based on the actual sheet names
dispatches_sheets = [
    'SAMPA-DISPATCHES ',
    'NKRANKWANTA-DISPATCHES',
    'DROBO-DISPATCHES',
    'WENCHI-DISPATCHES',
    'TECHIMAN-DISPATCHES',
    'SAWLA-DISPATCHES'
]

# Selecting the sheets
selected_sheets = {}
for sheet in dispatches_sheets:
    try:
        selected_sheets[sheet.strip()] = pd.read_excel(xls_dispatches, sheet_name=sheet)
    except ValueError as e:
        print(f"Error loading sheet '{sheet}': {e}")

# Display the names of the successfully loaded sheets
print("Selected dispatches Sheets:")
for sheet_name in selected_sheets:
    print(sheet_name)

Selected dispatches Sheets:
SAMPA-DISPATCHES
NKRANKWANTA-DISPATCHES
DROBO-DISPATCHES
WENCHI-DISPATCHES
TECHIMAN-DISPATCHES
SAWLA-DISPATCHES


In [4]:
# Convert the drying sheets into separate DataFrames
sampa_df = pd.read_excel(xls_dispatches, sheet_name='SAMPA-DISPATCHES ', header=2)
nkrankwanta_df = pd.read_excel(xls_dispatches, sheet_name='NKRANKWANTA-DISPATCHES', header=2)
drobo_df = pd.read_excel(xls_dispatches, sheet_name='DROBO-DISPATCHES', header=2)
wenchi_df = pd.read_excel(xls_dispatches, sheet_name='WENCHI-DISPATCHES', header=2)
techiman_df = pd.read_excel(xls_dispatches, sheet_name='TECHIMAN-DISPATCHES', header=2)
sawla_df = pd.read_excel(xls_dispatches, sheet_name='SAWLA-DISPATCHES', header=2)

# Display the shape of each DataFrame for verification
print(f"Sampa DataFrame shape: {sampa_df.shape}")
print(f"Nkrankwanta DataFrame shape: {nkrankwanta_df.shape}")
print(f"Drobo DataFrame shape: {drobo_df.shape}")
print(f"Wenchi DataFrame shape: {wenchi_df.shape}")
print(f"Techiman DataFrame shape: {techiman_df.shape}")
print(f"Sawla DataFrame shape: {sawla_df.shape}")

Sampa DataFrame shape: (258, 30)
Nkrankwanta DataFrame shape: (258, 30)
Drobo DataFrame shape: (258, 30)
Wenchi DataFrame shape: (257, 25)
Techiman DataFrame shape: (257, 29)
Sawla DataFrame shape: (257, 28)


In [5]:
# Adding the new column 'Trc/Organic' to all DataFrames
sampa_df['Trc/Organic'] = None  # Replace None with the appropriate default value or logic
nkrankwanta_df['Trc/Organic'] = None
drobo_df['Trc/Organic'] = None
sawla_df['Trc/Organic'] = None
wenchi_df.rename(columns={'TRC/ORGANIC': 'Trc/Organic'}, inplace=True)
sawla_df.rename(columns={'Status': 'STATUS'}, inplace=True)
# Confirm the new column has been added
#print(sampa_df.head())
#print(nkrankwanta_df.head())
#print(drobo_df.head())
#print(sawla_df.head())


In [6]:
# Check for rows where DATE or DATE.1 might have invalid date strings
for df in [sampa_df, nkrankwanta_df, drobo_df, wenchi_df, techiman_df, sawla_df]:
    invalid_date_rows = df[df['Date'].apply(pd.to_datetime, errors='coerce').isna()]
    #print(invalid_date_rows[['DATE', 'DATE.1']])
    # Ensure all dates are in the expected format (e.g., 'YYYY-MM-DD')
    df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x, errors='coerce') if isinstance(x, str) else x)
    df['Date.1'] = df['Date.1'].apply(lambda x: pd.to_datetime(x, errors='coerce') if isinstance(x, str) else x)
    df['Date.2'] = df['Date.2'].apply(lambda x: pd.to_datetime(x, errors='coerce') if isinstance(x, str) else x)
    print(df[['Date', 'Date.1', 'Date.2']].head(1))

   Date  Date.1  Date.2
0   NaN     NaN     0.0
        Date     Date.1  Date.2
0 2025-01-31 2025-02-03     0.0
        Date  Date.1  Date.2
0 2025-02-03     NaN     0.0
        Date     Date.1  Date.2
0 2025-01-22 2025-01-27     0.0
        Date     Date.1  Date.2
0 2025-01-22 2025-01-27     NaN
   Date  Date.1  Date.2
0   NaN     NaN     0.0


  invalid_date_rows = df[df['Date'].apply(pd.to_datetime, errors='coerce').isna()]
  df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x, errors='coerce') if isinstance(x, str) else x)


In [7]:
# Removing rows with NA in the 'Date' column for all DataFrames
sampa_df = sampa_df.dropna(subset=['Date'])
nkrankwanta_df = nkrankwanta_df.dropna(subset=['Date'])
drobo_df = drobo_df.dropna(subset=['Date'])
sawla_df = sawla_df.dropna(subset=['Date'])
techiman_df = techiman_df.dropna(subset=['Date'])
wenchi_df = wenchi_df.dropna(subset=['Date'])

# Display the updated DataFrames to verify the changes
print("Rows after removing NA based on 'Date':")
print("Sampa:", len(sampa_df))
print("Nkrankwanta:", len(nkrankwanta_df))
print("Drobo:", len(drobo_df))
print("Sawla:", len(sawla_df))
print("Techiman:", len(techiman_df))

print("Wenchi:", len(wenchi_df) if 'wenchi_df' in locals() else "Not included")


Rows after removing NA based on 'Date':
Sampa: 0
Nkrankwanta: 1
Drobo: 1
Sawla: 0
Techiman: 4
Wenchi: 3


In [8]:
# Add Station column for each DataFrame
sampa_df['Area'] = 'Sampa'
nkrankwanta_df['Area'] = 'Nkrankwanta'
drobo_df['Area'] = 'Drobo'
wenchi_df['Area'] = 'Wenchi'
techiman_df['Area'] = 'Techiman'
sawla_df['Area'] = 'Sawla'

# Get the column names from each DataFrame
sampa_columns = set(sampa_df.columns)
nkrankwanta_columns = set(nkrankwanta_df.columns)
drobo_columns = set(drobo_df.columns)
wenchi_columns = set(wenchi_df.columns)
techiman_columns = set(techiman_df.columns)
sawla_columns = set(sawla_df.columns)

# Find the intersection (common columns) across all the DataFrames
common_columns = sampa_columns.intersection(nkrankwanta_columns, drobo_columns, wenchi_columns, techiman_columns, sawla_columns)

# Convert common_columns to a list
common_columns_list = list(common_columns)


In [9]:
# Desired column order
column_order = [
    'STATUS', 'Area', 'Date', 'Waybill #', 'Truck #', '# of bags', 'Net weight', 
    'Nut Count', 'Moisture', 'Kor', 'Date.1', '# of bags.1', 'Net weight.1', 
    'Weight loss/gain.1', '% loss/gain.1', 'Nut count.1', 'Moisture.1', 
    'Kor.1', 'Date.2', '# of bags.2', 'Net weight.2', 'Weight loss/gain.2', 
    'NC.2', 'Moisture.2', 'Kor.2', 'Trc/Organic'
]

# Ensure all DataFrames have the same columns and order them
dfs = [techiman_df, sawla_df, sampa_df, wenchi_df, drobo_df, nkrankwanta_df]
dfs_ordered = [df.reindex(columns=column_order) for df in dfs]

# Concatenate all DataFrames
combined_df = pd.concat(dfs_ordered, ignore_index=True)

# Sort the DataFrame by the 'Date' column in ascending order (oldest to newest)
combined_df = combined_df.sort_values(by='Date', ascending=True)

# Display the resulting DataFrame
print("Combined DataFramer:")
print(combined_df.shape)
combined_df.head(3)

# Save the combined DataFrame to an Excel file
output_file = 'Ghana RCN Dispatches.xlsx'

# Save to Excel, specifying the sheet name and index preference
combined_df.to_excel(output_file, index=False, sheet_name='Combined Dispatches')

print(f"Dispatches DataFrame has been saved to {output_file}")


Combined DataFramer:
(9, 26)


  combined_df = pd.concat(dfs_ordered, ignore_index=True)


Dispatches DataFrame has been saved to Ghana RCN Dispatches.xlsx


In [10]:
common_columns

{'# of bags',
 '# of bags.1',
 '# of bags.2',
 '% loss/gain',
 'Area',
 'Date',
 'Date.1',
 'Date.2',
 'Kor',
 'Kor.1',
 'Kor.2',
 'Moisture',
 'Moisture.1',
 'Moisture.2',
 'NC',
 'Net weight',
 'Net weight.1',
 'Net weight.2',
 'Nut Count',
 'Nut count',
 'STATUS',
 'Trc/Organic',
 'Truck #',
 'Waybill #',
 'Weight loss/gain',
 'Weight loss/gain.1'}