In [1]:
import pandas as pd
import re

# NOTE: before running this notebook, manually fix the following three
# addresses in the original file:
#   "140 William Street  Perth WA "
#   "Building B, Level 2 140 Royal Street East Perth WA 60004"
#   "5 Newman Court Fremantle WA 6004"

# Load the tenders workbook into a DataFrame
df = pd.read_excel(io='UpdatedAgainTenders.xlsx')

# Load the postcode workbook into a DataFrame
postcode_df = pd.read_excel(io='PostCodes.xlsx')

In [2]:
# Store 'Geo Postcode' as strings; the cast is expressed as a column -> dtype mapping
postcode_df = postcode_df.astype({'Geo Postcode': str})

In [3]:
def extract_postcode(address):
    """Return the postcode found in ``address``, or ``None`` if there is none.

    The postcode is taken to be the LAST standalone run of exactly four
    digits in the text. Taking the last match (rather than the first) keeps
    a four-digit street, PO Box or Locked Bag number that appears earlier in
    the address from being mistaken for the postcode.
    NOTE(review): assumes the postcode is the final 4-digit token of the
    address -- confirm against the data.

    Parameters
    ----------
    address : any
        The address value; it is converted with ``str()`` first, so
        non-string values such as NaN or None simply yield ``None``.

    Returns
    -------
    str or None
        The four-digit postcode as a string, or ``None`` when no standalone
        four-digit number is present.
    """
    # \b on both sides rejects longer digit runs such as "60004"
    candidates = re.findall(r'\b\d{4}\b', str(address))
    if candidates:
        return candidates[-1]
    return None

# Build the 'Postcode' column by running extract_postcode over every 'Client Agency Address'
df['Postcode'] = df['Client Agency Address'].apply(extract_postcode)


In [4]:
# Number of rows for which no postcode was extracted
missing_postcode_count = df['Postcode'].isna().sum()
print(missing_postcode_count)

19


In [5]:
# Select the rows whose 'Postcode' is missing
postcode_missing_mask = df['Postcode'].isna()
na_postcode_rows = df.loc[postcode_missing_mask]

# Show the offending addresses next to the (empty) postcode
print("Rows with NA Postcode and their Address:")
print(na_postcode_rows.loc[:, ['Client Agency Address', 'Postcode']])

Rows with NA Postcode and their Address:
     Client Agency Address Postcode
6949              . . .  .     None
6950              . . .  .     None
6951              . . .  .     None
6952              . . .  .     None
6953              . . .  .     None
6954              . . .  .     None
6955              . . .  .     None
6956              . . .  .     None
6957              . . .  .     None
6958              . . .  .     None
6959              . . .  .     None
6960              . . .  .     None
6961              . . .  .     None
6962              . . .  .     None
6963              . . .  .     None
6964              . . .  .     None
6965              . . .  .     None
6966              . . .  .     None
6967              . . .  .     None


In [6]:
# Lookup table: postcode (as a string) -> list of suburbs listed under that postcode,
# in the order the rows appear in postcode_df
postcode_to_suburbs = {}

# Walk the two columns in parallel; setdefault creates the list on first sight of a postcode
for suburb_name, raw_postcode in zip(postcode_df['Geo Suburb'], postcode_df['Geo Postcode']):
    postcode_to_suburbs.setdefault(str(raw_postcode), []).append(suburb_name)

In [7]:
def extract_suburb(address):
    """Return the first known suburb whose name and postcode both occur in ``address``.

    The module-level ``postcode_to_suburbs`` mapping (postcode string ->
    list of suburb names) is scanned in insertion order. A suburb is
    returned as soon as both its postcode and its name are found as
    substrings of ``address``.

    Parameters
    ----------
    address : any
        The address to inspect. Non-string values (for example NaN from an
        empty Excel cell, or None) yield ``None`` instead of raising.

    Returns
    -------
    str or None
        The matching suburb name, or ``None`` when nothing matches.
    """
    # `x in address` raises TypeError when address is a float NaN / None
    if not isinstance(address, str):
        return None

    for postcode, suburbs in postcode_to_suburbs.items():
        # The postcode test does not depend on the suburb, so do it once per
        # postcode and skip the whole suburb list when it fails.
        if postcode not in address:
            continue
        for suburb in suburbs:
            # A blank suburb cell is read as NaN; skip it rather than crash
            if isinstance(suburb, str) and suburb in address:
                return suburb
    return None


In [8]:
# Build the 'Suburb' column by mapping extract_suburb over every 'Client Agency Address'
df['Suburb'] = df['Client Agency Address'].map(extract_suburb)

In [9]:
# Select the rows whose 'Suburb' is missing
suburb_missing_mask = df['Suburb'].isna()
na_suburb_rows = df.loc[suburb_missing_mask]

# Show the offending addresses next to the (empty) suburb
print("Rows with NA Suburb and their Address:")
print(na_suburb_rows.loc[:, ['Client Agency Address', 'Suburb']])

Rows with NA Suburb and their Address:
     Client Agency Address Suburb
6949              . . .  .   None
6950              . . .  .   None
6951              . . .  .   None
6952              . . .  .   None
6953              . . .  .   None
6954              . . .  .   None
6955              . . .  .   None
6956              . . .  .   None
6957              . . .  .   None
6958              . . .  .   None
6959              . . .  .   None
6960              . . .  .   None
6961              . . .  .   None
6962              . . .  .   None
6963              . . .  .   None
6964              . . .  .   None
6965              . . .  .   None
6966              . . .  .   None
6967              . . .  .   None


In [10]:
# Left-join the postcode table onto df, matching on the (postcode, suburb) pair,
# then discard the right-hand key columns, which only duplicate 'Postcode' / 'Suburb'
merged_df = df.merge(
    postcode_df,
    how='left',
    left_on=['Postcode', 'Suburb'],
    right_on=['Geo Postcode', 'Geo Suburb'],
).drop(columns=['Geo Postcode', 'Geo Suburb'])


In [11]:
# Select the merged rows that received no 'Geo Latitude' from the postcode table
latitude_missing_mask = merged_df['Geo Latitude'].isna()
na_Latitude_rows = merged_df.loc[latitude_missing_mask]

# Show which postcodes those rows carry
print("Rows with NA Geo Latitude and their Postcode:")
print(na_Latitude_rows.loc[:, 'Postcode'])

Rows with NA Geo Latitude and their Postcode:
6949    None
6950    None
6951    None
6952    None
6953    None
6954    None
6955    None
6956    None
6957    None
6958    None
6959    None
6960    None
6961    None
6962    None
6963    None
6964    None
6965    None
6966    None
6967    None
Name: Postcode, dtype: object


In [12]:
# Write the merged result to a new workbook, leaving out the DataFrame index
merged_df.to_excel(excel_writer='tender_postcode.xlsx', index=False)
