In [2]:
import os
import zipfile
import shutil
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

In [3]:
# extract tenders from zip files using mitch's code 

def search(search_path):

    ref_dict = {}

# recursively through all files and folders
    for root, dirs, files in os.walk(search_path):
        for filename in files:
            if filename.endswith(".zip"):
                # get ref num
                ref = filename.rsplit("-specification.zip", 1)[0]
                file_path = os.path.join(root, filename)

                # open read zip
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
                    doc_files = [file for file in file_list if "request" in file.lower() and file.lower().endswith(('.doc', '.docx'))]

                    if doc_files:
                        # add sub files, if any
                        ref_dict.setdefault(ref, []).extend([(doc_name, file_path) for doc_name in doc_files])

    for ref, doc_list in ref_dict.items():
        print(f"Reference: {ref}")
        for doc_name, file_path in doc_list:
            print(f"Document Name: {doc_name}, ZIP File Path: {file_path}")

    return ref_dict

###########################################
### filter for one relevant doc per ref ###
###########################################

def copy(copy_path, ref_dict):
    # make output folder, in not already there
    os.makedirs(copy_path, exist_ok=True)

    for ref, doc_list in ref_dict.items():
        for doc_name, file_path in doc_list:
            # copy relevant docs from zip to output folder
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                for file in zip_ref.namelist():
                    if file == doc_name:
                        extracted_path = os.path.join(copy_path, f"{ref}.docx")
                        with zip_ref.open(file) as source, open(extracted_path, "wb") as target:
                            shutil.copyfileobj(source, target)
                        print(f"Extracted: {doc_name} from {file_path} to {extracted_path}")
    return

zip_files_path = "D:/study/units/capstone/Tenders/Tenders"


output_folder = "D:/study/units/capstone/Tenders/TendersDoc"

def extract(search_path, copy_path):
    ref_dict = search(search_path)
    copy(copy_path, ref_dict)    
    return

extract(zip_files_path, output_folder)

Reference: 02022021DLGSCPTT
Document Name: 02022021DLGSCPTT Request.docx, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\02022021DLGSCPTT-specification.zip
Reference: 18044848001
Document Name: ADDENDUM 1 TO REQUEST DOCUMENT.DOC, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\18044848001-specification.zip
Reference: 18086197002
Document Name: eDoc - Request For Tender V 3.docx, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\18086197002-specification.zip
Reference: 18122020DLGSCPTT
Document Name: Request #18122020DLGSCPTT - Cleaning Services - AEC - Final.DOCX, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\18122020DLGSCPTT-specification.zip
Reference: 19059841006
Document Name: Request P010 Fire and Smoke Wall Compartments Construction.DOC, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\19059841006-specification.zip
Reference: 19064022
Document Name: ADDENDUM TO REQUEST DOCUMENT.DOC, ZIP File Path: D:/study/units/capstone/Tenders/Tenders\19064022

Extracted: BMW0181719 Mechanical HVAC Services Request PART B.docx from D:/study/units/capstone/Tenders/Tenders\BMW0181719-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\BMW0181719.docx
Extracted: Minor Works SAMPLE-REQUEST FOR STATUTORY DECLARATION-V1-FINAL-010719.doc from D:/study/units/capstone/Tenders/Tenders\BMW0194219-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\BMW0194219.docx
Extracted: REQUEST - PART B - BMW0264419 Liquid Waste.docx from D:/study/units/capstone/Tenders/Tenders\BMW0264419-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\BMW0264419.docx
Extracted: BMW0316920 REQUEST - Provision of Routine Maintenance Services and Breakdown Repairs for Mechanical Services.docx from D:/study/units/capstone/Tenders/Tenders\BMW0316920-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\BMW0316920.docx
Extracted: REQUEST - BMW0463519 - Doors &amp; Gates - PART B.docx from D:/study/units/capstone/Tenders/Tenders\BMW0463519

Extracted: Final Request.DOCX from D:/study/units/capstone/Tenders/Tenders\CRT201904338B-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\CRT201904338B.docx
Extracted: CRTF200000107 - Request - Specialised Training Services - Kalgoorlie.docx from D:/study/units/capstone/Tenders/Tenders\CRTF200000107-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\CRTF200000107.docx
Extracted: CRTF200000936 - Development - Northam Cleaning - Request - FINAL.DOCX from D:/study/units/capstone/Tenders/Tenders\CRTF200000936-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\CRTF200000936.docx
Extracted: CRT200001038 Request FINAL.docx from D:/study/units/capstone/Tenders/Tenders\CRTF200001038-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\CRTF200001038.docx
Extracted: CTF2021008 Request - Final.docx from D:/study/units/capstone/Tenders/Tenders\CTF2021008-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\CTF2021008.docx
Extracted: DBC

Extracted: DBCARIAT2519 - Request - Addendum 2_FINAL.doc from D:/study/units/capstone/Tenders/Tenders\DBCARIAT2519-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DBCARIAT2519.docx
Extracted: Tender Request DBCARIA T2722 - Addendum 1 250222.docx from D:/study/units/capstone/Tenders/Tenders\DBCARIAT2722-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DBCARIAT2722.docx
Extracted: DBCARIAT3023 - Request for Tender - Maintenance and Irrigation Contractor.docx from D:/study/units/capstone/Tenders/Tenders\DBCARIAT3023-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DBCARIAT3023.docx
Extracted: DBCARIAT3423 - REQUEST FOR TENDER - Rottnest Island Defence Heritage Oliver Hill Engine Room Museum.docx from D:/study/units/capstone/Tenders/Tenders\DBCARIAT3423-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DBCARIAT3423.docx
Extracted: DBCARIAT3523 Request For Tender v2.docx from D:/study/units/capstone/Tenders/Tenders\DBCARIAT3523-spe

Extracted: DFES202621 - Request.DOCX from D:/study/units/capstone/Tenders/Tenders\DFES202621-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DFES202621.docx
Extracted: DFES202721 - Request - Final.docx from D:/study/units/capstone/Tenders/Tenders\DFES202721-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DFES202721.docx
Extracted: DFES203221 - Request - Final.docx from D:/study/units/capstone/Tenders/Tenders\DFES203221-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DFES203221.docx
Extracted: DFES205621 Request Final.DOCX from D:/study/units/capstone/Tenders/Tenders\DFES205621-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DFES205621.docx
Extracted: DFES206321 Request.docx from D:/study/units/capstone/Tenders/Tenders\DFES206321-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DFES206321.docx
Extracted: 207721 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DFES207721-specification.zip to D:/stud

Extracted: DOC202010787 - Request for Quote - DOC Cleaning Roebourne .docx from D:/study/units/capstone/Tenders/Tenders\DOC202010787-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOC202010787.docx
Extracted: DOC202017468 - Final Request.docx from D:/study/units/capstone/Tenders/Tenders\DOC202017468-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOC202017468.docx
Extracted: Request DOC202017555 Schedule 2 Specification V1.0.docx from D:/study/units/capstone/Tenders/Tenders\DOC202017555-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOC202017555.docx
Extracted: Request DOC202017555 Schedule 8 SLA V1.0.docx from D:/study/units/capstone/Tenders/Tenders\DOC202017555-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOC202017555.docx
Extracted: Request DOC202017555 V1.0.docx from D:/study/units/capstone/Tenders/Tenders\DOC202017555-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOC202017555.docx
Extracted: Cro

Extracted: Attachment 2 - Request Conditions.docx from D:/study/units/capstone/Tenders/Tenders\DoH20194700-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DoH20194700.docx
Extracted: DOH20194700 NEPPTS Request for Tender FINAL.docx from D:/study/units/capstone/Tenders/Tenders\DoH20194700-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DoH20194700.docx
Extracted: DoH20205122 - Final Request.docx from D:/study/units/capstone/Tenders/Tenders\DoH20205122-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DoH20205122.docx
Extracted: DoH20205227 - Request - Final.docx from D:/study/units/capstone/Tenders/Tenders\DoH20205227-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DoH20205227.docx
Extracted: Community Services Request for Tender - Palliative Care Hotline - Final 24072020.docx from D:/study/units/capstone/Tenders/Tenders\DoH20205339-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DoH20205339.docx
Extracted: Do

Extracted: 2021-25310 - Request - Addendum #2 - 14 Sept 21.docx from D:/study/units/capstone/Tenders/Tenders\DOJ202125310-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOJ202125310.docx
Extracted: 2021-25310 - Request - FINAL - Sept 2021-2.docx from D:/study/units/capstone/Tenders/Tenders\DOJ202125310-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOJ202125310.docx
Extracted: 2021-25310 - Request - Addendum #3 - 24 Sept 21.docx from D:/study/units/capstone/Tenders/Tenders\DOJ202125310-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOJ202125310.docx
Extracted: Addendum #1 - Request DOJ202132458.docx from D:/study/units/capstone/Tenders/Tenders\DOJ202132458-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOJ202132458.docx
Extracted: DoJ 2021-32458 - Request - FINAL September 2022.docx from D:/study/units/capstone/Tenders/Tenders\DOJ202132458-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOJ202132458.doc

Extracted: DOT404919 Cleaning Services at Jurien Bay Boat Harbour Request#1.docx from D:/study/units/capstone/Tenders/Tenders\DOT404919-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOT404919.docx
Extracted: DOT405122 - Request - Skip Bin Hire and Weekly Rubbish Removal Services Exmouth Boat Harbour.docx from D:/study/units/capstone/Tenders/Tenders\DOT405122-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOT405122.docx
Extracted: DOT405220 - Request.doc from D:/study/units/capstone/Tenders/Tenders\DOT405220-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOT405220.docx
Extracted: DOT405522 - SCID RFP VOLUME 3 - Request T&amp;Cs.docx from D:/study/units/capstone/Tenders/Tenders\DOT405522-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DOT405522.docx
Extracted: DOT405923 - Request Mid-Tier Transport Planning Market Research.docx from D:/study/units/capstone/Tenders/Tenders\DOT405923-specification.zip to D:/study/units/cap

Extracted: DPIRD2019079 - Request for Quote - Gascoyne Research Facility - Cleaning Services.doc from D:/study/units/capstone/Tenders/Tenders\DPIRD2019079-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DPIRD2019079.docx
Extracted: Request DPIRD202012 Addendum 1.doc from D:/study/units/capstone/Tenders/Tenders\DPIRD2020012-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DPIRD2020012.docx
Extracted: Request DPIRD2020012 v2.DOC from D:/study/units/capstone/Tenders/Tenders\DPIRD2020012-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DPIRD2020012.docx
Extracted: DPIRD2020014 - Request for Quote FINAL.doc from D:/study/units/capstone/Tenders/Tenders\DPIRD2020014-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DPIRD2020014.docx
Extracted: DPIRD2020021 - Request - FINAL.DOC from D:/study/units/capstone/Tenders/Tenders\DPIRD2020021-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DPIRD2020021.docx
Extracted: DPIRD20

Extracted: DWER102021 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DWER102021-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DWER102021.docx
Extracted: DWER102120 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DWER102120A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DWER102120A.docx
Extracted: DWER102319A Request.doc from D:/study/units/capstone/Tenders/Tenders\DWER102319A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DWER102319A.docx
Extracted: DWER102322 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DWER102322-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DWER102322.docx
Extracted: DWER102819 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DWER102819-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\DWER102819.docx
Extracted: DWER103118 - Request.docx from D:/study/units/capstone/Tenders/Tenders\DWER103118-specification.zip to D:/study/units/caps

Extracted: ED200161 - Request - Final.DOCX from D:/study/units/capstone/Tenders/Tenders\ED200161-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED200161.docx
Extracted: ED200197 - Request FINAL.docx from D:/study/units/capstone/Tenders/Tenders\ED200197-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED200197.docx
Extracted: ED200202 - Red Hat Licenses Request - Final.DOCX from D:/study/units/capstone/Tenders/Tenders\ED200202-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED200202.docx
Extracted: ED200214 - Request - Final.doc from D:/study/units/capstone/Tenders/Tenders\ED200214-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED200214.docx
Extracted: ED200226 Request.doc from D:/study/units/capstone/Tenders/Tenders\ED200226-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED200226.docx
Extracted: ED200243 - Provision of Catering Services - Request Final.docx from D:/study/units/capstone/Tenders/Tenders\ED2

Extracted: ED220176 - Request.docx from D:/study/units/capstone/Tenders/Tenders\ED220176-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED220176.docx
Extracted: ED220177 Request.docx from D:/study/units/capstone/Tenders/Tenders\ED220177-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED220177.docx
Extracted: ED220179 Request.docx from D:/study/units/capstone/Tenders/Tenders\ED220179-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED220179.docx
Extracted: ED220180 Request.docx from D:/study/units/capstone/Tenders/Tenders\ED220180-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED220180.docx
Extracted: ED220181 - Request - Security Services for Public Schools in the Esperance and Condingup Areas - Final.docx from D:/study/units/capstone/Tenders/Tenders\ED220181-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ED220181.docx
Extracted: ED220314 Request-Cleaning-(Part A) Final.docx from D:/study/units/capstone/T

Extracted: F2100497 - HBF Park Cleaning Services - Request - FINAL.docx from D:/study/units/capstone/Tenders/Tenders\F2100497-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\F2100497.docx
Extracted: Request - Health and Fitness Wellness Solution.docx from D:/study/units/capstone/Tenders/Tenders\F2100633-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\F2100633.docx
Extracted: Request - Contractor Safety Management Audit Program.docx from D:/study/units/capstone/Tenders/Tenders\F2200098-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\F2200098.docx
Extracted: F2200251 - Request - FINAL.docx from D:/study/units/capstone/Tenders/Tenders\F2200251-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\F2200251.docx
Extracted: F2200625 Event Health Services - Request (Final).docx from D:/study/units/capstone/Tenders/Tenders\F2200625-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\F2200625.docx
Extracted: 1A) MfW PMA 2021 

Extracted: FINW0163822 Request - Part B.docx from D:/study/units/capstone/Tenders/Tenders\FINW0163822-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\FINW0163822.docx
Extracted: 05042147  FINW0163922 - Request - Part B.docx from D:/study/units/capstone/Tenders/Tenders\FINW0163922-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\FINW0163922.docx
Extracted: 05042150  FINW0163922 - Request - Part A(2).docx from D:/study/units/capstone/Tenders/Tenders\FINW0163922-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\FINW0163922.docx
Extracted: FINW0163922 REVISED Request - Part A.docx from D:/study/units/capstone/Tenders/Tenders\FINW0163922-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\FINW0163922.docx
Extracted: FINW0272322-Routine-Maintenance-Request-Part-A.docx from D:/study/units/capstone/Tenders/Tenders\FINW0272322-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\FINW0272322.docx
Extracted: FINW0272322-Routine-M

Extracted: HOU9618320 - Hannans (Kalgoorlie) - Addendum to Request 1.doc from D:/study/units/capstone/Tenders/Tenders\HOU9618320-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\HOU9618320.docx
Extracted: HOU9618320 - Kalgoorlie - Request for Tender.docx from D:/study/units/capstone/Tenders/Tenders\HOU9618320-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\HOU9618320.docx
Extracted: HOU9618320 - Hannans (Kalgoorlie) - Addendum to Request 4_.doc from D:/study/units/capstone/Tenders/Tenders\HOU9618320-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\HOU9618320.docx
Extracted: HOU9618420 - Kalgoorlie - Request for Tender.docx from D:/study/units/capstone/Tenders/Tenders\HOU9618420-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\HOU9618420.docx
Extracted: HOU9618420 - Somerville (Kalgoorlie) - Addendum to Request 4.doc from D:/study/units/capstone/Tenders/Tenders\HOU9618420-specification.zip to D:/study/units/capstone/Tenders/Te

Extracted: JTSI2021015 - Request.docx from D:/study/units/capstone/Tenders/Tenders\JTSI2021015-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\JTSI2021015.docx
Extracted: JTSI2021020 AMC - Transport Program Request FINAL.docx from D:/study/units/capstone/Tenders/Tenders\JTSI2021020-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\JTSI2021020.docx
Extracted: JTSI2021027 -  Addendum to Request (1) - Final.DOCX from D:/study/units/capstone/Tenders/Tenders\JTSI2021027-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\JTSI2021027.docx
Extracted: JTSI2021027 - Request - Final 28.05.2021.docx from D:/study/units/capstone/Tenders/Tenders\JTSI2021027-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\JTSI2021027.docx
Extracted: JTSI2021027 -  Addendum to Request (2) - Final.DOCX from D:/study/units/capstone/Tenders/Tenders\JTSI2021027-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\JTSI2021027.docx
Extracted: JTSI2122031 

Extracted: Community Services Request for Tender - Community mental health step up step down services (Geraldton Kalgoorlie a0.docx from D:/study/units/capstone/Tenders/Tenders\MHC819-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MHC819.docx
Extracted: MHC20 65006  Regional Community mental health step up step down services - Addendum 3 to Request MHC819 - 8 July 20.DOCX from D:/study/units/capstone/Tenders/Tenders\MHC819-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MHC819.docx
Extracted: Regional Community mental health step upstep down services - Addendum 1 to Request - 16 June 2020.DOCX from D:/study/units/capstone/Tenders/Tenders\MHC819-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MHC819.docx
Extracted: Regional Community mental health step upstep down services - Addendum 2 to Request - 26 June 2020.DOCX from D:/study/units/capstone/Tenders/Tenders\MHC819-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MHC819.d

Extracted: Contract 16720 - Tender Request - Provision of Cleaning Services - MRWA South West Region - Bunbury Offices   Depot.DOCX from D:/study/units/capstone/Tenders/Tenders\MRWA016720-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MRWA016720.docx
Extracted: Contract 175 18 Commuity Perception survey 2020-2023  Request FINAL.DOC from D:/study/units/capstone/Tenders/Tenders\MRWA017518-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MRWA017518.docx
Extracted: C179.21 - Tender Request - Panel Contract for the Provision of Road and Bridge Project Management Services.DOCX from D:/study/units/capstone/Tenders/Tenders\MRWA017921-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MRWA017921.docx
Extracted: Request for Goods and Services - Community Perception Survey 2024-2027.docx from D:/study/units/capstone/Tenders/Tenders\MRWA017922-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\MRWA017922.docx
Extracted: 182_20 - Request for

Extracted: Request - NMHS20217609 - Clean Room Pass Through Hatches (Final).DOCX from D:/study/units/capstone/Tenders/Tenders\NMHS20217609-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\NMHS20217609.docx
Extracted: Addendum to Request Coversheet 20220315.docx from D:/study/units/capstone/Tenders/Tenders\NMHS20217933A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\NMHS20217933A.docx
Extracted: Request NMHS20217933A 20220208.docx from D:/study/units/capstone/Tenders/Tenders\NMHS20217933A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\NMHS20217933A.docx
Extracted: NMHS20218279 Request.docx from D:/study/units/capstone/Tenders/Tenders\NMHS20218279-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\NMHS20218279.docx
Extracted: NMHS20218868 - Supply of Security Training and Risk Assessment - Request.docx from D:/study/units/capstone/Tenders/Tenders\NMHS20218868-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\NMHS

Extracted: PTA190071 Major Works (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA190071-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA190071.docx
Extracted: PTA190073 Short Form Consultancy (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA190073-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA190073.docx
Extracted: PTA190076 Minor Services (Book2a) RFT Information Requested from the Tenderer V2.docx from D:/study/units/capstone/Tenders/Tenders\PTA190076-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA190076.docx
Extracted: PTA190077 Supply of Goods (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA190077-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA190077.docx
Extracted: PTA190078 - Minor Services (Book2a) RFT Information Requested from t

Extracted: PTA200104 Major Works (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA200104-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA200104.docx
Extracted: PTA200105 Long Form Consultancy (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA200105-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA200105.docx
Extracted: PTA200106 SBS TCM (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA200106-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA200106.docx
Extracted: PTA200107 Major Works (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA200107-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA200107.docx
Extracted: PTA200109_SBS TCM (Book2a) RFT Information Requested from the Tenderer.docx from D:

Extracted: PTA210130 Rollingstock Services (Book2a) RFT Information Requested from the Tenderer V2.0.docx from D:/study/units/capstone/Tenders/Tenders\PTA210130-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA210130.docx
Extracted: PTA210131 Long Form Consultancy (Book2a) RFT Information Requested from the Tenderer#1.docx from D:/study/units/capstone/Tenders/Tenders\PTA210131-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA210131.docx
Extracted: PTA210133 - Services (Book2a) RFT Information Requested from the Tenderer.docx from D:/study/units/capstone/Tenders/Tenders\PTA210133-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA210133.docx
Extracted: PTA210137 - Long Form Consultancy (Book2a) RFT Information Requested from the Tenderer#2.docx from D:/study/units/capstone/Tenders/Tenders\PTA210137-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\PTA210137.docx
Extracted: PTA210140 (Book2a) RFT Information Requested from

Extracted: Request - QEII20190119 - Replacement of Chiller 5 CDW Discharge Valve 30 July 2019.doc from D:/study/units/capstone/Tenders/Tenders\QEII20190119-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\QEII20190119.docx
Extracted: Request QEII20190120 Reticulation - Self-cleaning Suction Screen CW2000 Final.doc from D:/study/units/capstone/Tenders/Tenders\QEII20190120-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\QEII20190120.docx
Extracted: QEII20190121 Final Request - 22 November 2019.doc from D:/study/units/capstone/Tenders/Tenders\QEII20190121-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\QEII20190121.docx
Extracted: Request - QEII20190133 - J Block Sewage Station Equipment Service 20191031 Final.doc from D:/study/units/capstone/Tenders/Tenders\QEII20190133-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\QEII20190133.docx
Extracted: Request - QEII20190140 Replacement of Cooling Coil and Minor Refurbishment of A.C

Extracted: 1.0 - SMT202022 - Request for Quote - Provision of Two Forklifts.docx from D:/study/units/capstone/Tenders/Tenders\SMT202022-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\SMT202022.docx
Extracted: SMT202301- Request.docx from D:/study/units/capstone/Tenders/Tenders\SMT202301-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\SMT202301.docx
Extracted: SMT202301 - Addendum to Request 1.docx from D:/study/units/capstone/Tenders/Tenders\SMT202301-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\SMT202301.docx
Extracted: Part A to C Request for Tender Document SPA2019100.docx from D:/study/units/capstone/Tenders/Tenders\SPA2019100-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\SPA2019100.docx
Extracted: Part A to C Request for Tender Document SPA2019100 Rev1.docx from D:/study/units/capstone/Tenders/Tenders\SPA2019100-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\SPA2019100.docx
Extracted: Request Do

Extracted: TWDT02222019 - Request  - FINAL.doc from D:/study/units/capstone/Tenders/Tenders\TWDT02222019-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\TWDT02222019.docx
Extracted: TWDT0228 2019 - Request FINAL.doc from D:/study/units/capstone/Tenders/Tenders\TWDT02282019-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\TWDT02282019.docx
Extracted: TWDT02392019A- RTO Auditors and Accreditation Reviewers - Refresh Request FINAL - Tenders WA.doc from D:/study/units/capstone/Tenders/Tenders\TWDT02392019A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\TWDT02392019A.docx
Extracted: TWDT02392019A Addendum 1 to Request.docx from D:/study/units/capstone/Tenders/Tenders\TWDT02392019A-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\TWDT02392019A.docx
Extracted: TWDT02742020 Request - Final.docx from D:/study/units/capstone/Tenders/Tenders\TWDT02742020-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\TWDT02742020.docx

Extracted: Request - WACHS202210410 - Fire Service Equipment FINAL.docx from D:/study/units/capstone/Tenders/Tenders\WACHS202210410-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WACHS202210410.docx
Extracted: Form - Addendum to Request WACHS202210410.DOCX from D:/study/units/capstone/Tenders/Tenders\WACHS202210410-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WACHS202210410.docx
Extracted: Request - WACHS202210509 Servicing and Maintenance for Washing Machines, Dryers and Flatbed Ironers for WACHS GS.docx from D:/study/units/capstone/Tenders/Tenders\WACHS202210509-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WACHS202210509.docx
Extracted: Request for Quote - WACHS202210536.DOCX from D:/study/units/capstone/Tenders/Tenders\WACHS202210536-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WACHS202210536.docx
Extracted: Bunbury Hospital Mental Health Unit X Critical Anti-Ligature Works - Request for Quote (Ad2).doc from D

Extracted: WAPOL01722 Request for Disposal Final.docx from D:/study/units/capstone/Tenders/Tenders\WAPOL01722-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WAPOL01722.docx
Extracted: Request - WAPOL02120 Final.doc from D:/study/units/capstone/Tenders/Tenders\WAPOL02120-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WAPOL02120.docx
Extracted: WAPOL02221 Request Final.docx from D:/study/units/capstone/Tenders/Tenders\WAPOL02221-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WAPOL02221.docx
Extracted: WAPOL02522 - Request Final.docx from D:/study/units/capstone/Tenders/Tenders\WAPOL02522-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WAPOL02522.docx
Extracted: WAPOL02622 Request FINAL.docx from D:/study/units/capstone/Tenders/Tenders\WAPOL02622-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WAPOL02622.docx
Extracted: WAPOL03722 - Request FINAL.docx from D:/study/units/capstone/Tenders/Tenders\WAPOL03722-

Extracted: Request Works - Cabling Works for Streaming Services (HMT, SAC, STCWA) - FINAL.docx from D:/study/units/capstone/Tenders/Tenders\WR05042022ACT-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\WR05042022ACT.docx
Extracted: ZPA1252519 - Provision of Framing Services - Request for Quote - FINAL.doc from D:/study/units/capstone/Tenders/Tenders\ZPA1252519-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ZPA1252519.docx
Extracted: ZPA1252620 - Minor Works Request - Primate Raceways - Addendum 1.docx from D:/study/units/capstone/Tenders/Tenders\ZPA1252620-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ZPA1252620.docx
Extracted: ZPA1252620 - Minor Works Request - Primate Raceways - FINAL.docx from D:/study/units/capstone/Tenders/Tenders\ZPA1252620-specification.zip to D:/study/units/capstone/Tenders/TendersDoc\ZPA1252620.docx
Extracted: ZPA1252620 - Minor Works Request - Primate Raceways - Addendum 2.docx from D:/study/units/capstone/Te

In [6]:
!pip install pandas openpyxl
!pip install python-docx

Collecting python-docx
  Using cached python-docx-0.8.11.tar.gz (5.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py): started
  Building wheel for python-docx (setup.py): finished with status 'done'
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184489 sha256=de7bb6f40977f8d5fcd8936eb0d19772de0d9967e6cfec1a95eb3e842f614bf2
  Stored in directory: c:\users\yfr\appdata\local\pip\cache\wheels\83\8b\7c\09ae60c42c7ba4ed2dddaf2b8b9186cb105255856d6ed3dba5
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [8]:
from docx import Document


# extract doc files using andre's code
def extract_doc_by_header(docx_path, target_header):
    doc = Document(docx_path)
    content = []
    current_header = None
    capture_content = False

    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith("Heading") and paragraph.text == target_header:
            current_header = paragraph.text
            capture_content = True
            content.append(current_header)
        elif capture_content:
            content.append(paragraph.text)
        
        # Stop capturing content if a new header is encountered
        if paragraph.style.name.startswith("Heading") and paragraph.text != current_header:
            capture_content = False
   
    return "\n".join(content)


header = "Background"


data = []


folder_path = output_folder

for item in os.listdir(folder_path):
    try:
        
        file_path = os.path.join(folder_path, item)
        
        extracted_text = extract_doc_by_header(file_path, header)
        
        # get the reference
        reference = os.path.splitext(item)[0]
        
        # save reference and text
        data.append([reference, extracted_text])
    except Exception as e:
        print(f"Error processing {item}: {str(e)}")
        continue

df = pd.DataFrame(data, columns=["Reference", "Extracted_Text"])

# save the data frame into excel file
excel_file = excel_file = r"D:\study\units\capstone\Tenders\TendersDoc\extracted_data.xlsx"
df.to_excel(excel_file, index=False, engine="openpyxl")

print(f"Data has been saved to {excel_file}")


Error processing 18044848001.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\18044848001.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing 19059841006.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\19059841006.docx'
Error processing 19064022.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\19064022.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing 20066768001.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\20066768001.docx'
Error processing 2019275DLGSCL.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\2019275DLGSCL.docx'
Error processing 2019L451588DLGSC.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\2019L451588DLGSC.docx'
Error processing 20221378WAM.docx: "no relationship of type 'http://schemas.openxmlformats.org/officeDocument/2006

Error processing DLGSC029102019.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DLGSC029102019.docx'
Error processing DMIRS1220819.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DMIRS1220819.docx'
Error processing DMIRS20171.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\DMIRS20171.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing DMIRS720520.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DMIRS720520.docx'
Error processing DOC202007741.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\DOC202007741.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing DOC2020789.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOC2020789.docx'
Error processing DOC202128705.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\DOC202128705.docx' is not a Wor

Error processing DOT412420.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT412420.docx'
Error processing DOT412518.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT412518.docx'
Error processing DOT412718rft.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT412718rft.docx'
Error processing DOT412819.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT412819.docx'
Error processing DOT413218.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT413218.docx'
Error processing DOT414121.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT414121.docx'
Error processing DOT600021.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\DOT600021.docx'
Error processing DOT700421.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\DOT700421.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xm

Error processing ED200214.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\ED200214.docx'
Error processing ED200226.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\ED200226.docx'
Error processing ED220003.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\ED220003.docx'
Error processing EMHS20171526.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\EMHS20171526.docx'
Error processing EMHS20182208.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\EMHS20182208.docx'
Error processing EMHS20193259.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\EMHS20193259.docx'
Error processing EMHS20193447.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\EMHS20193447.docx'
Error processing EMHS20193666.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\EMHS20193666.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocume

Error processing MHC727.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\MHC727.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing MHC729.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\MHC729.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing MHC778.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\MHC778.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing MHC818.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\MHC818.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing MRWA017518.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\MRWA017518.docx'
Error processing MWSP13CON2021.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\MWSP13CON2

Error processing TWDV02512020.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\TWDV02512020.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing VW201803015.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\VW201803015.docx'
Error processing WACHS20171603.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\WACHS20171603.docx'
Error processing WACHS20182156.docx: file 'D:/study/units/capstone/Tenders/TendersDoc\WACHS20182156.docx' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'
Error processing WACHS20182643.docx: Package not found at 'D:/study/units/capstone/Tenders/TendersDoc\WACHS20182643.docx'
Error processing WACHS20182666.docx: "no relationship of type 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' in collection"
Error processing WACHS20182963.docx: Package not found at 'D:/st

In [10]:
# combine extracted text and short descriptions 


excel_file1 = r"D:\study\units\capstone\data\Tenders WA_descriptionExtracted.xlsx"
excel_file2 = r"D:\study\units\capstone\Tenders\TendersDoc\extracted_data.xlsx"

df1 = pd.read_excel(excel_file1)
df2 = pd.read_excel(excel_file2)

# merge data frames based on reference
merged_df = pd.merge(df1, df2, left_on='Reference Number', right_on='Reference', how='inner')

# if there is no text extracted, just use short description
merged_df['Content'] = merged_df.apply(
    lambda row: row['Description'] if pd.isnull(row['Extracted_Text']) else row['Description'] + '\n' + row['Extracted_Text'],
    axis=1
)


merged_df = merged_df[['Reference Number', 'Content']]

# drop duplicated data
merged_df = merged_df.drop_duplicates(subset=['Reference Number'], keep='first')

merged_excel_file = r"D:\study\units\capstone\data\merged_data.xlsx"
merged_df.to_excel(merged_excel_file, index=False, engine="openpyxl")

print(f"Merged data has been saved to {merged_excel_file}")


Merged data has been saved to D:\study\units\capstone\data\merged_data.xlsx


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# 读取 unique_data.xlsx 文件
excel_file = r"D:\study\units\capstone\data\merged_data.xlsx"
df = pd.read_excel(excel_file)

# 提取 Content 列的文本内容
documents = df["Content"].tolist()

# 使用 TF-IDF 向量化文本数据
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# 指定主题数量
num_topics = 50  # 您可以根据需要更改主题数量

# 使用 NMF 模型进行主题建模
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)

# 获取主题的词汇
feature_names = tfidf_vectorizer.get_feature_names_out()

# 打印每个主题的前几个词汇
top_words_per_topic = []
for topic_idx, topic in enumerate(nmf_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
    top_words_per_topic.append(top_words)
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")

# 将主题分配添加到原始数据框
df["Topic"] = nmf_matrix.argmax(axis=1) + 1

# 保存包含主题分配的数据框到新的 Excel 文件
output_excel_file = "unique_data_with_topics.xlsx"
df.to_excel(output_excel_file, index=False, engine="openpyxl")

print(f"Data with topics has been saved to {output_excel_file}")
