In [1]:
import pandas as pd

In [3]:
meta_df = pd.read_csv('metadata.csv')  # Importing the metadata file

In [5]:
meta_df

Unnamed: 0,decision_id,location,title,date,company,decision,extras,tag
0,DRN-4813614,decision/DRN-4813614.pdf,Decision Reference DRN-4813614,28 May 2024,Wakam,Not upheld,,Insurance
1,DRN-4751994,decision/DRN-4751994.pdf,Decision Reference DRN-4751994,28 May 2024,Royal & Sun Alliance Insurance Limited,Upheld,,Insurance
2,DRN-4819277,decision/DRN-4819277.pdf,Decision Reference DRN-4819277,28 May 2024,AXA Insurance UK Plc,Upheld,,Insurance
3,DRN-4806276,decision/DRN-4806276.pdf,Decision Reference DRN-4806276,28 May 2024,Inter Partner Assistance SA,Not upheld,,Insurance
4,DRN-4782287,decision/DRN-4782287.pdf,Decision Reference DRN-4782287,28 May 2024,Admiral Insurance (Gibraltar) Limited,Not upheld,,Insurance
...,...,...,...,...,...,...,...,...
6656,DRN-4229415,decision/DRN-4229415.pdf,Decision Reference DRN-4229415,17 Jul 2023,Covea Insurance plc,Not upheld,,Insurance
6657,DRN-4001454,decision/DRN-4001454.pdf,Decision Reference DRN-4001454,17 Jul 2023,Unum Ltd,Not upheld,,Insurance
6658,DRN-4248624,decision/DRN-4248624.pdf,Decision Reference DRN-4248624,17 Jul 2023,AXA Insurance UK Plc,Upheld,,Insurance
6659,DRN-4222353,decision/DRN-4222353.pdf,Decision Reference DRN-4222353,17 Jul 2023,Skyfire Insurance Company Limited,Not upheld,,Insurance


In [7]:
# User defined function for parsing the decision files to extract the content

import os
import fitz  # PyMuPDF
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

def process_text(text, decision_id):
    """Process the extracted text to find respective sections."""
    sections = {
        "decision_id": decision_id,
        "Complaint_Info": "",
        "Complaint_Explanation": "",
        "Decision_Taken_And_Reason": "",
        "Final_Decision": ""
    }

    # Define keywords to identify section boundaries
    section_keywords = {
        "Complaint_Info": ["The complaint", "complaint", "summary of complaint", "complaint and background", "summary", "complaint & background",
                           "complaint and summary", "summary and background to complaint", "complaint.", "summary complaint",
                           "summary and background of complaint", "complaint / background", "complaints", "circumstances",
                           "The complaint and what happened", "The complaint and background", "summary and background to the complaint"],
        "Complaint_Explanation": ["What happened", "background", "background to complaint", "background and summary to complaint"],
        "Decision_Taken_And_Reason": ["What I’ve decided – and why", "my findings","findings"],
        "Final_Decision": ["My final decision", "my final decision", "my decision", "final decision","decision"]
    }

    current_section = None
    section_set = set()

    lines = text.splitlines()

    for line in lines:
        line = line.strip()

        # Determine if the line contains a section header keyword and is the only content on the line
        section_found = False
        for section, keywords in section_keywords.items():
            for keyword in keywords:
                if line.lower() == keyword.lower() and section not in section_set:
                    current_section = section
                    section_set.add(section)
                    section_found = True
                    break
            if section_found:
                break

        # If the line doesn't match any section keyword, add it to Miscellaneous
        if not section_found and current_section:
            sections[current_section] += line + " "

    # Clean up extra spaces in the sections
    for section in sections:
        sections[section] = sections[section].strip()

    return sections

def parse_pdfs_in_folder(folder_path):
    """Parse all PDFs in the specified folder and save the sections in a DataFrame."""
    
    records = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)
            decision_id = os.path.splitext(filename)[0]  # Use filename (without extension) as decision ID
            sections = process_text(text, decision_id)
            records.append(sections)

    df = pd.DataFrame(records)
    return df

In [9]:
# Running the above user defined fucntion to get the parsed content from files present in decisions folder.
pdf_folder_path = "decisions"
parsed_df = parse_pdfs_in_folder(pdf_folder_path)

In [10]:
parsed_df

Unnamed: 0,decision_id,Complaint_Info,Complaint_Explanation,Decision_Taken_And_Reason,Final_Decision
0,DRN-4298255,Ms R complains about the way Amtrust Europe Li...,"In July 2022, Ms R amended her existing person...",I’ve considered all the available evidence and...,"For the reasons I’ve given above, my final dec..."
1,DRN-4076446,"A limited company, which I’ll refer to as S co...",The details of the complaint are well known to...,I’ve considered all the available evidence and...,My final decision is that I uphold S’s complai...
2,DRN-4806913,Mr G and Mrs K are unhappy that AML Associates...,Mr G and Mrs K applied for the policy – throug...,I’ve considered all the available evidence and...,I partially uphold Mr G and Mrs K’s complaint ...
3,DRN-4245746,Miss B complains that The Equine and Livestock...,Miss B has an insurance policy with EL that pr...,I’ve considered all the available evidence and...,My decision is that I do not uphold this compl...
4,DRN-4319039,Mrs D complains about British Gas Insurance Li...,Mrs D held a home emergency policy with BG and...,I’ve considered all the available evidence and...,"For the reasons given, I partially uphold Mrs ..."
...,...,...,...,...,...
6656,DRN-4242030,Mr and Mrs D are unhappy with the way Amtrust ...,The circumstances of this complaint aren’t in ...,I’ve considered all the available evidence and...,I uphold this complaint. I require Amtrust Eur...
6657,DRN-4597330,Mr P complains about the way Royal & Sun Allia...,The details of this complaint are well known t...,I’ve considered all the available evidence and...,My final decision is I don’t uphold Mr P’s com...
6658,DRN-4256045,Mr S has complained about the amount Admiral I...,The details of this complaint are well known t...,I’ve considered all the available evidence and...,"For the reasons I’ve explained above, and in m..."
6659,DRN-4353897,Mr M’s complaint is about a claim he made on h...,Mr M made a claim on his DAS legal expenses in...,I’ve considered all the available evidence and...,"For the reasons set out above, I don’t uphold ..."


In [11]:
parsed_df.nunique()

decision_id                  6661
Complaint_Info               6640
Complaint_Explanation        6549
Decision_Taken_And_Reason    6619
Final_Decision               6645
dtype: int64

In [12]:
parsed_df['Complaint_Info'].iloc[223]

'Mr A complains that Marshmallow Insurance Limited wants to use a non-manufacturer’s part to replace his windscreen following a claim made on his motor insurance policy. He wants it to use a manufacturer’s part.'

In [21]:
parsed_df['Complaint_Explanation'].iloc[1213]

'Mr S holds home emergency cover with Accredited. He made a claim after his boiler broke down. An engineer attended and found a fault with a circuit board (PCB). The cost of the repair exceeded the £300 policy limit by £46.27 and so Mr S paid the difference. Accredited confirmed that Mr S could buy some heaters and it would reimburse him up to £100 for this. A new PCB was ordered, but when an engineer attended to install it, they found that it was the wrong PCB. The correct PCB was then ordered. After this was installed, the engineer found another problem (with the thermistor). Accredited initially said Mr S would need to arrange his own repair at that point, as the policy limit had been reached. Though I understand Accredited did then order and install a new thermistor (which Mr S paid for). Mr S complained to Accredited about its handling of the claim. He was offered £200 compensation due to the issues with the initial repair and for Accredited’s lack of communication. Unhappy with t

In [14]:
parsed_df[(parsed_df['Complaint_Info'] == '') & (parsed_df['Complaint_Explanation'] == '')].shape

(0, 5)

In [23]:
merged_df = pd.merge(meta_df, parsed_df, on='decision_id', how='inner') # Merging the metadata and the dataframe which had the parsed content
merged_df

Unnamed: 0,decision_id,location,title,date,company,decision,extras,tag,Complaint_Info,Complaint_Explanation,Decision_Taken_And_Reason,Final_Decision
0,DRN-4813614,decision/DRN-4813614.pdf,Decision Reference DRN-4813614,28 May 2024,Wakam,Not upheld,,Insurance,Mr S complains about the price quoted by Wakam...,Mr S received a quote to renew his policy whic...,I’ve considered all the available evidence and...,"For the reasons I have given, it is my final d..."
1,DRN-4751994,decision/DRN-4751994.pdf,Decision Reference DRN-4751994,28 May 2024,Royal & Sun Alliance Insurance Limited,Upheld,,Insurance,Mr M has complained that Royal & Sun Alliance ...,Mr M made a claim for the repair of his car wh...,I’ve considered all the available evidence and...,"So, for these reasons, it’s my final decision ..."
2,DRN-4819277,decision/DRN-4819277.pdf,Decision Reference DRN-4819277,28 May 2024,AXA Insurance UK Plc,Upheld,,Insurance,Mr D has complained about the service received...,Mr L returned home from holiday in May 2023 to...,I’ve considered all the available evidence and...,"For the reasons given above, I uphold Mr L’s c..."
3,DRN-4806276,decision/DRN-4806276.pdf,Decision Reference DRN-4806276,28 May 2024,Inter Partner Assistance SA,Not upheld,,Insurance,"Mr F, Mrs F and Miss F complaint that Inter Pa...","Mr F, Mrs F and Miss F were due to go on holid...",I’ve considered all the available evidence and...,Inter Partner Assistance SA has already made a...
4,DRN-4782287,decision/DRN-4782287.pdf,Decision Reference DRN-4782287,28 May 2024,Admiral Insurance (Gibraltar) Limited,Not upheld,,Insurance,Ms W complains that Admiral Insurance (Gibralt...,"In June 2023, Ms W took out a single trip trav...",I’ve considered all the available evidence and...,"For the reasons I’ve given above, my final dec..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6656,DRN-4229415,decision/DRN-4229415.pdf,Decision Reference DRN-4229415,17 Jul 2023,Covea Insurance plc,Not upheld,,Insurance,"A limited company, which I will call L, has co...",L is a restaurant. L contacted Covea to make a...,I’ve considered all the available evidence and...,I do not uphold this complaint. Under the rule...
6657,DRN-4001454,decision/DRN-4001454.pdf,Decision Reference DRN-4001454,17 Jul 2023,Unum Ltd,Not upheld,,Insurance,Mr S complains that Unum Ltd has turned down a...,The background to this complaint is well-known...,I’ve considered all the available evidence and...,"For the reasons I’ve given above, my final dec..."
6658,DRN-4248624,decision/DRN-4248624.pdf,Decision Reference DRN-4248624,17 Jul 2023,AXA Insurance UK Plc,Upheld,,Insurance,Ms M complains that AXA Insurance UK Plc unfai...,Ms M had home insurance that was underwritten ...,I’ve considered all the available evidence and...,"For the reasons I’ve given, I uphold Ms M’s co..."
6659,DRN-4222353,decision/DRN-4222353.pdf,Decision Reference DRN-4222353,17 Jul 2023,Skyfire Insurance Company Limited,Not upheld,,Insurance,Ms M complains about Skyfire Insurance Company...,"Ms M held a motor insurance policy, underwritt...",I’ve considered all the available evidence and...,"For the reasons outlined above, I don’t uphold..."


In [25]:
# Path to the CSV file
csv_file_path = "updated_final_merged_df.csv"

In [27]:
existing_df = pd.read_csv(csv_file_path)
existing_df

Unnamed: 0,decision_id,location,title,date,company,decision,extras,tag,Complaint_Info,Complaint_Explanation,Decision_Taken_And_Reason,Final_Decision
0,DRN7934249,decision/DRN7934249.pdf,Decision Reference DRN7934249,15 Jul 2014,Bank of Scotland Plc,Not upheld,,Payment protection insurance (PPI),This complaint concerns a mortgage payment pro...,Our adjudicator considered that this complaint...,I have considered all of the available evidenc...,"For the reasons set out above, I do not uphold..."
1,DRN0343895,decision/DRN0343895.pdf,Decision Reference DRN0343895,15 Jul 2014,Nationwide Building Society,Not upheld,,Payment protection insurance (PPI),This complaint concerns a regular premium paym...,Our adjudicator considered Mrs T’s complaint a...,I have considered all of the available evidenc...,"For the reasons set out above, I do not uphold..."
2,DRN6179184,decision/DRN6179184.pdf,Decision Reference DRN6179184,15 Jul 2014,Lloyds Bank PLC,Upheld,,Payment protection insurance (PPI),This complaint concerns a regular premium paym...,The background to this complaint was set out i...,I have considered all the available evidence a...,"For the reasons set out above, I uphold Mrs T’..."
3,DRN0098468,decision/DRN0098468.pdf,Decision Reference DRN0098468,15 Jul 2014,Anglian Windows Ltd,Upheld,,Payment protection insurance (PPI),Mr L and Miss M have complained that they were...,Mr L and Miss M took out a loan with Anglian W...,I have considered all the available evidence a...,For the reasons stated above I uphold Mr L and...
4,DRN9420247,decision/DRN9420247.pdf,Decision Reference DRN9420247,15 Jul 2014,Bank of Scotland Plc,Not upheld,,Payment protection insurance (PPI),This complaint concerns a regular mortgage pay...,An adjudicator at this service has considered ...,I have considered all the available evidence a...,Having carefully considered all the evidence a...
...,...,...,...,...,...,...,...,...,...,...,...,...
154131,DRN-3573540,decision/DRN-3573540.pdf,Decision Reference DRN-3573540,15 Jul 2022,Royal & Sun Alliance Insurance Limited,Upheld,,Insurance,Mr and Mrs F complain Royal and Sun Alliance P...,In April 2021 Mrs F discovered damp to a carpe...,I’ve considered all the available evidence and...,For the reasons I have given I uphold this com...
154132,DRN-3397931,decision/DRN-3397931.pdf,Decision Reference DRN-3397931,15 Jul 2022,British Gas Services Limited,Upheld,,Insurance,E complains about how British Gas Services Lim...,"Mr O is a director of E, which is a business t...",I’ve considered all the available evidence and...,My final decision is that I uphold this compla...
154133,DRN-3306294,decision/DRN-3306294.pdf,Decision Reference DRN-3306294,15 Jul 2022,Covea Insurance plc,Not upheld,,Insurance,Mr D is unhappy with the way Covea Insurance p...,Mr D held a personal accident policy underwrit...,I’ve considered all the available evidence and...,My final decision is that I don’t uphold this ...
154134,DRN-3597660,decision/DRN-3597660.pdf,Decision Reference DRN-3597660,15 Jul 2022,Lloyds Bank General Insurance Limited,Upheld,,Insurance,Mr K complains that Lloyds Bank General Insura...,Mr K noticed a leak from his en-suite shower i...,I’ve considered all the available evidence and...,For the reasons I’ve given above and in my pro...


In [28]:
combined_df = pd.concat([existing_df, merged_df], ignore_index=True) # Concatenate the current dataframe to the existing dataframe which has data from previous years.
combined_df

Unnamed: 0,decision_id,location,title,date,company,decision,extras,tag,Complaint_Info,Complaint_Explanation,Decision_Taken_And_Reason,Final_Decision
0,DRN7934249,decision/DRN7934249.pdf,Decision Reference DRN7934249,15 Jul 2014,Bank of Scotland Plc,Not upheld,,Payment protection insurance (PPI),This complaint concerns a mortgage payment pro...,Our adjudicator considered that this complaint...,I have considered all of the available evidenc...,"For the reasons set out above, I do not uphold..."
1,DRN0343895,decision/DRN0343895.pdf,Decision Reference DRN0343895,15 Jul 2014,Nationwide Building Society,Not upheld,,Payment protection insurance (PPI),This complaint concerns a regular premium paym...,Our adjudicator considered Mrs T’s complaint a...,I have considered all of the available evidenc...,"For the reasons set out above, I do not uphold..."
2,DRN6179184,decision/DRN6179184.pdf,Decision Reference DRN6179184,15 Jul 2014,Lloyds Bank PLC,Upheld,,Payment protection insurance (PPI),This complaint concerns a regular premium paym...,The background to this complaint was set out i...,I have considered all the available evidence a...,"For the reasons set out above, I uphold Mrs T’..."
3,DRN0098468,decision/DRN0098468.pdf,Decision Reference DRN0098468,15 Jul 2014,Anglian Windows Ltd,Upheld,,Payment protection insurance (PPI),Mr L and Miss M have complained that they were...,Mr L and Miss M took out a loan with Anglian W...,I have considered all the available evidence a...,For the reasons stated above I uphold Mr L and...
4,DRN9420247,decision/DRN9420247.pdf,Decision Reference DRN9420247,15 Jul 2014,Bank of Scotland Plc,Not upheld,,Payment protection insurance (PPI),This complaint concerns a regular mortgage pay...,An adjudicator at this service has considered ...,I have considered all the available evidence a...,Having carefully considered all the evidence a...
...,...,...,...,...,...,...,...,...,...,...,...,...
160792,DRN-4229415,decision/DRN-4229415.pdf,Decision Reference DRN-4229415,17 Jul 2023,Covea Insurance plc,Not upheld,,Insurance,"A limited company, which I will call L, has co...",L is a restaurant. L contacted Covea to make a...,I’ve considered all the available evidence and...,I do not uphold this complaint. Under the rule...
160793,DRN-4001454,decision/DRN-4001454.pdf,Decision Reference DRN-4001454,17 Jul 2023,Unum Ltd,Not upheld,,Insurance,Mr S complains that Unum Ltd has turned down a...,The background to this complaint is well-known...,I’ve considered all the available evidence and...,"For the reasons I’ve given above, my final dec..."
160794,DRN-4248624,decision/DRN-4248624.pdf,Decision Reference DRN-4248624,17 Jul 2023,AXA Insurance UK Plc,Upheld,,Insurance,Ms M complains that AXA Insurance UK Plc unfai...,Ms M had home insurance that was underwritten ...,I’ve considered all the available evidence and...,"For the reasons I’ve given, I uphold Ms M’s co..."
160795,DRN-4222353,decision/DRN-4222353.pdf,Decision Reference DRN-4222353,17 Jul 2023,Skyfire Insurance Company Limited,Not upheld,,Insurance,Ms M complains about Skyfire Insurance Company...,"Ms M held a motor insurance policy, underwritt...",I’ve considered all the available evidence and...,"For the reasons outlined above, I don’t uphold..."


In [31]:
# Save the updated DataFrame to the CSV file
combined_df.to_csv(csv_file_path, index=False)  # Final dataframe to use after merging

In [None]:
'''

2013 - 2014 data is appended...

2014 - 2015 data is appended...

2015 - 2016 data is appended..

2016 - 2017 data is appended..

2017 - 2018 data is appended..

2018 - 2019 data is  appended..

2019 - 2020 data is appended..

2020 - 2021 data is  appended..

2021 - 2022 data is appended..

2022 - 2023 data is appended..

2023 - 2024 data is appended..

'''

In [None]:
************************************************************