# Import the Required Packages

In [None]:
# Basic data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Date and time handling
from datetime import datetime, timedelta

# File operations
import os
import shutil

# Regular expressions
import re

# PDF handling
import PyPDF2
from PyPDF2 import PdfReader  # PdfFileReader has been replaced by PdfReader in PyPDF2 2.x versions

# JSON handling
import json

# OpenAI API
import openai
openai.api_key = "<openai_api_key>"

# Code for the Crawler Section

In [None]:
def extract_text_between_keywords(text, start_keyword, end_keyword):
    """
    Extracts and returns the text between two specified keywords in a given string.

    The first occurrence of start_keyword is located, and end_keyword is then
    searched for only *after* the end of that start_keyword. This means identical
    or overlapping markers (e.g. "---" ... "---") are handled correctly instead of
    matching the end marker inside the start marker.

    Parameters:
    text (str): The string from which to extract the text. An empty or None value
                yields an empty string.
    start_keyword (str): The keyword marking the beginning of the desired text.
    end_keyword (str): The keyword marking the end of the desired text.

    Returns:
    str: The whitespace-stripped text found between the start_keyword and end_keyword.
         If start_keyword is not found, an empty string is returned. If end_keyword is
         not found after start_keyword, the text from start_keyword to the end of the
         string is returned.
    """
    if not text:
        return ""

    start_index = text.find(start_keyword)
    if start_index == -1:
        return ""

    # Position of the first character following the start keyword
    content_start = start_index + len(start_keyword)

    # Search for the end keyword only past the start keyword itself
    end_index = text.find(end_keyword, content_start)
    if end_index == -1:
        return text[content_start:].strip()
    return text[content_start:end_index].strip()

def process_pdf_files(pdf_folder, keyword_pairs=None):
    """
    Processes all PDF files in a specified folder, extracting text content based on keyword pairs.

    The function iterates (in sorted filename order) through each file in the given folder whose
    name ends with ".pdf", concatenates the text of all its pages, and extracts the text between
    the first keyword pair that yields a non-empty result. Files that raise any exception while
    being read are reported, recorded in the error list, and still get a row with an empty
    background so that every PDF is represented.

    Parameters:
    pdf_folder (str): The path to the folder containing the PDF files to be processed.
    keyword_pairs (iterable of (str, str), optional): (start_keyword, end_keyword) pairs tried in
        order. Defaults to ("background", "my findings") followed by
        ("What happened", "What I’ve decided – and why").

    Returns:
    tuple:
        - pd.DataFrame: A DataFrame with columns "location" ("decision/<filename>") and
                        "background" (the extracted text). dropna() is applied, but because
                        the background is always a string, rows with an empty background ("")
                        are kept; callers must handle empty text themselves.
        - list: A list of filenames that encountered errors during processing.
    """
    if keyword_pairs is None:
        keyword_pairs = (
            ("background", "my findings"),
            ("What happened", "What I’ve decided – and why"),
        )

    data = []  # Rows of [location, background]
    error_files = []  # Filenames that could not be read

    # os.listdir returns entries in arbitrary order; sort for reproducible output
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        location = f"decision/{filename}"

        try:
            with open(pdf_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # Join once instead of repeated "+="; `or ""` guards against a page
                # yielding None instead of a string.
                # NOTE(review): confirm whether the installed PyPDF2 can return None here.
                text_content = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
        except Exception as e:
            # Best-effort per file: report, remember the failure, keep an empty row
            print(f"Error processing file {filename}: {e}")
            error_files.append(filename)
            data.append([location, ""])  # Append an empty text entry for consistency
            continue

        # Use the first keyword pair that produces a non-empty extraction
        background_text = ""
        for start_keyword, end_keyword in keyword_pairs:
            background_text = extract_text_between_keywords(text_content, start_keyword, end_keyword)
            if background_text:
                break

        data.append([location, background_text])

    # Create a DataFrame from the data list with specified column names
    df = pd.DataFrame(data, columns=["location", "background"])

    # Remove rows with missing (NaN/None) values; empty strings are not affected
    df_cleaned = df.dropna()

    return df_cleaned, error_files

In [None]:
def extract_information(text):
    """
    Ask the gpt-3.5-turbo chat model to pull structured fields out of a complaint text.

    A fixed system message and a single user prompt are sent. The prompt asks the model for:
    1. Gender of the complainant (coded 0, 1 or 2).
    2. Category of the insurance product (one of c1 to c11).
    3. Compensation amount, or "Non-pecuniary compensation" if unspecified.
    4-10. A 1/0 flag for each of seven complaint reasons:
       - Service quality
       - Misleading information
       - Insurance policy management issues
       - Delays in payment or processing
       - Inadequate final compensation
       - Claim denial
       - Non-monetary services in the claims process

    The prompt also instructs the model that, when all seven reason flags would be 0, it
    should pick the most similar reason and mark that one as 1. This is an instruction to
    the model; this function does not post-process or validate the answer.

    Parameters:
    text (str): The text of the complaint from which to extract the information.

    Returns:
    str: The content of the first choice in the model's response. The prompt requests a
         JSON object of the form:
         {"Gender": "", "Category": "", "Compensation": "", "R1(Service quality)": "", "R2(Misleading information)": "",
         "R3(Insurance policy management issues)": "", "R4(Delayed payment or processing)": "", "R5(Inadequate compensation)": "",
         "R6(Claim denial)": "", "R7(Non-monetary services in claims)": ""}.
    """
    # Fixed role description for the model
    system_message = {
        "role": "system",
        "content": "You are an assistant that extracts specific information from complaints."
    }

    # Pieces of the user prompt, concatenated in order without any separator
    prompt_parts = [
        "Extract the following information from the text:\n",
        "1. Gender (must choose from: 0 for Male, 1 for Female, or 2 for Male and Female, can not return empty)\n",
        "2. Category of insurance product (must choose one from: c1 for Life and Health Insurance, c2 for Medical Insurance, c3 for Personal Belongings Insurance, c4 for Property Insurance, c5 for Motor Insurance, c6 for Household Insurance, c7 for Pet Insurance, c8 for Travel Insurance, c9 for Landlord Insurance, c10 for PPI (Payment Protection Insurance), c11 for Combined and Other Insurance Types. The output must be a single choice (c1 to c11) and cannot be empty.)\n",
        "3. Compensation amount (if not specified, return Non-pecuniary compensation)\n",
        "4. The reason for the complaint is dissatisfaction with the service quality of the insurance company (if the complaint content includes this reason, output 1, if not included, count as 0)\n",
        "5. The reason for the complaint is that the company provided misleading information during the insurance process (if the complaint content includes this reason, output 1; otherwise 0)\n",
        "6. Complaints are due to poor policy management, including disputes over premiums, renewal issues, and policy cancellations (if the complaint includes this reason, output 1; otherwise 0)\n",
        "7. Complaints are due to delays in payment or processing by the insurance company (if the complaint includes this reason, output 1; otherwise 0)\n",
        "8. Complaints are due to inadequate final compensation by the insurance company (if the complaint includes this reason, output 1; otherwise 0)\n",
        "9. Complaints are due to the insurance company ultimately refusing to pay the claim (if the complaint includes this reason, output 1; otherwise 0)\n",
        "10. Complaints are due to dissatisfaction with non-monetary services in the insurance company's claims process (if the complaint includes this reason, output 1; otherwise 0)\n",
        " If the answers from questions 4 to 10 are all 0, then identify the most similar reason from the causes of questions 4 through 10 and mark the corresponding question's answer as 1.\n",
        f"Text: {text}\n\n",
        "Provide the information in the following format:\n",
        '{"Gender": "", "Category": "", "Compensation": "", "R1(Service quality)": "", "R2(Misleading information)": "", "R3(Insurance policy management issues)": "", "R4(Delayed payment or processing)": "", "R5(Inadequate compensation)": "", "R6(Claim denial)": "", "R7(Non-monetary services in claims)": ""}',
    ]
    user_message = {"role": "user", "content": "".join(prompt_parts)}

    # Query the model; temperature and top_p are both 0
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        max_tokens=300,
        temperature=0,
        top_p=0,
    )

    # Hand back the raw text of the first returned choice
    first_choice = completion.choices[0]
    return first_choice.message['content']

In [None]:
# Initialize an empty DataFrame to store final results
df_final = pd.DataFrame()

# Set the current date for metadata extraction
# Note: Adjust the anchor date backward by one day to avoid duplication
current_date = datetime.datetime.strptime("2024-07-08", "%Y-%m-%d")

# Initialize a list to record files with errors during processing
error_files = []

# Begin the loop (currently set to iterate once for testing)
# Adjust the range as needed
for i in range(12*2+8):
    print(f'Round {i}')
    
    ## Step 1: Scrape metadata
    # Calculate the date 'i' iterations before the current date
    date = current_date - timedelta(days=10*i)
    # Convert the date to a string format suitable for the scrape command
    date_str = date.strftime('%Y-%m-%d')
    
    # Run the scraping command to get metadata up to the calculated date
    # Note: Replace "your_script_path" and "your_target_directory" with actual paths
    !python your_script_path/scrape.py get-metadata --industry-sector "payment-protection-insurance,insurance" --to {date_str}
    
    # Check if the metadata file was successfully created and is not empty
    metadata_file_path = 'your_target_directory/metadata.csv'
    if not os.path.exists(metadata_file_path):
        print("metadata.csv file doesn't exist, reentering the loop")
        continue
    
    # Load the metadata CSV into a DataFrame and select relevant columns
    df_metadata = pd.read_csv(metadata_file_path)[['decision_id', 'date', 'company', 'decision']]
    
    # Map the 'decision' column to binary values: 'Upheld' = 1, 'Not upheld' = 0
    df_metadata['decision'] = df_metadata['decision'].map({'Upheld': 1, 'Not upheld': 0})
    
    # If metadata is empty, skip to the next iteration
    if df_metadata.empty:
        print("metadata.csv file is empty, reentering the loop")
        continue
    
    ## Step 2: Extract Case IDs and related text content
    # Run a command to download decision texts associated with the metadata
    %run your_script_path/scrape.py download-decisions
    
    # Process the downloaded PDFs and extract text content
    pdf_folder_path = "your_target_directory/decisions"
    df_texts, errors = process_pdf_files(pdf_folder_path)
    
    # Log any errors encountered during the text extraction
    if errors:
        print(f"Errors in round {i}: {errors}")
        error_files.append(f'df_texts_round_{i+1}.csv')
    
    ## Step 3: Extract structured information from text using an API
    # Initialize a list to store JSON strings returned by the API
    json_strings = []

    # Process each text content in the extracted texts DataFrame
    for text_content in df_texts['background']:
        if text_content:  # Ensure the background text is not empty
            extracted_info = extract_information(text_content)
        else:
            extracted_info = None

        json_strings.append(extracted_info)

    # Initialize lists to store extracted attributes
    genders = []
    categories = []
    compensations = []
    r1 = []
    r2 = []
    r3 = []
    r4 = []
    r5 = []
    r6 = []
    r7 = []

    # Process each JSON string to extract and store individual data points
    for json_str in json_strings:
        if not json_str:
            # If json_str is empty, fill with default values (None or 0)
            genders.append(None)
            categories.append(None)
            compensations.append(None)
            r1.append(0)
            r2.append(0)
            r3.append(0)
            r4.append(0)
            r5.append(0)
            r6.append(0)
            r7.append(0)
        else:
            try:
                # Load the JSON string and extract the data
                data = json.loads(json_str)
                genders.append(data.get("Gender", None))
                categories.append(data.get("Category", None))
                compensations.append(data.get("Compensation", None))
                r1.append(data.get("R1(Service quality)", 0))
                r2.append(data.get("R2(Misleading information)", 0))
                r3.append(data.get("R3(Insurance policy management issues)", 0))
                r4.append(data.get("R4(Delayed payment or processing)", 0))
                r5.append(data.get("R5(Inadequate compensation)", 0))
                r6.append(data.get("R6(Claim denial)", 0))
                r7.append(data.get("R7(Non-monetary services in claims)", 0))
            except json.JSONDecodeError as e:
                print(f"Invalid JSON string: {json_str}")
                # Handle invalid JSON by filling with default values
                genders.append(None)
                categories.append(None)
                compensations.append(None)
                r1.append(0)
                r2.append(0)
                r3.append(0)
                r4.append(0)
                r5.append(0)
                r6.append(0)
                r7.append(0)

    # Create a DataFrame to store the extracted information
    df_extracted = pd.DataFrame({
        'location': df_texts['location'],
        'gender': genders,
        'category': categories,
        'compensation': compensations,
        'reason_1': r1,
        'reason_2': r2,
        'reason_3': r3,
        'reason_4': r4,
        'reason_5': r5,
        'reason_6': r6,
        'reason_7': r7,
        'background': df_texts['background']
    })
    
    ## Step 4: Merge metadata with the extracted features
    df_temp = pd.merge(df_metadata, df_extracted, how='left', on='location')
    df_final = pd.concat([df_final, df_temp], axis=0)
    
    # Save the combined DataFrame to a CSV file
    df_final.to_csv('your_target_directory/df_final.csv', index=False)
    
    # Save each iteration's data to a separate CSV file
    output_file_name = f'your_target_directory/df_texts_round_{i+1}.csv'
    df_temp.to_csv(output_file_name, index=False)
    print(f"Saved {output_file_name}")

    ## Step 5: Clean up temporary files to save space
    # Check if the folder containing the decision texts exists
    folder_path = "your_target_directory/decisions"
    if os.path.exists(folder_path):
        # If it exists, delete the folder and its contents
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")
    else:
        print(f"Folder '{folder_path}' does not exist.")   

    # Delete the metadata CSV file after processing
    if os.path.exists(metadata_file_path):
        os.remove(metadata_file_path)
        print(f"File '{metadata_file_path}' has been deleted.")
    else:
        print(f"File '{metadata_file_path}' does not exist.")
        
# Print all files that encountered errors during processing
if error_files:
    print("The following rounds had errors:")
    for file in error_files:
        print(file)
else:
    print("No errors occurred during processing.")