In [12]:
# ----- Jupyter notebook display settings -----
# Inject a CSS rule into the page so that elements with the `.container` class
# are forced to 100% width.
# NOTE(review): presumably this targets the classic Notebook page container to
# widen the working area; confirm it has any effect in JupyterLab.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [15]:
import re
import pandas as pd
import os

In [19]:
def clean_filename(filename: str) -> str:
    """Turn a PDF filename into a human-readable title.

    Steps, in order:
      1. Strip a trailing ``.pdf`` extension (case-insensitive).
      2. Replace underscores with spaces.
      3. Replace dashes with spaces, except the dash inside a two-digit
         number range such as ``22-23`` (both numbers must be standalone
         two-digit tokens, i.e. delimited by word boundaries).
      4. Collapse runs of whitespace into one space and trim both ends.

    Args:
        filename: The filename to clean, e.g. ``"V2GB-Public-Report.pdf"``.

    Returns:
        The cleaned title, e.g. ``"V2GB Public Report"``.
    """
    # Remove the file extension first
    title = re.sub(r'\.pdf$', '', filename, flags=re.IGNORECASE)

    # Underscores go before range detection: "_" is a word character, so a
    # range written as "_22-23_" would not satisfy the \b anchors below.
    title = title.replace("_", " ")

    # Single pass over the string: a "NN-NN" range (first alternative) is kept
    # verbatim, any other dash (second alternative) becomes a space. Deciding
    # per match position avoids a textual placeholder, which could protect the
    # wrong dash (e.g. the "22-23" inside "122-23") or collide with a filename
    # that already contains the placeholder text.
    title = re.sub(
        r'(\b\d{2}-\d{2}\b)|-',
        lambda match: match.group(1) or ' ',
        title,
    )

    # Clean up extra spaces that might have been introduced
    return re.sub(r'\s+', ' ', title).strip()

# Sample report filenames used to sanity-check clean_filename
filenames = [
    "Cenex-WP-2-True-Value-of-V2G-Report.pdf",
    "V2GB-Public-Report.pdf",
    "ESC-_V2GB_D1.3-Long-term-estimates-of-size-of-V2G-market_Final.pdf",
    "ESC_V2GB_WP1_D1.2_Long-term-estimates-of-V2G-opportunities_Final.pdf",
    "V2GB-D1.1-Key-drivers-and-dependencies-for-V2G_Final.pdf",
    "V2GB_WP-4-report-Requirements-for-market-scale-up.pdf",
    "How-to-increase-consumer-confidence-in-gas-boiler-alternatives-1.pdf",
]

# Derive a title for every sample, keeping the same order as `filenames`
cleaned_filenames = list(map(clean_filename, filenames))

# Print each original filename next to the title derived from it
for original, cleaned in zip(filenames, cleaned_filenames):
    print(f"Filename/Original: {original}\nTitle/Cleaned: {cleaned}\n")


Filename/Original: Cenex-WP-2-True-Value-of-V2G-Report.pdf
Title/Cleaned: Cenex WP 2 True Value of V2G Report

Filename/Original: V2GB-Public-Report.pdf
Title/Cleaned: V2GB Public Report

Filename/Original: ESC-_V2GB_D1.3-Long-term-estimates-of-size-of-V2G-market_Final.pdf
Title/Cleaned: ESC V2GB D1.3 Long term estimates of size of V2G market Final

Filename/Original: ESC_V2GB_WP1_D1.2_Long-term-estimates-of-V2G-opportunities_Final.pdf
Title/Cleaned: ESC V2GB WP1 D1.2 Long term estimates of V2G opportunities Final

Filename/Original: V2GB-D1.1-Key-drivers-and-dependencies-for-V2G_Final.pdf
Title/Cleaned: V2GB D1.1 Key drivers and dependencies for V2G Final

Filename/Original: V2GB_WP-4-report-Requirements-for-market-scale-up.pdf
Title/Cleaned: V2GB WP 4 report Requirements for market scale up

Filename/Original: How-to-increase-consumer-confidence-in-gas-boiler-alternatives-1.pdf
Title/Cleaned: How to increase consumer confidence in gas boiler alternatives 1



In [17]:
def update_pdf_titles_in_csv(csv_file_path):
    """Fill the ``pdf_title`` column of a CSV from its ``pdf_filename`` column.

    The CSV is read into a DataFrame, every non-missing ``pdf_filename`` is
    passed through ``clean_filename`` and stored in ``pdf_title`` (created or
    overwritten), and the result is written back to the SAME file without
    the index column.

    Args:
        csv_file_path: Path to the CSV file to read and overwrite.

    Raises:
        KeyError: If the CSV has no ``pdf_filename`` column.
    """
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file_path)

    # Fail early with an explicit message instead of a bare column-name KeyError
    if 'pdf_filename' not in df.columns:
        raise KeyError(f"'pdf_filename' column not found in {csv_file_path}")

    # read_csv turns empty cells into NaN, which clean_filename cannot process
    # (re.sub needs a string). na_action='ignore' leaves those rows as NaN
    # instead of aborting the whole update.
    df['pdf_title'] = df['pdf_filename'].map(clean_filename, na_action='ignore')

    # Write the updated DataFrame back to the same CSV file
    df.to_csv(csv_file_path, index=False)
    print("CSV file has been updated with curated pdf titles.")

# Resolve the metadata CSV location to an absolute path. The relative path has
# no explicit base directory: os.path.abspath resolves it against the current
# working directory, so the result depends on where the kernel was started.
relative_path = "../../ESC/esc_webpage_report_scraper/website_log/pdf_reports_metadata.csv"
absolute_path = os.path.abspath(relative_path)
# Echo the resolved path so a wrong working directory is easy to spot
print("The absolute path to the file is:", absolute_path)

# Overwrites the CSV at absolute_path with the pdf_title column filled in
update_pdf_titles_in_csv(absolute_path)

The absolute path to the file is: /Users/ciprianifrim/_client-projects/ESC/esc_webpage_report_scraper/website_log/pdf_reports_metadata.csv
CSV file has been updated with curated pdf titles.
