In [19]:
import sys
import os
import pandas as pd
import re 
from langchain_community.document_loaders import PyMuPDFLoader

# Dynamically resolve the base directory for Jupyter notebooks
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '../F1-regulation-RAG-2025'))
sys.path.append(BASE_DIR)

In [20]:
# Ensure the data directory path is resolved correctly
processed_data_dir = os.path.join(BASE_DIR, 'data', 'processed')
os.makedirs(processed_data_dir, exist_ok=True)
processed_data_dir 

'/Users/pierre.muller/Documents/Perso/F1-regulation-RAG-2025/data/processed'

In [21]:
!ls {BASE_DIR}

README.md          [34mexamples[m[m           pyproject.toml
[34mdata[m[m               main.py            [34msrc[m[m
docker-compose.yml [34mollama[m[m             uv.lock


In [22]:
def remove_footer(text: str) -> str:
    """
    Removes the footer from the given text.

    Args:
        text (str): The text containing the footer.

    Returns:
        str: The text with the footer removed.
    """
    footer_pattern = r"2025 Formula 1 Sporting Regulations\s+\d+/\d+\s+30 April 2025\s+©2025 Fédération Internationale de l’Automobile\s+Issue 5"
    cleaned_text = re.sub(footer_pattern, '', text)
    return cleaned_text

In [23]:
def extract_pdf_text_to_dataframe(pdf_path: str) -> pd.DataFrame:
    """
    Extracts text from a PDF file and returns it in a pandas DataFrame.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted text, with each page as a row.
    """
    try:
        
        # Load the document
        loader = PyMuPDFLoader(pdf_path,extract_tables='markdown')
        documents = loader.load_and_split()

        pages_text = [remove_footer(doc.page_content) for doc in documents]

        df = pd.DataFrame({'Page': range(1, len(pages_text) + 1), 'Text': pages_text})
        return df


    except Exception as e:
        raise RuntimeError(f"Error processing PDF file: {e}")



In [24]:
pdf_path = os.path.join(BASE_DIR, 'data', 'pdfs', 'FIA 2025 Formula 1 Sporting Regulations - Issue 5 - 2025-04-30.pdf')
df = extract_pdf_text_to_dataframe(pdf_path)
df.to_csv(os.path.join(processed_data_dir, 'pdf_text.csv'), index=False)
print(df['Text'][1])

 
 
1) 
REGULATIONS 
1.1 
The FIA will organise the FIA Formula One World Championship (the Championship) which is the 
property of the FIA and comprises two titles of World Champion, one for drivers and one for 
constructors. It consists of the Formula One Grand Prix races which are included in the Formula 
One calendar and in respect of which the ASNs and organisers have signed organisation 
agreements with the FIA. All the participating parties (FIA, ASNs, organisers, Competitors and 
circuits) undertake to apply as well as observe the rules governing the Championship and must 
hold the appropriate FIA Licences which are issued to drivers, Competitors, officials, organisers 
and circuits. 
1.2 
The final text of these Sporting Regulations shall be the English version which will be used should 
any dispute arise as to their interpretation. Headings in this document are for ease of reference 
only and do not form part of these Sporting Regulations. 
1.3 
These Sporting Regulations app

In [None]:
def extract_sections_to_dataframe(full_text: str) -> pd.DataFrame:
    """
    Extracts sections from the text based on patterns like 'NUMBER) A TITLE'
    and returns a DataFrame with 'title' and 'chunk' columns.

    This function processes the entire document at once to preserve text continuity
    across page breaks.

    Args:
        full_text (str): The complete text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    # Updated regex to handle cases like "VIRTUAL SAFETY CAR (VSC)"
    # and exclude titles with letters before the number
    section_pattern = r"(?<![A-Z])(\d+\)\s+(?:[A-Z ,\n\-\(\)É]+))\n"
    matches = re.split(section_pattern, full_text)

    titles = []
    chunks = []

    # First element is text before first section (if any)
    for i in range(1, len(matches), 2):
        titles.append(matches[i].replace('\n', ' ').strip())  # Clean multi-line titles
        chunks.append(matches[i + 1].strip())

    processed_df = pd.DataFrame({'title': titles, 'chunk': chunks})
    processed_df[['title', 'chunk']] = processed_df[['title', 'chunk']].map(lambda x: x.replace('\n', ' ').strip())
    
    # Remove in the last chunk any text after "APPENDIX" to avoid overlap with appendix extraction
    processed_df['chunk'].iloc[-1] = re.sub(r'APPENDIX.*$', '', processed_df['chunk'].iloc[ -1], flags=re.DOTALL).strip()

    return processed_df

# Concatenate all pages into a single text before processing
full_text = '\n'.join(df['Text'])
df_sections = extract_sections_to_dataframe(full_text)
df_sections.to_csv(os.path.join(processed_data_dir, 'pdf_sections.csv'), index=False)
df_sections

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  processed_df['chunk'].iloc[-1] = re.sub(r'APPENDIX.*$', '', processed_df['chunk'].iloc[ -1]).strip()


Unnamed: 0,title,chunk
0,1) REGULATIONS,1.1 The FIA will organise the FIA Formula One...
1,2) GENERAL UNDERTAKING,"2.1 All drivers, Competitors and officials pa..."
2,3) GENERAL CONDITIONS,3.1 It is the Competitor’s responsibility to ...
3,4) LICENCES,"4.1 All drivers, Competitors and officials pa..."
4,5) CHAMPIONSHIP COMPETITIONS,5.1 Competitions are reserved for Formula One...
...,...,...
60,60) POST SPRINT AND POST RACE PARC FERMÉ,60.1 Only those officials charged with superv...
61,61) SPRINT SESSION CLASSIFICATION,61.1 The car placed first will be the one hav...
62,62) RACE CLASSIFICATION,62.1 The car placed first will be the one hav...
63,63) PODIUM CEREMONY AND POST RACE PRESS CONFE...,"63.1 The drivers finishing the race in 1st, 2..."


In [34]:
df_sections['chunk'][64]

'64.1  All equipment used to cool the car on the Grid using forced air flow (or any other gaseous flow)  must only be powered by electricity.'

In [40]:
df.iloc[-1]['Text']

' \n \nAPPENDIX 9 \n \nAPPROVED CHANGES FOR SUBSEQUENT YEARS \n \nConvention: \nDark red text: All changes previously agreed for subsequent years \nDark red highlighted (yellow) text: new changes for subsequent years, approved by the WMSC \n \n1 \nChanges for 2026'

In [141]:
match = re.search(r"APPENDIX[\s\n]+\d+", df.iloc[-1]['Text'])
if match:
    print("Match found:", match.group())
else:
    print("No match found.")

Match found: APPENDIX 9


In [119]:
# increase pandas display max rows 100
pd.set_option('display.max_rows', 100)
len(df_sections)

64

# Appendices must be added

In [37]:
def extract_appendices_to_dataframe(text: str) -> pd.DataFrame:
    """
    Extracts appendices from the text based on patterns like 'APPENDIX <number>'
    and returns a DataFrame with 'title' and 'chunk' columns.

    Args:
        text (str): The text to process.

    Returns:
        pd.DataFrame: A DataFrame with 'title' and 'chunk' columns.
    """
    section_pattern = r"(APPENDIX[\s\n]+\d+)"
    matches = list(re.finditer(section_pattern, text))
    print(matches)
    titles = []
    chunks = []

    if not matches:
        return pd.DataFrame({'title': [], 'chunk': []})

    for i, match in enumerate(matches):
        end = match.end()
        title = match.group(1).replace('\n', ' ').strip()
        if i + 1 < len(matches):
            chunk = text[end:matches[i + 1].start()].strip()
        else:
            chunk = text[end:].strip()
        titles.append(title)
        chunks.append(chunk)

    processed_df = pd.DataFrame({'title': titles, 'chunk': chunks})
    processed_df[['title', 'chunk']] = processed_df[['title', 'chunk']].map(lambda x: x.replace('\n', ' ').strip())
    print(processed_df)
    return processed_df

# Process the extracted text into appendices
summary_page = 1
df_appendices = pd.concat([extract_appendices_to_dataframe(df['Text'])], ignore_index=True)
df_appendices.to_csv(os.path.join(processed_data_dir, 'pdf_appendices.csv'), index=False)
print(df_appendices)

TypeError: expected string or bytes-like object, got 'Series'

In [34]:
for value in df_appendices["chunk"].values:
    print(value)

INFORMATION REQUIRED BY THE FIA 90 DAYS BEFORE A COMPETITION    PART A.    1.  NAME AND ADDRESS OF THE NATIONAL SPORTING AUTHORITY (ASN).  2.  NAME AND ADDRESS OF THE ORGANISER.  3.  DATE AND PLACE OF THE COMPETITION.  4.  START TIME OF THE RACE (AS AGREED WITH THE PERMANENT BUREAU OF THE FIA F1 COMMISSION).  5.  ADDRESS AND TELEPHONE, FAX AND TELEX NUMBERS TO WHICH ENQUIRIES CAN BE ADDRESSED.  6.  DETAILS OF THE CIRCUIT, WHICH MUST INCLUDE:    -  LOCATION AND HOW TO GAIN ACCESS.    -  LENGTH OF ONE LAP.    -  NUMBER OF LAPS FOR RACE.    -  DIRECTION (CLOCKWISE OR ANTI-CLOCKWISE).    -  LOCATION OF END OF THE PIT LANE IN RELATION TO LINE.  7.  PRECISE LOCATION AT THE CIRCUIT OF:    -  STEWARDS’ OFFICE.    -  RACE DIRECTOR’S OFFICE.    -  FIA OFFICE.    -  PARC FERMÉ.    -  DRIVERS’ AND COMPETITORS’ BRIEFING.    -  WINNER’S PRESS CONFERENCE.  8.  LIST OF ANY TROPHIES AND SPECIAL AWARDS.  9.  THE NAMES OF THE FOLLOWING OFFICIALS OF THE COMPETITION APPOINTED BY THE ASN:    -  STEWARDS.   

In [36]:
df_appendices.iloc[7]['chunk']

'POWER UNIT TEST BENCH RESTRICTIONS    1   Engine dyno test limitations  a)   Definitions:  - Engine Test Bench: An Engine Test Bench is either a Power Unit Test Bench, a Power Train Test  Bench or a Full Car Dyno.   - Power Unit Test Bench: A test bench facility cell where a fired engine with more than 1 cylinder  representative of a Formula One engine may be tested. In addition to test bench components,  it may include and is limited to the following power unit and car components:  o Items listed in Technical Regulations Appendix 3 column 1 (art 5.1.2 Defin) listed as ‘INC’.  o The clutch, flywheel, and clutch actuation system.  o Fuel, engine oil and PU related liquids other than fuel and engine oil.  o Heat exchangers and their associated accessories. (including but not limited to housings,  tubes, pipes, hoses, supports, brackets and fasteners).  o The PU intake upstream of compressor inlet up to and including the air filter.  o The FIA Standard ECU.  o The regulatory fuel flow me

### Token embeddings

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")


In [122]:
df = pd.read_csv(os.path.join(processed_data_dir, 'pdf_sections.csv'))

In [123]:
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0,title,chunk
0,1) REGULATIONS,1.1 The FIA will organise the FIA Formula One...
1,2) GENERAL UNDERTAKING,"2.1 All drivers, Competitors and officials pa..."
2,3) GENERAL CONDITIONS,3.1 It is the Competitor’s responsibility to ...
3,4) LICENCES,"4.1 All drivers, Competitors and officials pa..."
4,5) CHAMPIONSHIP COMPETITIONS,5.1 Competitions are reserved for Formula One...


In [124]:
df.loc[df.isna().sum(axis=1) > 0]

Unnamed: 0,title,chunk
28,29) VOID,
40,41) VOID,


In [125]:
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
embeddings = model.encode(df['chunk'].tolist(), show_progress_bar=True, device='mps')

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [126]:
embeddings

array([[ 0.03867137, -0.00604724, -0.00682677, ..., -0.03376015,
        -0.00947953, -0.00209979],
       [ 0.0311457 , -0.03375433, -0.00729547, ..., -0.04132152,
        -0.00200754, -0.02746743],
       [ 0.05352963, -0.03765709, -0.00557554, ..., -0.01629589,
        -0.02041579,  0.00070395],
       ...,
       [ 0.00495611, -0.02649595, -0.00973289, ..., -0.01721682,
         0.00948874, -0.04745426],
       [ 0.04810619, -0.0259371 , -0.00585549, ...,  0.02249218,
         0.00224992, -0.00090425],
       [ 0.03289371,  0.01738825, -0.00715252, ..., -0.04115681,
        -0.03080961, -0.00738988]], shape=(64, 1024), dtype=float32)

In [130]:
model.similarity(model.encode("What is the VSC?"), embeddings)

tensor([[0.1843, 0.1749, 0.2323, 0.2085, 0.2418, 0.2401, 0.3081, 0.1699, 0.1859,
         0.2654, 0.2676, 0.2892, 0.1821, 0.2115, 0.2099, 0.2381, 0.2024, 0.1846,
         0.1996, 0.2168, 0.2357, 0.2096, 0.1788, 0.3229, 0.2023, 0.2564, 0.1876,
         0.2377, 0.5478, 0.1695, 0.2415, 0.1461, 0.2723, 0.1519, 0.1893, 0.2173,
         0.2170, 0.2490, 0.2368, 0.2996, 0.5478, 0.1999, 0.3463, 0.1779, 0.2154,
         0.2779, 0.1876, 0.2692, 0.2043, 0.2816, 0.3420, 0.2849, 0.2743, 0.2674,
         0.2068, 0.5777, 0.2131, 0.2295, 0.2824, 0.2243, 0.2554, 0.2894, 0.1919,
         0.3097]])

In [131]:
model.similarity(model.encode("What is the VSC?"), embeddings).argmax()

tensor(55)

In [132]:
df['chunk'][55]

'56.1  The VSC procedure may be initiated to neutralise a practice session, sprint session or a race upon  the order of the clerk of the course.  a)  It will normally be used when double waved yellow flags are needed on any section of  track and Competitors or officials may be in danger, but the circumstances are not such  as to warrant use of the safety car itself.  56.2  When the order is given to initiate the VSC procedure a message “VSC DEPLOYED” will be sent  to all Competitors using the official messaging system and all FIA light panels will display “VSC”.  56.3  No car may be driven unnecessarily slowly, erratically or in a manner which could be deemed  potentially dangerous to other drivers or any other person at any time whilst the VSC procedure  is in use. This will apply whether any such car is being driven on the track, the pit entry road, or  the pit lane.  56.4  When initiated during a sprint session or a race, no car may enter the pits whilst the VSC  procedure is in use

### Token Length

In [36]:
df = pd.read_csv('data/processed/pdf_sections.csv')
df
df['len'] = df['chunk'].apply(lambda x: len(str(x).split()))
df.sort_values(by='len', ascending=False).head(10)


Unnamed: 0,title,chunk,len
10,10) TRACK RUNNING TIME OUTSIDE A COMPETITION,10.1 Testing of Current Cars (TCC) a) Testi...,4357
30,30) SUPPLY OF TYRES IN THE CHAMPIONSHIP AND T...,30.1 Supply of tyres a) The single tyre man...,3712
58,58) RESUMING A SPRINT SESSION OR A RACE,58.1 The delay will be kept as short as possi...,2308
40,40) PRE-SPRINT AND PRE-RACE PARC FERME,40.1 Each Competitor must provide the Technic...,1910
55,55) SAFETY CAR,55.1 The FIA safety car will be driven by an ...,1712
19,"19) PRESS CONFERENCES, MEDIA OPPORTUNITIES, D...",19.1 Day before on track running a) Up to ...,1701
26,26) GENERAL SAFETY,26.1 Official instructions will be given to d...,1604
44,44) RACE STARTING PROCEDURE,44.1 Forty (40) minutes before the scheduled ...,1525
43,43) SPRINT SESSION STARTING PROCEDURE,43.1 The sprint session start procedure detai...,1470
34,"34) PIT ENTRY ROAD, PIT LANE AND PIT EXIT ROAD",34.1 Unless otherwise defined by the Race Dir...,1312


In [18]:
df['chunk'][64]

'64.1  All equipment used to cool the car on the Grid using forced air flow (or any other gaseous flow)  must only be powered by electricity.     APPENDIX 1    INFORMATION REQUIRED BY THE FIA 90 DAYS BEFORE A COMPETITION    PART A.    1.  NAME AND ADDRESS OF THE NATIONAL SPORTING AUTHORITY (ASN).  2.  NAME AND ADDRESS OF THE ORGANISER.  3.  DATE AND PLACE OF THE COMPETITION.  4.  START TIME OF THE RACE (AS AGREED WITH THE PERMANENT BUREAU OF THE FIA F1 COMMISSION).  5.  ADDRESS AND TELEPHONE, FAX AND TELEX NUMBERS TO WHICH ENQUIRIES CAN BE ADDRESSED.  6.  DETAILS OF THE CIRCUIT, WHICH MUST INCLUDE:    -  LOCATION AND HOW TO GAIN ACCESS.    -  LENGTH OF ONE LAP.    -  NUMBER OF LAPS FOR RACE.    -  DIRECTION (CLOCKWISE OR ANTI-CLOCKWISE).    -  LOCATION OF END OF THE PIT LANE IN RELATION TO LINE.  7.  PRECISE LOCATION AT THE CIRCUIT OF:    -  STEWARDS’ OFFICE.    -  RACE DIRECTOR’S OFFICE.    -  FIA OFFICE.    -  PARC FERMÉ.    -  DRIVERS’ AND COMPETITORS’ BRIEFING.    -  WINNER’S PRESS

In [37]:
df = pd.read_csv('data/processed/pdf_appendices.csv')
df
df['len'] = df['chunk'].apply(lambda x: len(str(x).split()))
df.sort_values(by='len', ascending=False).head(10)


Unnamed: 0,title,chunk,len
6,APPENDIX 7,AERODYNAMIC TESTING RESTRICTIONS (ATR) The ...,6753
1,APPENDIX 2,FEDERATION INTERNATIONALE DE L’AUTOMOBILE 202...,6041
7,APPENDIX 8 POWER UNIT TEST BENCH RESTRICTIONS,1 Engine dyno test limitations a) Definit...,3671
5,APPENDIX 6,SUPPLY OF POWER UNITS FOR THE 2021-2025 CHAMPI...,2960
4,APPENDIX 5 PODIUM CEREMONY,At each Event the procedure for the Podium Cer...,972
0,APPENDIX 1,INFORMATION REQUIRED BY THE FIA 90 DAYS BEFORE...,243
3,APPENDIX 4,ENTRY FEES FOR THE 2025 FIA FORMULA ONE WORLD ...,112
8,APPENDIX 9 APPROVED CHANGES FOR SUBSEQUENT ...,Convention: Dark red text: All changes previo...,29
2,APPENDIX 3 REGULATIONS OF THE DRIVER CONTRA...,"(""Reserved for the exclusive use of Competitor...",15
