### The QED_data.txt, is a compiled version of all the collected datafrom the given dataset 
* It contains the compiled version of QED-2-to-2-diag-TreeLevel-{0 - 9}.txt 
* The Dataset will be converted into the csv format 
* the CSV format will consists of two attributes named, text and label 
* text attribute is the attribute, Where it consists information about the Particle Interaction, to Vertex to Amplitude 
* The Label column is the Squared Amplitude that is the prediction task 

In [2]:
input_file = '../QED_data/QED_data.txt'
output_file = '../QED_data/processed_dataset.csv'

In [3]:
pattern = r'Interaction:(.*?)\s*:\s*.*:\s*(.*?)\s*:\s*(.*)'

data = []
current_block = ""

In [4]:
import re
import pandas as pd

# Initialize list to store data
data = []
  # Replace with your file path

# Read the file and process each line
with open(input_file, 'r') as file:
    for line in file:
        # Remove leading/trailing whitespace
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # Find the last colon in the line to separate text and label
        last_colon_index = line.rfind(':')
        if last_colon_index == -1:  # No colon found, skip the line
            print(f"Skipping line (no colon found): {line}")
            continue

        # Split the line into text (before last colon) and label (after last colon)
        input_text = line[:last_colon_index].strip()
        squared_amplitude = line[last_colon_index + 1:].strip()

        # Cleaning steps for input_text
        # 1. Remove irrelevant terms
        input_text = re.sub(r'\bInteraction\b', '', input_text)
        input_text = re.sub(r'\bOffShell\b', '', input_text)
        input_text = re.sub(r'\bVertex\b', '', input_text)

        # 2. Preserve state names and replace numeric IDs
        input_text = re.sub(r'(e|c|u|X)_(\w+)_\d+', r'\1_\2_[STATE_ID]', input_text)

        # 3. Handle gamma terms (e.g., gamma_{+%\sigma_8689,%gam_11966,%eps_1132})
        # Replace each %state_numeric with %state_[STATE_ID]
        input_text = re.sub(r'%\w+_(\d+)', r'%\g<0>_[STATE_ID]', input_text)
        input_text = re.sub(r'%(\w+)_\d+_[STATE_ID]', r'%\1_[STATE_ID]', input_text)

        # 4. Clean tensor terms (e.g., e_{k_919,%del_8324} → e_{k_[STATE_ID],%del_[STATE_ID]})
        input_text = re.sub(r'(\w+)_{(\w+)_(\d+),(%\w+_\d+)}', r'\1_{\2_[STATE_ID],\4_[STATE_ID]}', input_text)
        input_text = re.sub(r'%\w+_(\d+)', r'%\g<0>_[STATE_ID]', input_text)
        input_text = re.sub(r'%(\w+)_\d+_[STATE_ID]', r'%\1_[STATE_ID]', input_text)

        # 5. Clean parentheses and conjugates
        input_text = re.sub(r'\((.*?)\)', '(X)', input_text)
        input_text = re.sub(r'\^\((.*?)\)', '^(*)', input_text)

        # 6. Fix complex function calls
        input_text = re.sub(r'u_\(\*\)', 'u_(*)', input_text)
        input_text = re.sub(r'v_\(\*\)', 'v_(*)', input_text)

        # 7. Remove colons (including double colons) between terms
        input_text = re.sub(r':+', ' ', input_text)

        # 8. Remove trailing quotes and extra commas
        input_text = re.sub(r'["\']', '', input_text)  # Remove quotes
        input_text = re.sub(r',+', ',', input_text)
        input_text = re.sub(r',\s*', ' ', input_text)  # Replace ", " with " "
        input_text = re.sub(r'\s+', ' ', input_text).strip()
        input_text = re.sub(r'^,+|,+$', '', input_text)

        # Cleaning for squared_amplitude (remove quotes, normalize spaces)
        squared_amplitude = re.sub(r'["\']', '', squared_amplitude)
        squared_amplitude = re.sub(r'\s+', ' ', squared_amplitude).strip()

        # Append cleaned data
        data.append({
            'text': input_text,
            'label': squared_amplitude
        })

# Convert to DataFrame and save as CSV
df = pd.DataFrame(data)
output_csv_path = "some_random_run.csv"  # Replace with desired path
df.to_csv(output_csv_path, index=False)
print(f"Preprocessed data saved to {output_csv_path}")

# Debugging: Print first 5 entries
print("First 5 data entries:")
for entry in data[:5]:
    print(entry)

# Step 2: Calculate unique tokens
# Combine all text from both columns
all_texts = df['text'].astype(str).tolist() + df['label'].astype(str).tolist()

# Tokenize using whitespace and count unique tokens
all_tokens = set()
for text in all_texts:
    tokens = text.split()
    all_tokens.update(tokens)

# Print the results
print(f"\nNumber of unique tokens in the dataset: {len(all_tokens)}")
print(f"Sample tokens: {list(all_tokens)[:20]}")

Preprocessed data saved to some_random_run.csv
First 5 data entries:
{'text': 'e_gam_[STATE_ID](X)^(*) e_del_[STATE_ID](X)^(*) to e_eps_[STATE_ID](X) e_eta_[STATE_ID](X) V_1 e(X) e(X) A(X) V_0 e(X) e(X) A(X) -1/2*i*e^2*gamma_{+%\\sigma_165 %%%gam_145_[STATE_ID]_[STATE_ID] %%%gam_146_[STATE_ID]_[STATE_ID]}*gamma_{%\\sigma_165 %%%gam_147_[STATE_ID]_[STATE_ID] %%%del_137_[STATE_ID]_[STATE_ID]}*e_{i_3 %%%gam_146_[STATE_ID]_[STATE_ID]}(X)_u*e_{k_3 %%%del_137_[STATE_ID]_[STATE_ID]}(X)_u*e_{l_3 %%%gam_145_[STATE_ID]_[STATE_ID]}(X)_u^(*)*e_{i_5 %%%gam_147_[STATE_ID]_[STATE_ID]}(X)_u^(*)/(X)', 'label': '2*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23 + -1/2*m_e^2*s_24 + 1/2*s_12*s_34)*(m_e^2 + -s_13 + 1/2*reg_prop)^(-2)'}
{'text': 'e_gam_[STATE_ID](X)^(*) e_del_[STATE_ID](X)^(*) to e_eps_[STATE_ID](X) e_eta_[STATE_ID](X) V_0 e(X) e(X) A(X) V_1 e(X) e(X) A(X) 1/2*i*e^2*gamma_{+%\\sigma_172 %%%gam_162_[STATE_ID]_[STATE_ID] %%%del_144_[STATE_ID]_[STATE_ID]}*gamma_{%\\sigma_172 %%%gam_163_[STATE_ID]

In [104]:
import pandas as pd 
data = pd.DataFrame(data , columns= ['text' , 'label']) 
data.to_csv(output_file , index = False)
print('Data saved to ' , output_file)

Data saved to  ../QED_data/processed_dataset.csv
