# Note: combined_df_v2.csv is our initial dataset containing high-level planning and low-level reasoning steps

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

path = "combined_df_v2.csv"

df = pd.read_csv(path)

df.head()





In [None]:
df.iloc[0, 3]

In [None]:
import os
print(os.getcwd())

In [None]:
df.describe()

# Rename HLR col as HLS
df = df.rename(columns={'HLR': 'HLS'})
df.head()


In [None]:
df.describe()

In [None]:
import pandas as pd
import torch
from transformers import GPT2TokenizerFast
import ast

def clean_answer(text):
    """Flatten a multi-paragraph answer into one ' ; '-delimited line.

    Blank-line paragraph breaks ('\\n\\n') become a single space, every
    '. Step' boundary becomes ' ; Step', and any trailing run of periods
    and spaces is removed.
    """
    flattened = text.replace('\n\n', ' ').replace('. Step', ' ; Step')
    # rstrip treats its argument as a character set, not a suffix.
    return flattened.rstrip('. ')

# Build one training row per source example. Both output strings use " ; " as
# the fragment delimiter and start with two header pieces ("Question: ..." and
# a section label); the fragment extractor later in this notebook relies on
# that layout when it discards the first two pieces.
new_rows = []
for _, row in df.iterrows():
    test_q = row['question']

    # 'HLS' is expected to hold the string form of a Python list of steps.
    # literal_eval only parses literals; it never executes code.
    hl_steps = ast.literal_eval(row['HLS'])

    test_full_HL = (
        'Question: ' + test_q + " ; " + ' High Level Steps: '
        + " ; " + ' ; '.join(hl_steps) + ' '
    )
    test_full = (
        'Question: ' + test_q + " ; " + ' Answer: '
        + " ; " + clean_answer(row['answer'])
    )

    new_rows.append({"high_level": test_full_HL, "low_level": test_full})




# Output transformed dataset to data/train_data_raw.csv

In [None]:
new_df = pd.DataFrame(new_rows)
new_df.head()

new_df.to_csv('../data/train_data_raw.csv')

# Exploratory Data Analysis

## 1: Load data + tokenizer

In [None]:
import pandas as pd
from transformers import GPT2TokenizerFast
import torch
from tqdm.notebook import tqdm  # Use notebook version for better progress bars
import numpy as np
import os

# Initialize tokenizer (once; it is reused by every later cell)
print("Loading tokenizer...")
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
pad_token_str = tokenizer.pad_token
print(f"Using padding token: '{pad_token_str}'")

# Define path relative to the notebook location
input_csv_path = '../data/train_data_raw.csv'

print(f"Loading data from {input_csv_path}...")
try:
    # Load the raw training data generated by the previous cells/script
    eda_df = pd.read_csv(input_csv_path)
    # Drop rows with missing values just in case
    eda_df.dropna(subset=['high_level', 'low_level'], inplace=True)
    print(f"Loaded {len(eda_df)} rows.")
    print("Data sample:")
    display(eda_df.head())
except FileNotFoundError:
    print(f"Error: Input file not found at {input_csv_path}")
    print("Please ensure the previous cells or the `training_dataset_builder.py` script has run successfully.")
    eda_df = None  # later cells check for None before using the data
except Exception as e:
    # Notebook-level boundary: report the problem and leave eda_df unset
    # rather than aborting the whole run.
    print(f"An error occurred: {e}")
    eda_df = None



## 2: Test tokenizer

In [None]:
print("Tokenizing a sample (first 5 rows)...")
sample_df = eda_df.head().copy()  # copy so the extra columns never touch eda_df

# Text column -> short prefix used for the derived column names.
text_columns = {'high_level': 'hl', 'low_level': 'll'}

# Ensure data is string type
for text_col in text_columns:
    sample_df[text_col] = sample_df[text_col].astype(str)

# Tokenize
for text_col, short in text_columns.items():
    sample_df[f'{short}_tokens'] = sample_df[text_col].apply(lambda x: tokenizer.encode(x))

# Get lengths
for short in text_columns.values():
    sample_df[f'{short}_len'] = sample_df[f'{short}_tokens'].apply(len)

print("\nSample Data with Token Lengths:")
display(sample_df[['high_level', 'hl_len', 'low_level', 'll_len']])

# Show the token ids and their round-tripped text for the rows labelled 0 and 1.
for row_label in (0, 1):
    if row_label:
        print("\n")  # spacer between the two examples
    print(f"\nExample Tokenized Output (Row {row_label}):")
    print("High Level Tokens:", sample_df.loc[row_label, 'hl_tokens'])
    print("Decoded:", tokenizer.decode(sample_df.loc[row_label, 'hl_tokens']))
    print("\nLow Level Tokens:", sample_df.loc[row_label, 'll_tokens'])
    print("Decoded:", tokenizer.decode(sample_df.loc[row_label, 'll_tokens']))



## 3: Split into fragments, get token lengths

### Define fragment extraction function

In [None]:
import pandas as pd
from transformers import GPT2TokenizerFast
import torch
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns



def _split_on_conclusion(frag_text, conclusion_prefix):
    """Split *frag_text* around the first occurrence of *conclusion_prefix*.

    Returns the non-empty pieces in order: the text before the prefix, the
    prefix itself, and the text after it. A fragment that does not contain
    the prefix is returned whole.
    """
    prefix_at = frag_text.find(conclusion_prefix)
    if prefix_at < 0:
        return [frag_text]
    before = frag_text[:prefix_at].strip()
    after = frag_text[prefix_at + len(conclusion_prefix):].strip()
    # 'before' is empty when the fragment starts with the prefix.
    return [piece for piece in (before, conclusion_prefix, after) if piece]


def _collect_fragments(row_index, text, fragment_type, tokenizer, delimiter, conclusion_prefix):
    """Split one HL or LL text into fragments and measure each in tokens."""
    records = []
    if not isinstance(text, str):
        # Missing values (None / NaN) simply contribute no fragments.
        return records

    stripped = [frag.strip() for frag in text.split(delimiter)]
    next_index = 0  # position of the next emitted fragment within this type
    # Slice [2:] discards the "Question: ..." piece and the section header
    # ("High Level Steps:" / "Answer:"). Use [1:] to keep the header.
    for frag_text in stripped[2:]:
        if not frag_text:
            continue
        try:
            for piece in _split_on_conclusion(frag_text, conclusion_prefix):
                token_length = len(tokenizer.encode(piece))
                records.append({
                    'original_index': row_index,
                    'fragment_index_in_row': next_index,
                    'fragment_type': fragment_type,
                    'fragment_text': piece,
                    'token_length': token_length
                })
                next_index += 1
        except Exception as e:
            # Best effort: report the fragment and carry on with the rest.
            print(f"Error tokenizing/processing {fragment_type} fragment at index {row_index}: {e} - Text: {frag_text[:50]}...")
    return records


def extract_tokenize_fragments(row_index, high_level_text, low_level_text, tokenizer, delimiter=" ; "):
    """
    Extracts HL and LL fragments from texts, tokenizes them,
    and returns a list of dictionaries for this row.
    Splits "Conclusion:" prefix into its own fragment.

    Args:
        row_index: Identifier stored as 'original_index' on every fragment.
        high_level_text: Delimited high-level string; non-strings are skipped.
        low_level_text: Delimited low-level string; non-strings are skipped.
        tokenizer: Object with an ``encode(text)`` method returning a sized
            sequence of tokens.
        delimiter: Separator between fragments.

    Returns:
        A list of dicts with keys 'original_index', 'fragment_index_in_row',
        'fragment_type' ('HL' or 'LL'), 'fragment_text' and 'token_length'.
        All HL fragments come first, then all LL fragments; the in-row index
        restarts at 0 for each type.
    """
    conclusion_prefix = "Conclusion:"  # Define the prefix to split

    fragments_data = _collect_fragments(
        row_index, high_level_text, 'HL', tokenizer, delimiter, conclusion_prefix
    )
    fragments_data.extend(_collect_fragments(
        row_index, low_level_text, 'LL', tokenizer, delimiter, conclusion_prefix
    ))
    return fragments_data

### Implement extraction

In [None]:
import pandas as pd
from transformers import GPT2TokenizerFast
import torch
# Use standard tqdm as notebook version caused issues
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns # Ensure installed: pip install seaborn

# --- Assume tokenizer and eda_df (full or subset) are loaded ---

# --- Function Definition (assuming it's defined correctly above) ---
# def extract_tokenize_fragments(row_index, high_level_text, low_level_text, tokenizer, delimiter=" ; "): ...
# --- Verification Step: Process a single example row ---
example_row_index = 5  # Choose an index to inspect (e.g., 5)
example_fragments_df = None  # Stays None unless fragments are extracted below

if eda_df is None or example_row_index >= len(eda_df):
    print(f"Cannot verify example row index {example_row_index}. DataFrame might be empty or index out of bounds.")
else:
    print(f"\n--- Verifying fragment extraction for row index: {example_row_index} ---")
    example_row = eda_df.iloc[example_row_index]  # positional lookup, not by label
    example_hl_text = example_row.get('high_level', 'N/A')
    example_ll_text = example_row.get('low_level', 'N/A')

    # Echo both source strings so the extracted fragments can be checked by eye.
    for heading, body in (
        ("\nOriginal High Level Text:", example_hl_text),
        ("Original Low Level Text:", example_ll_text),
    ):
        print(heading)
        print("```")
        print(body)
        print("```")
        print("-" * 20)

    # Run the extractor on just this one row.
    example_fragments_list = extract_tokenize_fragments(
        example_row_index,
        example_hl_text,
        example_ll_text,
        tokenizer
    )

    print(f"\nExtracted Fragments & Details (Row {example_row_index}):")
    if example_fragments_list:
        example_fragments_df = pd.DataFrame(example_fragments_list)
        print(f"  Stored fragments for row {example_row_index} in `example_fragments_df`.")
        # One summary line per fragment; text is truncated to keep the log readable.
        for frag_data in example_fragments_list:
            print(f"  - Type: {frag_data['fragment_type']:<3} | "
                  f"IndexInRow: {frag_data['fragment_index_in_row']:<2} | "
                  f"Tokens: {frag_data['token_length']:<4} | "
                  f"Text: '{frag_data['fragment_text'][:80]}...'")
    else:
        print("  No fragments extracted for this example row.")
    print("-" * 20)
    print("You can now display `example_fragments_df` manually in the next cell.")

# --- Main Processing (The rest of the cell remains the same) ---
# print(f"\n--- Processing all {len(eda_df)} rows in the current DataFrame ---")
# ... (rest of the code for processing all rows, creating fragments_df, analysis) ...

# --- Main Processing: Get all fragments (using the current eda_df, which might be a subset) ---
# Outcome: fragments_df (one row per fragment) plus max_len_hl_frag and
# max_len_ll_frag; all three are None when nothing could be extracted.
all_fragments_list = []
if eda_df is not None:
    # The row count is reported only after the None check: the load cell sets
    # eda_df to None on failure, and len(None) would raise TypeError.
    print(f"\n--- Processing all {len(eda_df)} rows in the current DataFrame ---")

    # Use tqdm for progress indication
    for index, row in tqdm(eda_df.iterrows(), total=len(eda_df), desc="Processing Rows"):
        row_fragments = extract_tokenize_fragments(
            index,  # DataFrame label, stored as 'original_index'
            row.get('high_level'),  # Use .get for safety
            row.get('low_level'),
            tokenizer
        )
        all_fragments_list.extend(row_fragments)

    if not all_fragments_list:
        print("Error: No fragments were extracted during main processing.")
        fragments_df = None
        max_len_hl_frag = None
        max_len_ll_frag = None
    else:
        # Create the DataFrame where each row is a single fragment
        fragments_df = pd.DataFrame(all_fragments_list)
        print(f"\nCreated DataFrame with {len(fragments_df)} total fragments.")
        print("\n--- Verifying fragments_df content for example row ---")
        example_fragments_in_df = fragments_df[fragments_df['original_index'] == example_row_index]
        if example_fragments_in_df.empty:
            print(f"No fragments found in fragments_df for original_index {example_row_index}")
        else:
            print(f"Fragments stored in DataFrame for original_index {example_row_index}:")
            # Display relevant columns for verification
            display(example_fragments_in_df[['original_index', 'fragment_type', 'fragment_index_in_row', 'token_length', 'fragment_text']])
        print("-" * 20)

        # --- Max Lengths, Visualization, Stats ---
        print("\n--- Calculating Max Lengths and Statistics ---")
        # --- Find Max Lengths by Fragment Type ---
        max_lengths = fragments_df.groupby('fragment_type')['token_length'].max()
        max_len_hl_frag = max_lengths.get('HL', None)
        max_len_ll_frag = max_lengths.get('LL', None)

        if max_len_hl_frag is not None:
            print(f"Max HL Fragment Token Length: {max_len_hl_frag}")
        else:
            print("No HL fragments found.")
        if max_len_ll_frag is not None:
            print(f"Max LL Fragment Token Length: {max_len_ll_frag}")
        else:
            print("No LL fragments found.")

        # --- Visualize Distribution by Type ---
        if max_len_hl_frag is not None or max_len_ll_frag is not None:
            print("\nVisualizing fragment length distributions...")
            plt.figure(figsize=(12, 5))
            if max_len_hl_frag is not None:
                plt.subplot(1, 2, 1)
                sns.histplot(fragments_df[fragments_df['fragment_type'] == 'HL']['token_length'], bins=50)
                plt.title('Distribution of HL Fragment Lengths')
                plt.xlabel('Token Length')
                plt.ylabel('Frequency')
                plt.axvline(max_len_hl_frag, color='r', linestyle='--', label=f'Max HL: {max_len_hl_frag}')
                plt.legend()
            if max_len_ll_frag is not None:
                plt.subplot(1, 2, 2)
                sns.histplot(fragments_df[fragments_df['fragment_type'] == 'LL']['token_length'], bins=50)
                plt.title('Distribution of LL Fragment Lengths')
                plt.xlabel('Token Length')
                plt.axvline(max_len_ll_frag, color='g', linestyle='--', label=f'Max LL: {max_len_ll_frag}')
                plt.legend()
            if max_len_hl_frag is None or max_len_ll_frag is None:
                # Only one fragment type was plotted: narrow the figure.
                plt.subplot(1, 2, 1 if max_len_ll_frag is None else 2).set_ylabel('')
                plt.gcf().set_size_inches(6, 5)
            plt.tight_layout()
            plt.show()

            # Display stats by type
            print("\nFragment Length Stats by Type:")
            print(fragments_df.groupby('fragment_type')['token_length'].describe())
        else:
            print("\nSkipping visualization as no fragments were found.")
else:
    print("Skipping main processing as eda_df is None.")
    fragments_df = None
    max_len_hl_frag = None
    max_len_ll_frag = None


# --- Explanation for Reconstruction/Prepending ---
print("\n--- Note on Reconstructing Sequences ---")
if fragments_df is None:
    print("Cannot provide reconstruction notes as 'fragments_df' was not created.")
else:
    # Plain (non-f) strings: the braces in step 3 are printed literally.
    reconstruction_notes = (
        "The 'fragments_df' DataFrame now contains all necessary information:",
        "  - 'original_index': Links fragment back to its source row in 'eda_df'.",
        "  - 'fragment_type': Identifies fragment as 'HL' (High Level) or 'LL' (Low Level).",
        "  - 'fragment_index_in_row': Indicates the order of the fragment within its type for that row.",
        "  - 'fragment_text': The actual text content of the fragment.",
        "  - 'token_length': The pre-calculated number of GPT2 tokens.",
        "\nThis structure allows you to:",
        "  1. Group by 'original_index'.",
        "  2. Within each group, sort fragments by 'fragment_type' and 'fragment_index_in_row' (or any custom order).",
        "  3. Prepend '[{token_length}]' to 'fragment_text'.",
        "  4. Join the modified fragments to reconstruct sequences.",
    )
    for note_line in reconstruction_notes:
        print(note_line)


# Expose the per-type maxima under the shorter names used by subsequent cells
max_len_hl, max_len_ll = max_len_hl_frag, max_len_ll_frag

In [None]:
fragments_df[fragments_df['original_index'] == 1]

# Create uniform length & length pre-pended datasets

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns # Ensure installed

# --- Configuration ---
# Assume eda_df, fragments_df, tokenizer, pad_token_str are available from previous cells
# Ensure the 'token_length' column in fragments_df is integer type
if 'fragments_df' in locals() and fragments_df is not None:
     fragments_df['token_length'] = fragments_df['token_length'].astype(int)
else:
     print("Error: fragments_df not found. Please run previous cells.")
     # Optional: exit or skip
     fragments_df = None # Set to None to skip downstream steps cleanly

if 'pad_token_str' not in globals() and 'pad_token_str' not in locals():
     # Attempt to define it if tokenizer exists
     if 'tokenizer' in locals() and tokenizer is not None:
         if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
         pad_token_str = tokenizer.pad_token
         print("Warning: 'pad_token_str' was not found, defined it from tokenizer.")
     else:
        raise NameError("Execution Error: 'pad_token_str' and 'tokenizer' are not defined. Please run previous cells.")

HL_TARGET_LEN = 35
LL_TARGET_LEN = 100
OUTPUT_DELIMITER = " ; "
HL_PLAN_PREFIX = "High-Level Plan"
LL_REASONING_PREFIX = "Low-Level Reasoning"
OUTPUT_DIR = "../data" # Make sure this directory exists
PREPEND_FILENAME = "train_len_prepend.csv"
UNIFORM_FILENAME = "train_len_uniform.csv"

# Make sure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Helper Function to Extract Question ---
def extract_question(text):
    """Return the text between 'Question: ' and the next OUTPUT_DELIMITER.

    Non-string input yields "". If the 'Question: ' marker or a following
    delimiter cannot be found, the input is returned unchanged. Any
    unexpected error during parsing yields "".
    """
    if not isinstance(text, str):
        return ""
    marker = "Question: "
    try:
        marker_pos = text.find(marker)
        q_start = marker_pos + len(marker)
        q_end = text.find(OUTPUT_DELIMITER, q_start)
        if marker_pos == -1 or q_end == -1:
            # Unrecognised layout: hand the original text back untouched.
            return text
        return text[q_start:q_end].strip()
    except Exception:
        return ""

# --- 1. Extract Original Questions ---
if 'eda_df' not in locals() or eda_df is None:
     print("Error: `eda_df` is not available. Cannot extract questions.")
     question_map = {} # Define as empty to prevent downstream errors
else:
     print("Extracting original questions...")
     question_map = eda_df['high_level'].apply(extract_question).to_dict()
     print(f"Sample question (index 5): {question_map.get(5, 'Not Found')}")


# --- Check if fragments_df exists before proceeding ---
if fragments_df is not None:

     # --- 2a. Prepare for train_len_prepend ---
     print("Preparing fragments for train_len_prepend...")
     fragments_df['prepended_text'] = fragments_df.apply(
         lambda row: f"[{row['token_length']}] {row['fragment_text']}", axis=1
     )

     # --- 2b. Prepare for train_len_uniform ---
     print("Preparing fragments for train_len_uniform...")
     # Modified function to accept pad_token
     def pad_fragment(row, pad_token):
         """Append *pad_token* to a fragment's text until it reaches its target length.

         The target is HL_TARGET_LEN for 'HL' fragments and LL_TARGET_LEN for
         everything else. Fragments already at or above the target are
         returned unchanged (they are not truncated).
         """
         if row['fragment_type'] == 'HL':
             target_len = HL_TARGET_LEN
         else:
             target_len = LL_TARGET_LEN
         # One pad_token string is appended per missing token.
         # NOTE(review): assumes each pad_token occurrence tokenizes to exactly one token - confirm.
         padding = pad_token * max(0, target_len - row['token_length'])
         return row['fragment_text'] + padding

     # Modified apply call to pass pad_token_str
     fragments_df['padded_text'] = fragments_df.apply(
         pad_fragment,
         axis=1,
         args=(pad_token_str,) # Pass pad_token_str via args
     )

     # Verify padding for a sample fragment (optional)
     if not fragments_df.empty:
          sample_padded_frag = fragments_df.iloc[0]
          print(f"Sample padding check (Fragment 0): Type={sample_padded_frag['fragment_type']}, Orig Len={sample_padded_frag['token_length']}")
          # Optional detailed verification can be added here if needed
     else:
          print("Skipping padding check as fragments_df is empty.")


     # --- 3. Group and Reconstruct Sequences ---
     tqdm.pandas(desc="Building Prepended Sequences")

          # Function to build one sequence row (UPDATED)
     def build_sequence(group, text_col_name):
          """Assemble the full training string for one source row.

          *group* holds every fragment sharing one 'original_index' (exposed by
          groupby as ``group.name``); *text_col_name* selects which rendering of
          each fragment to join ('prepended_text' or 'padded_text').

          Layout: "Question: <q>", HL_PLAN_PREFIX, the HL fragments,
          LL_REASONING_PREFIX, the LL reasoning fragments - all joined with
          OUTPUT_DELIMITER, with empty parts left out.
          """
          original_index = group.name
          question_text = question_map.get(original_index, "[QUESTION NOT FOUND]")

          # Fragments of each type, in their original in-row order.
          hl_rows = group[group['fragment_type'] == 'HL'].sort_values('fragment_index_in_row')
          ll_rows = group[group['fragment_type'] == 'LL'].sort_values('fragment_index_in_row')

          # Header fragments are LL fragments whose raw text starts with
          # "Step " or with "Conclusion:"; only the remaining reasoning
          # fragments are kept. The test always looks at 'fragment_text',
          # regardless of which column is being joined.
          ll_text = ll_rows['fragment_text'].str
          is_header = (
               ll_text.startswith('Step ', na=False)
               | ll_text.startswith('Conclusion:', na=False)
          )
          reasoning_rows = ll_rows[~is_header]

          joined_hl = OUTPUT_DELIMITER.join(hl_rows[text_col_name])
          joined_ll_reasoning = OUTPUT_DELIMITER.join(reasoning_rows[text_col_name])

          candidate_parts = (
               f"Question: {question_text}",
               HL_PLAN_PREFIX,
               joined_hl,
               LL_REASONING_PREFIX,
               joined_ll_reasoning,
          )
          # Empty joins (a type with no fragments, or only headers) are skipped.
          return OUTPUT_DELIMITER.join(part for part in candidate_parts if part)

     grouped_fragments = fragments_df.groupby('original_index')

     print("Building prepended sequences...")
     prepended_sequences = grouped_fragments.progress_apply(build_sequence, text_col_name='prepended_text')
     prepended_df = pd.DataFrame({'full_sequence': prepended_sequences})
     prepended_df.index.name = 'original_index'
     print("Prepended sequences sample:")
     display(prepended_df.head())

     tqdm.pandas(desc="Building Uniform Sequences")
     print("\nBuilding uniform length sequences...")
     uniform_sequences = grouped_fragments.progress_apply(build_sequence, text_col_name='padded_text')
     uniform_df = pd.DataFrame({'full_sequence': uniform_sequences})
     uniform_df.index.name = 'original_index'
     print("Uniform length sequences sample:")
     display(uniform_df.head())

     # --- 4. Save Results ---
     prepend_path = os.path.join(OUTPUT_DIR, PREPEND_FILENAME)
     uniform_path = os.path.join(OUTPUT_DIR, UNIFORM_FILENAME)

     print(f"\nSaving prepended data to: {prepend_path}")
     prepended_df.to_csv(prepend_path)

     print(f"Saving uniform length data to: {uniform_path}")
     uniform_df.to_csv(uniform_path)

     print("\nProcessing finished.")

else: # This else corresponds to "if fragments_df is not None:"
     print("Skipping sequence building and saving as 'fragments_df' is not available.")


In [None]:
path = "../data/train_len_uniform.csv"

df = pd.read_csv(path)

In [None]:
df = df.head(10)


df.iloc[1,1]

# Clean up final answer token to avoid latex <-> text ambiguity

In [None]:
import pandas as pd

path = "../data/train_len_prepend.csv"
df = pd.read_csv(path, index_col = 0)

sentence_col = df['full_sequence'] 

#



# extract the last fragment from each row. Delimiter is " ; "
last_fragment = sentence_col.str.split(' ; ').str[-1]


last_fragment




In [None]:
# Remove the leading "[<token_length>] " marker from every row of last_fragment.
# A fixed 4-character slice is only right for one-digit lengths ("[7] "): it
# leaves a leading space after "[12] " and "] " after "[123] ". The anchored
# pattern handles any digit count, leaves rows without a marker untouched,
# and is safe to re-run.
last_fragment = last_fragment.str.replace(r'^\[\d+\]\s*', '', regex=True)

In [None]:
last_fragment

In [None]:
last_fragment

# Tally how the final fragments phrase their answer:
#   - rows that do not contain 'The answer is'
#   - rows whose text ends with a digit
#   - rows that do not end with a digit but contain '\(' or '\)'
import pandas as pd
ends_with_digit = last_fragment.str.match(r'.*\d$', na=False)
has_latex_delimiter = last_fragment.str.contains(r'\\\(|\\\)', na=False)

count_no_answer_is = (~last_fragment.str.contains('The answer is', na=False)).sum()
count_ends_with_number = ends_with_digit.sum()
count_latex_no_number = (~ends_with_digit & has_latex_delimiter).sum()

print(f"Instances without 'The answer is': {count_no_answer_is}")
print(f"Instances ending with number: {count_ends_with_number}")
print(f"Instances with LaTeX delimiters but no number: {count_latex_no_number}")


# # Print all instances which don't contain 'The answer is' 
# count = 0
# for frag in last_fragment: 
#     if ('the answer is' not in frag) and ('The answer is' not in frag):
#         print(frag)
#         count += 1
# print(f"Number of instances without 'The answer is': {count}")




        

In [None]:
import pandas as pd
import numpy as np
import re # Import regex module

# Assuming 'last_fragment' is your Pandas Series 
# Example Data (replace with your actual Series):
# data = ["The answer is 600", "Some text ending with 3", "The answer is \(x^2\)", "Text that ends with \(y=mx+b\)", "No answer here", "the answer is 4.5", "Just text"]
# last_fragment = pd.Series(data)

# Define regex patterns.
# Every group below is non-capturing: the patterns are only used for yes/no
# matching, and pandas' str.contains emits a UserWarning when the pattern it is
# given contains capturing groups. The set of matching strings is unchanged.
# Number pattern: Integer or float
num_pattern = r'\d+(?:\.\d+)?'
# LaTeX pattern: Matches \(...\) non-greedily
latex_pattern = r'\\\(.*?\\\)'

# Criterion 1 Checks: Contains "The answer is" (case-insensitive) followed by number or LaTeX
# Need to be careful with potential spaces after "is"
regex_c1 = re.compile(
    r'the answer is\s*(?:' + num_pattern + r'|' + latex_pattern + r')',
    re.IGNORECASE # Make the search case-insensitive
)
criterion1_met = last_fragment.str.contains(regex_c1, na=False)

# Criterion 2 Checks: Ends with a number or LaTeX
regex_c2_ends_num = re.compile(num_pattern + r'$')
regex_c2_ends_latex = re.compile(latex_pattern + r'$')

criterion2_ends_num = last_fragment.str.contains(regex_c2_ends_num, na=False)
criterion2_ends_latex = last_fragment.str.contains(regex_c2_ends_latex, na=False)
criterion2_met = criterion2_ends_num | criterion2_ends_latex

# Combine criteria: An answer is identified if EITHER criterion 1 OR criterion 2 is met
has_identifiable_answer = criterion1_met | criterion2_met

# Get the index *labels* (not row positions) where an answer was identified
identified_indices = last_fragment.index[has_identifiable_answer].tolist()

# Print results
print(f"Number of entries: {len(last_fragment)}")
print(f"Number of entries with an identifiable answer: {len(identified_indices)}")
# print("\nIndices with identifiable answers:")
# print(identified_indices) # Uncomment to see the list of indices
# print("\nSample entries identified:")
# print(last_fragment[identified_indices].head()) # Uncomment to see sample matching entries

In [None]:
# Keep only the rows of df that have an identifiable answer.
# identified_indices holds index *labels*, which coincide with row positions only
# for a default 0..n-1 index, so positional .iloc is not safe here. The boolean
# mask shares df's index and selects the same rows regardless of the labels.
df = df.loc[has_identifiable_answer]


In [None]:
# Fragments with an identifiable answer. The boolean mask is aligned on the
# index; identified_indices holds labels, so positional .iloc would only be
# correct for a default 0..n-1 index.
filtered_frags = last_fragment.loc[has_identifiable_answer]
filtered_frags

### Convert the LaTeX answers into a human-readable form

In [None]:
import re
import sympy as sp
from sympy.parsing.latex import parse_latex

# -----------------------------------------------------------
# Step 1: Strip LaTeX wrappers and cosmetic commands
# -----------------------------------------------------------

def strip_latex_wrappers(tex: str) -> str:
    """Strip outer math delimiters and sizing commands from a LaTeX snippet.

    One leading ``$``, ``\\(`` or ``\\[`` and one trailing ``$``, ``\\)`` or
    ``\\]`` are removed (each side independently), every ``\\left`` / ``\\right``
    command is dropped, and runs of whitespace are collapsed to single spaces.

    Args:
        tex: Raw LaTeX (or plain) answer text.

    Returns:
        The cleaned text.
    """
    tex = tex.strip()
    # Remove $...$, \(...\), or \[...\]
    tex = re.sub(r'^\s*(\$|\\\(|\\\[)\s*', '', tex)
    tex = re.sub(r'\s*(\$|\\\)|\\\])\s*$', '', tex)
    # Remove \left and \right. The lookahead keeps longer commands that merely
    # start with those letters (\leftarrow, \rightarrow, ...) intact.
    tex = re.sub(r'\\(?:left|right)(?![A-Za-z])', '', tex)
    # Collapse multiple spaces
    return " ".join(tex.split())

# -----------------------------------------------------------
# Step 2: Convert plain 'i'/'j' forms to sympy-compatible 'I'
# -----------------------------------------------------------

_plain_i = re.compile(
    r'(?<![A-Za-z0-9_])'          # not part of identifier
    r'([-+]?\s*\d*\.?\d*(?:/\d*\.?\d*)?\s*)?'  # optional coeff
    r'([ij])'                     # i or j
    r'(?![A-Za-z0-9_])'           # ...and not the first letter of a longer name
)

def plaintext_to_sympy(expr: str) -> str:
    """Rewrite a stand-alone imaginary unit ``i``/``j`` as SymPy's ``I``.

    A numeric coefficient directly in front of the unit gets an explicit
    multiplication (``4i`` -> ``4*I``). A coefficient that is only a sign
    and/or whitespace is kept verbatim with no ``*`` inserted
    (``-i`` -> ``-I``, ``3 * i`` -> ``3 * I``); inserting one there would
    produce ``-*I`` or turn ``3 * i`` into the power ``3 **I``.

    Args:
        expr: Plain-text expression.

    Returns:
        The expression with imaginary units replaced.
    """
    def repl(m):
        coeff = m.group(1) or ''
        bare = coeff.strip()
        if bare in ('', '+', '-'):
            # Nothing numeric to multiply by: keep sign/spacing as written.
            return coeff + 'I'
        return coeff + '*I'
    return _plain_i.sub(repl, expr)

# -----------------------------------------------------------
# Step 3: Parse to SymPy object
# -----------------------------------------------------------

def to_sympy(expr: str):
    """Turn an answer string into a SymPy object, trying three strategies.

    1. ``parse_latex`` on the text as given.
    2. ``sympify`` (``rational=True``) on the text after ``plaintext_to_sympy``.
    3. If step 2 fails and the text is exactly ``\\frac{a}{b}``, build
       ``Rational(a, b)``.

    Raises:
        Exception: the error raised by step 2 when step 3 does not apply.
    """
    try:
        parsed = parse_latex(expr)
    except Exception:
        # Any LaTeX-parser failure falls through to the plain-text route.
        pass
    else:
        return parsed

    plain_expr = plaintext_to_sympy(expr)
    try:
        return sp.sympify(plain_expr, rational=True)
    except Exception:
        # Basic \frac fallback
        frac = re.fullmatch(r'\\frac\{([^}]+)\}\{([^}]+)\}', expr)
        if not frac:
            raise
        numerator = sp.sympify(frac.group(1))
        denominator = sp.sympify(frac.group(2))
        return sp.Rational(numerator, denominator)

# -----------------------------------------------------------
# Step 4: Canonical form for exact string comparison
# -----------------------------------------------------------

def sympy_canonical(sym):
    """Return the string of *sym* after ``nsimplify(rational=True)`` and ``simplify``."""
    rationalized = sp.nsimplify(sym, rational=True)
    return str(sp.simplify(rationalized))

# -----------------------------------------------------------
# Step 5: Master function — pass in string *after* ####
# -----------------------------------------------------------

def normalize_math_answer(raw_ans: str, tol: float = 1e-9) -> str:
    """Normalize an answer string to its canonical SymPy text form.

    The input is cleaned with ``strip_latex_wrappers``, parsed with
    ``to_sympy`` and rendered with ``sympy_canonical``. A value whose real
    AND imaginary parts are both smaller than ``tol`` in magnitude is
    replaced by exact zero first.

    Args:
        raw_ans: The answer text (the part after ``####``).
        tol: Magnitude below which both parts count as zero.

    Returns:
        The canonical string, or the cleaned input if parsing or
        canonicalization raises.
    """
    stripped = strip_latex_wrappers(raw_ans)
    try:
        sym = to_sympy(stripped)
        if sym.is_real or sym.is_complex:
            re_part = sp.re(sym)
            im_part = sp.im(sym)
            # Snap to zero only when BOTH parts are within tolerance.
            if abs(re_part) < tol:
                if abs(im_part) < tol:
                    sym = sp.Integer(0)
        canonical = sympy_canonical(sym)
    except Exception:
        # fallback: cleaned input
        return stripped
    return canonical

# -----------------------------------------------------------
# Step 6: Optional equality checker for eval loop
# -----------------------------------------------------------

def equal_answers(ans1: str, ans2: str, tol: float = 1e-6) -> bool:
    """Report whether two answer strings are numerically equal within ``tol``.

    Both strings are parsed with ``sympify`` and the absolute difference is
    compared with ``tol``.

    Args:
        ans1: First answer string.
        ans2: Second answer string.
        tol: Maximum allowed absolute difference.

    Returns:
        A plain Python ``bool``. ``False`` is also returned when parsing fails
        or when the comparison cannot be decided (e.g. free symbols remain).
    """
    # NOTE(review): sympify evaluates its string argument; only pass trusted text.
    try:
        a, b = sp.sympify(ans1), sp.sympify(ans2)
        # The comparison yields a SymPy Boolean (or an unevaluated relation),
        # not a Python bool. bool() converts the former and raises for the
        # latter, which the handler below reports as "not equal".
        return bool(abs(a - b) < tol)
    except Exception:
        return False

# -----------------------------------------------------------
# Example usage
# -----------------------------------------------------------

# Demo inputs in the "#### <answer>" form; each is printed next to its normalized value.
examples = [
    r"#### \frac{3}{5}",
    r"#### \sqrt{2}",
    r"#### \frac{7}{2}i",
    r"#### i\sqrt{3}",
    r"#### 3 + 4i",
    r"#### -i",
    r"#### \frac{\pi}{6}",
    r"#### 0.75",
]

for example in examples:
    # Exactly one "####" marker is expected; the text after it is the answer.
    _marker_prefix, raw_answer = example.split("####")
    normalized_example = normalize_math_answer(raw_answer)
    print(f"{example}  -->  {normalized_example}")

In [None]:
import pandas as pd
import re
import numpy as np
# Make sure sympy is imported if not already done by the script
import sympy as sp 
# Assume the normalization functions (normalize_math_answer, etc.) are defined above

# Assuming 'filtered_frags' is your input Pandas Series with original indices preserved.
# Example Data (replace with your actual Series):
# filtered_frags = pd.Series({
#     0: "The answer is 600", 
#     1: "Some text ending with 3", 
#     2: "The answer is \(x^2\)", 
#     3: "Text that ends with \(y=mx+b\)", 
#     4: "No answer here", 
#     5: "the answer is 4.5", 
#     6: "Just text ending with 10." # Example ending with number + punctuation
# })


# --- Step 1: Define Regex for Answer Extraction ---

# Number pattern: Integer or float (potentially scientific notation)
num_pattern = r'[-+]?\d+(\.\d+)?([eE][-+]?\d+)?' 
# LaTeX pattern: matches ONE \(...\) span. The body may not run across a closing
# \) - otherwise a full-match or end-anchored search would stretch from the first
# \( in the text to the final \) and swallow everything in between.
latex_pattern = r'\\\(((?:(?!\\\)).)*)\\\)'

# Regex to capture content after "The answer is" (case-insensitive)
# Captures everything after optional whitespace following "is"
regex_extract_c1 = re.compile(r'the answer is\s*(.*)', re.IGNORECASE)

# Regex to capture a number at the very end of the string
regex_extract_c2_num = re.compile(r'(' + num_pattern + r')\s*$') # Capture group 1, allow trailing whitespace

# Regex to capture LaTeX \(...\) at the very end of the string
regex_extract_c2_latex = re.compile(r'(' + latex_pattern + r')\s*$') # Capture group 1, allow trailing whitespace

# Content check for criterion 1: the text after "the answer is" must be exactly
# one number or one LaTeX span. Compiled once instead of on every call.
_regex_c1_content = re.compile(r'(?:' + num_pattern + r'|' + latex_pattern + r')')


# --- Step 2: Function to Extract Raw Answer String ---

def extract_answer(text):
    """Pull the raw answer (a number or one ``\\(...\\)`` span) out of a fragment.

    Rules, tried in order:
      1. The text following "the answer is" (case-insensitive), when that text
         is exactly a number or exactly one LaTeX span.
      2. A number at the very end of the fragment.
      3. A LaTeX span at the very end of the fragment.

    Args:
        text: The fragment to inspect.

    Returns:
        The extracted answer string, or ``None`` when ``text`` is not a string
        or no rule applies.
    """
    if not isinstance(text, str):
        return None # Handle potential non-string data

    # Try Criterion 1: "The answer is" followed by only a number / LaTeX span
    match1 = regex_extract_c1.search(text)
    if match1:
        potential_answer = match1.group(1).strip()
        if _regex_c1_content.fullmatch(potential_answer):
            return potential_answer

    # Try Criterion 2: Ends with Number
    match2_num = regex_extract_c2_num.search(text)
    if match2_num:
        return match2_num.group(1).strip() # Group 1 is the captured number

    # Try Criterion 2: Ends with LaTeX
    match2_latex = regex_extract_c2_latex.search(text)
    if match2_latex:
        return match2_latex.group(1).strip() # Group 1 is the captured LaTeX

    return None # No answer pattern extracted


# --- Step 3: Extract Raw Answers ---

print("Extracting raw answers...")
# extract_answer yields None for fragments without a recognizable answer;
# dropna() removes those entries while keeping the original index labels.
raw_answers = filtered_frags.apply(extract_answer)
extracted_answers = raw_answers.dropna()
print(f"Successfully extracted raw answers from {len(extracted_answers)} entries.")


# --- Step 4: Normalize and Format ---

def _format_final_answer(index, raw_ans):
    """Return 'The answer is #### <normalized>' for one extracted answer.

    If normalization raises, a warning is printed and the answer is only
    stripped of its LaTeX wrappers.
    """
    try:
        normalized_ans = normalize_math_answer(raw_ans)
        return f"The answer is #### {normalized_ans}"
    except Exception as e:
        print(f"Warning: Error normalizing answer at index {index} ('{raw_ans}'): {e}")
        return f"The answer is #### {strip_latex_wrappers(raw_ans)}" # Fallback to just stripped


print("Normalizing extracted answers and formatting...")
# Keyed by the original index label so the result can be aligned with df later.
final_formatted_answers = {
    index: _format_final_answer(index, raw_ans)
    for index, raw_ans in extracted_answers.items()
}

# Back to a Series; the dict keys become the index.
final_output_series = pd.Series(final_formatted_answers)

print(f"Generated final formatted series with {len(final_output_series)} entries.")
# Display some examples
print("\nSample final formatted answers:")
print(final_output_series.head())



In [None]:

final_output_series

In [None]:
# Recompute the final " ; "-separated fragment from the current (filtered) df, for inspection.
new_last_fragment = df['full_sequence'].str.split(' ; ').str[-1]
new_last_fragment

In [None]:
import pandas as pd
import numpy as np

# Assume 'df' is your original DataFrame
# Assume 'final_output_series' is the Series with normalized answers and original indices

# --- Step 1: Subset df down to the indices which appear in final_output_series ---

# Get the list of indices that are present in the final normalized output
valid_indices = final_output_series.index

# Filter the original DataFrame to keep only rows with these indices.
# .copy() so that the assignment below never touches df itself.
df_subset = df.loc[df.index.isin(valid_indices)].copy()

print(f"Original df length: {len(df)}")
print(f"Subset df length (matching final_output_series indices): {len(df_subset)}")

# --- Step 2: Swap the final fragment using vectorized operations ---

# Put the new fragments in the same row order as df_subset
new_fragments_aligned = final_output_series.loc[df_subset.index]

# Perform the right-split on the full sequence column
# n=1 ensures we only split on the last occurrence. expand=True creates columns.
# reindex guarantees that columns 0 and 1 both exist: when no row contains the
# delimiter (or df_subset is empty) the split produces fewer columns, and
# split_parts[1] would otherwise raise KeyError.
split_parts = (
    df_subset['full_sequence']
    .str.rsplit(' ; ', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# split_parts[0] contains the prefix (or the full string if no delimiter)
# split_parts[1] contains the last fragment (or a missing value if no delimiter)

# Identify rows where the delimiter was actually found
has_delimiter = split_parts[1].notna()

# Construct the result conditionally using np.where
# If delimiter exists: prefix + ' ; ' + new_fragment
# If delimiter doesn't exist: just the new_fragment
df_subset['full_sequence'] = np.where(
    has_delimiter, 
    split_parts[0] + ' ; ' + new_fragments_aligned, # Value if True
    new_fragments_aligned                          # Value if False
)
print("\nReplacement complete using vectorized operations.")

# --- Verification (Optional) ---
print("\nVerifying a few examples:")
original_indices_to_check = df_subset.index[:5] # Check the first 5 indices in the subset

for idx in original_indices_to_check:
    print(f"\n--- Index: {idx} ---")
    print(f"Original Sequence (from df):")
    print(df.loc[idx, 'full_sequence']) 
    print(f"New Sequence (from df_subset):")
    print(df_subset.loc[idx, 'full_sequence'])
    print(f"Normalized Fragment Used:")
    # Use .loc to ensure correct alignment when fetching from final_output_series
    print(final_output_series.loc[idx]) 
    
# --- Overwrite original df if desired ---
# df = df_subset 
# print("\nOriginal df overwritten with updated subset.")

In [None]:
df_subset

In [None]:
# Reset index, creating a new column from the old index
# (reset_index() is called without drop=True, so the old labels are kept as a column).
df_subset_reset = df_subset.reset_index()


In [None]:
# Remove the 'original_index' column in place, then display the frame.
# NOTE(review): this raises KeyError unless a column named 'original_index' exists;
# where that column comes from is not visible in this part of the notebook - confirm.
df_subset_reset.drop(columns = ['original_index'], inplace = True)
df_subset_reset

In [None]:
# Write to csv with path data/latex_replaced_data.csv
# The index is written as the first column (to_csv default); the next cell reads
# this file back with index_col = 0.
path = "../data/latex_replaced_data.csv"

df_subset_reset.to_csv(path)

# Reorganize train_len_prepend to remove token counters and enforce uniform cumulative token length

In [None]:
# Read in the data:
import pandas as pd

path = "../data/latex_replaced_data.csv"
# index_col = 0: the first CSV column is used as the DataFrame index.
df = pd.read_csv(path, index_col = 0)


In [None]:
# Remove instances of [num] (ex: [7] , [17]) from the full_sequence column
df['full_sequence'] = df['full_sequence'].str.replace(r'\[[0-9]+\]', '', regex=True)

# test instance
# NOTE(review): 9996 is a hard-coded index label; the lookup fails if no row carries it.
df['full_sequence'][9996]



In [None]:
# Summary statistics before de-duplication.
print(df.describe())

# Drop exact duplicate rows, then renumber the index from 0.
df = df.drop_duplicates().reset_index(drop = True)

df

In [None]:
## Plot the distribution of token sequence lengths

# from transformers import GPT2TokenizerFast
# import matplotlib.pyplot as plt 
# import numpy as np 


# sentence_col = df['full_sequence'] 

# tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# # Tokenize the sentences in batch 
# # Convert Series to list if necessary
# tokenized_output = tokenizer(sentence_col.tolist(), truncation=False) # Don't truncate yet, we want the real lengths

# # Get the length of each token sequence using a list comprehension 
# # This is efficient as the heavy lifting (tokenization) was done in batch
# sequence_lengths = [len(ids) for ids in tokenized_output['input_ids']]

# # Plot the distribution of sequence lengths
# plt.figure(figsize=(10, 6))
# plt.hist(sequence_lengths, bins=50, edgecolor='black') # Adjust bins as needed
# plt.title('Distribution of Token Sequence Lengths')
# plt.xlabel('Sequence Length (Number of Tokens)')
# plt.ylabel('Number of Examples')
# plt.axvline(x=1024, color='r', linestyle='--', label='GPT-2 Max Length (1024)') # Add line for max length
# plt.legend()
# plt.grid(axis='y', alpha=0.75)
# plt.show() # Display the plot

# # Print statistics
# print(f"Maximum sequence length: {max(sequence_lengths)}")
# print(f"Minimum sequence length: {min(sequence_lengths)}")
# print(f"Average sequence length: {np.mean(sequence_lengths):.2f}")
# num_too_long = sum(1 for length in sequence_lengths if length > 1024)
# print(f"Number of sequences longer than 1024: {num_too_long}")
# print(f"Percentage of sequences longer than 1024: {(num_too_long / len(sequence_lengths) * 100):.2f}%")

# print(f"Number of sequences longer than 512: {sum(1 for length in sequence_lengths if length > 512)}")
# print(f"Percentage of sequences longer than 512: {(sum(1 for length in sequence_lengths if length > 512) / len(sequence_lengths) * 100):.2f}%")



In [None]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
# Probe how the GPT-2 tokenizer treats the character 'Ã'.

# Tokenize this and see the number
# (this is not the cell's last expression, so its result is not displayed)
tokenizer.tokenize('Ã')

# print the id
tokenizer.convert_tokens_to_ids('Ã')



## Tokenize w/ max length of 512

In [None]:
import pandas as pd
from transformers import GPT2TokenizerFast
import numpy as np
import torch 
from tqdm.auto import tqdm # Keep tqdm for potential future loops if needed

# --- Setup ---
# Assuming 'df' is your initial DataFrame
# Initialize tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# *** Use a rare existing token for padding ***
# Candidate: ID 127
# NOTE(review): nothing here checks that the string returned by decode([127]) maps
# back to ID 127 once it is registered as the pad token. The pad_token_id printed
# below is authoritative. A commented-out check further down this notebook expects
# the padding ID to be 50257 - confirm which ID is actually in use.
pad_token_to_add = tokenizer.decode([127]) # String form of token ID 127
tokenizer.add_special_tokens({'pad_token': pad_token_to_add}) 

# Now, check the pad_token_id that the tokenizer assigned *after* adding
pad_token_id = tokenizer.pad_token_id 
print(f"Using PAD token ID: {pad_token_id} ('{tokenizer.pad_token}')") 


# The tokenizer's EOS token is appended to every sequence below
eos_token_id = tokenizer.eos_token_id 
eos_token_string = tokenizer.eos_token
print(f"Using EOS token ID: {eos_token_id} ('{eos_token_string}') for end-of-sequence.")

# Define max length (single source of truth for tokenization AND the checks below)
max_len_allowed = 512 

# --- Step 1: Clean the text ---
print("Step 1: Cleaning text...")
# Work on a copy so that df itself is left untouched
df_processed = df.copy() 
df_processed['full_sequence'] = df_processed['full_sequence'].str.replace(r'\[[0-9]+\]', '', regex=True)

# --- Step 2: Append EOS token string ---
print("Step 2: Appending EOS string...")
# A space is inserted between the text and the EOS string
df_processed['full_sequence'] = df_processed['full_sequence'] + ' ' + eos_token_string

# --- Step 3: Tokenize with Padding and Truncation using Tokenizer ---
print(f"Step 3: Tokenizing, padding (with ID {pad_token_id}), and truncating sequences to {max_len_allowed}...")


# # --- Debug ---
# print(f"DEBUG: Tokenizer pad_token_id immediately before call: {tokenizer.pad_token_id}") 
# # --------------------
    

# The tokenizer pads with tokenizer.pad_token_id and truncates to max_length.
# add_special_tokens=False because the EOS string was appended manually above.
# Note that truncation=True cuts over-long sequences at max_length, so such
# sequences lose their trailing EOS.
final_tokenization = tokenizer(
    df_processed['full_sequence'].tolist(),
    padding='max_length',    # Pad to max_length using tokenizer.pad_token_id
    truncation=True,         # Truncate sequences longer than max_length
    max_length=max_len_allowed, 
    add_special_tokens=False,# Set to False as we manually added the EOS string
    return_tensors='pt'      # Return PyTorch tensors
)
print("Tokenization complete.")

# --- Final Result ---
# 'final_tokenization' now contains 'input_ids' (padded) and 'attention_mask'
input_ids_tensor = final_tokenization['input_ids']
attention_mask_tensor = final_tokenization['attention_mask']

print("\nProcessing complete.")
print(f"Shape of input_ids: {input_ids_tensor.shape}")
print(f"Shape of attention_mask: {attention_mask_tensor.shape}")


### RUN CHECKS ###

# Access the tokenized 'input_ids'. The structure depends on 'return_tensors'.
input_ids = final_tokenization['input_ids']

# Check if the number of sequences matches the DataFrame length
assert len(input_ids) == len(df), \
    f"Verification failed: Mismatch between tokenized sequences ({len(input_ids)}) and DataFrame rows ({len(df)})"

# Check if all sequences have the target length. The checks use max_len_allowed
# rather than a repeated literal so they cannot drift from the tokenizer call.
# If input_ids is a Tensor (PyTorch/TensorFlow):
if hasattr(input_ids, 'shape'): 
    num_sequences, seq_len = input_ids.shape
    assert seq_len == max_len_allowed, \
        f"Verification failed: Expected sequence length {max_len_allowed}, but tensor shape shows {seq_len}"
# If input_ids is a list of lists:
else: 
    all_lengths_correct = all(len(ids) == max_len_allowed for ids in input_ids)
    assert all_lengths_correct, \
        f"Verification failed: Not all tokenized sequences in the list have length {max_len_allowed}."

print(f"Verification successful: All processed sequences have the target length of {max_len_allowed} tokens.")

# --- Output summary stats and a sample sequence ---
print(f"Number of sequences: {len(df)}")
# Displaying the string sequence might not show the padding added by the tokenizer
# print(f"Sample processed string sequence: {df['full_sequence'].iloc[0]}") 



In [None]:
# Visualize one tokenized example: its IDs, its attention mask and the decoded text.
def _sample_row(container, position):
    """Return container[position], converted with .tolist() when the container offers tolist."""
    row = container[position]
    return row.tolist() if hasattr(container, 'tolist') else row


print("\nSample tokenization output:")
sample_idx = 100
print(f"Sample input_ids ({sample_idx}): {_sample_row(input_ids, sample_idx)}")
if 'attention_mask' in final_tokenization:
    print(f"Sample attention_mask ({sample_idx}): {_sample_row(final_tokenization['attention_mask'], sample_idx)}")
# decode to see the tokens including special ones
decoded_sample = tokenizer.decode(input_ids[sample_idx])
print(f"Decoded sample: {decoded_sample}")

In [None]:
# # Print the fields of the final_tokenization
# print(f"Final tokenization fields: {final_tokenization.keys()}")

# # Compare two different attention masks
# print(f"Attention mask 0: {final_tokenization['attention_mask'][0]}")
# print(f"Attention mask 1: {final_tokenization['attention_mask'][1]}")

# # Compare two different input IDs
# print(f"Input ID 0: {final_tokenization['input_ids'][0]}")
# print(f"Input ID 1: {final_tokenization['input_ids'][1]}")


In [None]:
# Rebind df to just the 'full_sequence' column (df is a Series from here on).
# NOTE: not re-runnable - a second execution looks up 'full_sequence' on the Series itself.
df = df['full_sequence']
df.head(10)

In [None]:
# Save the tokenized data as a df with input_ids and attention_mask as columns
# .tolist() converts each tensor into nested Python lists, so every cell holds one list.
tokenized_df = pd.DataFrame({
    'input_ids': final_tokenization['input_ids'].tolist(),
    'attention_mask': final_tokenization['attention_mask'].tolist()
})

tokenized_df.head(10)

In [None]:
# Sanity checks on df and tokenized_df ahead of the merge.

print(f"df shape: {df.shape}")
print(f"tokenized_df shape: {tokenized_df.shape}")
# print(f"df columns: {df.columns}")
print(f"tokenized_df columns: {tokenized_df.columns}")


# Both objects must share an index for the column-wise concat to line up.
index_status = "match" if df.index.equals(tokenized_df.index) else "do not match"
print(f"Indices {index_status} between df and tokenized_df")

attention_masks = tokenized_df['attention_mask']

# A mask without any 0 means the sequence filled the whole window (no padding).
unpadded_count = sum(0 not in mask for mask in attention_masks)
print("Number of instances in which the attention mask does not contain 0: ", unpadded_count)

# Length distribution of the masks relative to 512.
mask_lengths = attention_masks.apply(len)
print("Number of instances in which the attention mask has length less than 512: ", int((mask_lengths < 512).sum()))
print("Number of instances in which the attention mask has length greater than 512: ", int((mask_lengths > 512).sum()))
print("Number of instances in which the attention mask has length equal to 512: ", int((mask_lengths == 512).sum()))



In [None]:

# Put the text column and the token columns side by side (aligned on the index).
merged_df = pd.concat([df, tokenized_df], axis=1)

# Keep only rows whose attention mask contains at least one 0, i.e. drop the
# sequences that filled the entire window.
contains_padding = merged_df['attention_mask'].apply(lambda mask: 0 in mask)
merged_df = merged_df[contains_padding]

# Print properties of the merged df
print(f"Merged df shape: {merged_df.shape}")
print(f"Merged df columns: {merged_df.columns}")

# Renumber the surviving rows from 0.
merged_df = merged_df.reset_index(drop=True)

merged_df.iloc[0,:]


In [None]:
# Check types 
# Columns are addressed by position: 0 = full_sequence, 1 = input_ids, 2 = attention_mask.
print("full_sequence type: ", type(merged_df.iloc[0,0]))
print("input_ids type: ", type(merged_df.iloc[0,1]))
print("attention_mask type: ", type(merged_df.iloc[0,2]))

# Print the data types of the columns
print("data types: ", merged_df.dtypes)

# Not the cell's last expression, so this value is not displayed.
type(merged_df.iloc[0,2])

merged_df.columns

In [None]:
# ## Check that the attention mask leaves 1 in the spot corresponding to the end of text token (50256). Also check that the next token is 50257 and the next element of the attention mask is 0.

# # Mismatching token ids
# end_text_token_list = []
# next_token_list = []
# next_mask_list = []

# for row_idx in range(len(merged_df)):
#     temp_row = merged_df.iloc[row_idx,:]
#     temp_row_sequence = temp_row['full_sequence']
#     temp_row_input_ids = temp_row['input_ids']
#     temp_row_attention_mask = temp_row['attention_mask']

#     # Get the index of the last 1 in the attention mask
#     last_one_idx = np.where(np.array(temp_row_attention_mask) == 1)[0][-1]

#     # Get the token id at the last 1 in the attention mask
#     last_one_token_id = temp_row_input_ids[last_one_idx] 

#     end_text_token_list.append(last_one_token_id)

#     # Get the next token id
#     next_token_id = temp_row_input_ids[last_one_idx + 1]
#     next_token_list.append(next_token_id)

#     # Get the next element of the attention mask
#     next_mask_id = temp_row_attention_mask[last_one_idx + 1]
#     next_mask_list.append(next_mask_id)


# # Print the unique values of the lists
# print("unique end text token ids: ", np.unique(end_text_token_list))
# print("unique next token ids: ", np.unique(next_token_list))
# print("unique next mask ids: ", np.unique(next_mask_list))









In [None]:
merged_df

## Output cleaned dataset (full) and tokenization to csv

In [None]:
# index=False: the index is not written to the file.
# NOTE(review): input_ids / attention_mask cells hold Python lists, so they are
# presumably stored as their text form - confirm that readers parse them back.
merged_df.to_csv('../data/merged_data_full.csv', index=False)

# Create Training and Validation Sets

In [None]:
merged_df

In [None]:
# Show the Python type stored in each column, using the first row as the sample.
example_row = merged_df.iloc[0,:] 

for col, cell_value in example_row.items():
    print(f"{col}: {type(cell_value)}")



In [None]:
import pandas as pd

# Assuming 'merged_df' is your DataFrame

def get_question_robust(row):
    """Extract the question text from a row's 'full_sequence'.

    The question is whatever lies between the first "Question:" marker and the
    first following " ; High-Level" marker (or the end of the text when that
    second marker is absent), with surrounding whitespace removed.

    Returns:
        The question string, or None when 'full_sequence' is not a string or
        contains no "Question:" marker.
    """
    sequence = row['full_sequence']
    if not isinstance(sequence, str):
        return None

    # partition() reports through `marker` whether "Question:" was present.
    _, marker, after_marker = sequence.partition("Question:")
    if not marker:
        return None

    # Everything before the first " ; High-Level" is the question itself.
    question_part = after_marker.partition(" ; High-Level")[0]
    return question_part.strip()

# Populate the 'question' column row by row.
print("Re-extracting questions with corrected delimiter...")
merged_df['question'] = merged_df.apply(get_question_robust, axis=1)
print("Extraction complete.")

# Rows for which no question could be extracted hold a missing value.
failed_extractions = merged_df['question'].isna().sum()
if failed_extractions > 0:
    print(f"Warning: Could not extract question text for {failed_extractions} rows.")
    # Inspect problematic rows if needed:
    # print(merged_df[merged_df['question'].isna()][['full_sequence']].head())

print("\nDataFrame with 'question' column:")
question_preview = merged_df[['full_sequence', 'question']].head()
print(question_preview)

In [None]:
merged_df

In [None]:
# Test instance 
# NOTE(review): 99756 is a hard-coded index label; the lookup fails if no row carries it.

test_idx = 99756
merged_df['question'][test_idx]



In [None]:
# Distinct values of the 'question' column.
question_list = merged_df['question'].unique()

# Print the number of unique questions
print(f"Number of unique questions: {len(question_list)}")




In [None]:
# Split by *question* rather than by row, so that every row belonging to one
# question ends up on the same side of the train/validation split.


# Grouping by question (kept for inspection; the split below works on the unique values)
grouped_df = merged_df.groupby('question')

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# --- Step 1: Get Unique Questions ---
# Missing questions (failed extractions) are left out of the split entirely.
unique_questions = merged_df['question'].dropna().unique()
print(f"Found {len(unique_questions)} unique questions.")

# --- Step 2: Split Unique Questions into Train/Validation ---
validation_size = 0.10   # fraction of questions held out for validation
random_state = 42        # fixed seed for reproducibility

train_questions, val_questions = train_test_split(
    unique_questions,
    test_size=validation_size,
    random_state=random_state,
)

print(f"Splitting into {len(train_questions)} train questions and {len(val_questions)} validation questions.")

# --- Step 3: Create Train and Validation DataFrames ---
# Each row goes wherever its question went; .copy() detaches the results from merged_df.
question_column = merged_df['question']
train_df = merged_df[question_column.isin(train_questions)].copy()
val_df = merged_df[question_column.isin(val_questions)].copy()

# --- Verification ---
print(f"\nOriginal DataFrame length: {len(merged_df)}")
print(f"Train DataFrame length: {len(train_df)}")
print(f"Validation DataFrame length: {len(val_df)}")
print(f"Total split length: {len(train_df) + len(val_df)}")

# No question may appear on both sides.
overlap = set(train_df['question'].unique()) & set(val_df['question'].unique())
if overlap:
    print(f"Warning: Found {len(overlap)} overlapping questions: {list(overlap)[:5]}...") # Print first 5 overlaps if any
else:
    print("Verification successful: No question overlap between train and validation sets.")

In [None]:
# The helper 'question' column is only needed for splitting; remove it from both sets in place.
for split_df in (train_df, val_df):
    split_df.drop(columns = ['question'], inplace = True)

train_df


In [None]:
val_df

## Write train and val set to CSV

In [None]:
import os

# Create the output folder if it doesn't exist. exist_ok=True does this in a
# single call, with no gap between an existence check and the creation.
os.makedirs('../data/train_val_sets', exist_ok=True)

# Write train and val set to CSV (index=False: the index is not written)
train_df.to_csv('../data/train_val_sets/train_set.csv', index = False)
val_df.to_csv('../data/train_val_sets/val_set.csv', index = False)