In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import os

In [None]:
# Path to the raw requirements CSV, built from the current working
# directory -- replace with your own file path if your layout differs.
CWD = os.getcwd()
RAW_CSV_PATH = os.path.join(
    CWD,
    "..",
    "data",
    "raw_requirements.csv",
)
df = pd.read_csv(filepath_or_buffer=RAW_CSV_PATH)

# Show the first three rows as a quick check that the load worked
df.head(n=3)


In [None]:
# Regular expressions used by clean_issue_text, compiled once up front.
_MD_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
_MD_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', flags=re.DOTALL)
# URLs must carry an explicit scheme ("http://", "https://") or a "www."
# prefix so that ordinary words such as "https" or "httpd" are left alone.
# The file-name alternative ends on a word boundary so that a longer
# extension (e.g. "settings.mdx") is not cut in half.
_URL_OR_FILE_RE = re.compile(
    r'https?://\S+|www\.\S+|\S+\.(?:jpg|png|md|html)\b'
)
_VERSION_DUMP_RE = re.compile(r'VS Code version:.*|OS version:.*')
# "." does not match a newline here, so each boilerplate pattern removes the
# phrase and the remainder of the line it appears on, nothing more.
_BOILERPLATE_RES = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r'Please read our Rules of Conduct.*',
        r'Read our guide about submitting issues.*',
        r'Search existing issues.*',
        r'Test using the latest Insiders build.*',
        r'Use \'Report Issue\'.*',
        r'Launch with `code --disable-extensions`.*',
        r'Issues caused by an extension.*',
        r'Help: Start Extension Bisect.*',
    )
)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_issue_text(text):
    """Strip markup and issue-template boilerplate from one raw issue body.

    Steps, in order:

    1. Missing values (``None`` / ``NaN``) become an empty string.
    2. The text is parsed with BeautifulSoup and reduced to its text content.
    3. Markdown images and links (``![alt](target)`` / ``[label](target)``)
       are removed entirely, label included.
    4. Any remaining ``<!-- ... -->`` comments are removed.
    5. URLs and tokens ending in ``.jpg``/``.png``/``.md``/``.html`` are
       removed, as are "VS Code version:" / "OS version:" lines.
    6. Known issue-template phrases are removed together with the rest of
       their line (case-insensitive).
    7. Runs of whitespace are collapsed to single spaces and the result is
       stripped. This happens last because steps 5 and 6 rely on line breaks
       to limit how much text they delete.

    Parameters
    ----------
    text : str or missing value
        Raw issue text. Other non-null scalars are converted with ``str()``.

    Returns
    -------
    str
        The cleaned, single-line text (``""`` for missing input).
    """
    if pd.isna(text):
        return ""

    # BeautifulSoup expects markup as str/bytes; coerce anything else (e.g. a
    # numeric cell) instead of failing inside the parser.
    if not isinstance(text, (str, bytes)):
        text = str(text)

    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text(separator=" ")

    # Remove markdown images first so the link pattern does not leave a
    # stray "!" behind.
    text = _MD_IMAGE_RE.sub('', text)
    text = _MD_LINK_RE.sub('', text)

    # Remove GitHub comment tags <!-- -->
    text = _HTML_COMMENT_RE.sub('', text)

    # Remove links, file names and version dumps
    text = _URL_OR_FILE_RE.sub('', text)
    text = _VERSION_DUMP_RE.sub('', text)

    # Remove boilerplate phrases
    for boilerplate_re in _BOILERPLATE_RES:
        text = boilerplate_re.sub('', text)

    # Remove excess whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()


In [None]:
# Run the cleaner over every raw issue body and store the result next to it
df["cleaned_text"] = df["raw_text"].apply(clean_issue_text)

# Raw vs. cleaned text for the first three rows
df.head(3)[["raw_text", "cleaned_text"]]


In [None]:
def chunk_text(text, min_length=20):
    """Split text into chunks and keep only the sufficiently long ones.

    The text is cut at line breaks, at a ``-``, ``*`` or ``•`` followed by
    whitespace, and at whitespace that follows ``.``, ``?`` or ``!``. Each
    piece is stripped of surrounding whitespace.

    Parameters
    ----------
    text : str
        Text to split. Non-string input (e.g. ``None`` or ``NaN``) yields an
        empty list.
    min_length : int, default 20
        A piece is kept only if its stripped length is strictly greater than
        this value.

    Returns
    -------
    list of str
        The retained pieces, in their original order.
    """
    if not isinstance(text, str):
        # re.split would raise TypeError on missing values; treat as "no chunks"
        return []

    # Break by line, bullet, or punctuation
    pieces = re.split(r'[\n\r]|[-*•]\s+|(?<=[.?!])\s+', text)
    stripped_pieces = (piece.strip() for piece in pieces)
    return [piece for piece in stripped_pieces if len(piece) > min_length]

# Chunk every cleaned issue; each cell of the new column holds that issue's chunk list
df["chunks"] = df["cleaned_text"].apply(chunk_text)

# Chunks produced for the first issue
df["chunks"].iloc[0]


# The next step is optional: it prints a few sample chunks for manual labeling

In [None]:
# Flatten the per-issue chunk lists into one Series with a chunk per row,
# discard missing entries and renumber the index from zero
sample_data = (
    df["chunks"]
    .explode()
    .dropna()
    .reset_index(drop=True)
)

# Print the first five chunks, each followed by a dashed rule, for copy-pasting
for i, record in enumerate(sample_data.iloc[:5]):
    print(f"Sample {i+1}:\n{record}\n{'-'*60}")
