<a href="https://colab.research.google.com/github/DilkiSandunika/VGTU_Thesis_Project/blob/main/notebooks/01_data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===================================================================
# CELL 1: Install Necessary Libraries
# ===================================================================
# Install the third-party packages used by this notebook.
# pip is launched through sys.executable so the packages are installed for
# the same interpreter that runs this notebook, and its exit status is
# checked so the success message is only printed when pip actually succeeded.
import subprocess
import sys

print("Installing required libraries...")
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "pandas", "beautifulsoup4", "lxml"],
    capture_output=True,  # keep pip's output so it can be shown on failure
    text=True,
    check=False,
)

if pip_result.returncode == 0:
    print("Libraries installed successfully.")
else:
    # Best effort: report the failure instead of raising, since the packages
    # may already be importable in the current runtime.
    print(f"WARNING: pip exited with status {pip_result.returncode}.")
    print(pip_result.stderr)

# ===================================================================
# CELL 2: Import Libraries and Define File Paths
# ===================================================================
import os
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import files
import re # used below to remove the xmlns declaration before parsing

# Location of the uploaded source XML file.
# In Colab, uploaded files are in the '/content/' directory
input_xml_path = '/content/cctns_frs.xml'

# Destination of the CSV file that holds the parsed requirements
output_csv_path = '/content/parsed_requirements.csv'

# ===================================================================
# CELL 3: Parse the XML File (Using a Different Parser)
# ===================================================================
print(f"Reading and parsing the XML file from: {input_xml_path}")

# Load the whole document into memory as UTF-8 text.
with open(input_xml_path, mode='r', encoding='utf-8') as xml_file:
    content = xml_file.read()

# Remove the first xmlns="..." declaration from the markup. Only the first
# occurrence is dropped (count=1); any later declarations are left in place.
xmlns_pattern = re.compile(r' xmlns="[^"]*"')
content = xmlns_pattern.sub('', content, count=1)

# Build the parse tree with the 'html.parser' backend rather than an XML
# parser.
soup = BeautifulSoup(content, features='html.parser')

# Collect every <req> element; these hold the requirements.
all_req_tags = soup.find_all(name='req')
print(f"Found {len(all_req_tags)} requirement tags in the document.")

# ===================================================================
# CELL 4: Extract and Structure the Data
# ===================================================================
# One dictionary per requirement, with the keys 'id' and 'text'.
requirements_data = []

for req in all_req_tags:
    # Fall back to 'N/A' when the <req> tag has no id attribute.
    req_id = req.get('id', 'N/A')

    # Look the <text_body> child up once and reuse the result.
    text_body = req.find('text_body')
    if text_body is not None:
        # strip=True trims each text fragment separately. Without a
        # separator, the fragments on either side of a nested tag would be
        # concatenated directly and neighbouring words would be fused, so
        # the fragments are joined with a single space. A body made of one
        # plain text node yields the same string as before.
        req_text = text_body.get_text(separator=' ', strip=True)
    else:
        req_text = "No text body found"

    requirements_data.append({'id': req_id, 'text': req_text})

print("Successfully extracted and structured the requirement data.")

# ===================================================================
# CELL 5: Display the Data in a DataFrame
# ===================================================================
# Turn the list of requirement dictionaries into a pandas DataFrame.
# The columns are named explicitly so the frame (and the CSV written from it
# later) keeps its 'id' / 'text' header even when no requirements were parsed;
# for non-empty input the result is the same as without the argument.
df_requirements = pd.DataFrame(requirements_data, columns=['id', 'text'])

# Show the first 10 rows as a quick sanity check.
print("\n--- First 10 Parsed Requirements ---")
display(df_requirements.head(10))


# ===================================================================
# CELL 6: Save the Processed Data
# ===================================================================
# Persist the parsed requirements as a CSV file so that later notebooks can
# load the processed data directly. index=False keeps the DataFrame's row
# index out of the file.
df_requirements.to_csv(path_or_buf=output_csv_path, index=False)

# Report where the file was written (two lines of output).
print(
    f"\nSuccessfully saved the parsed data to: {output_csv_path}",
    "You can see this file in the file browser on the left.",
    sep="\n",
)

# ===================================================================
# CELL 7: Download the File (Optional)
# ===================================================================
# Ask Colab to send the CSV written to output_csv_path to the browser as a
# download, so a copy of the processed data is also available locally.
# This cell is optional; the file has already been saved by the previous cell.
print("\nTriggering download of the processed CSV file...")
files.download(output_csv_path)

Installing required libraries...
Libraries installed successfully.
Reading and parsing the XML file from: /content/cctns_frs.xml
Found 115 requirement tags in the document.
Successfully extracted and structured the requirement data.

--- First 10 Parsed Requirements ---


Unnamed: 0,id,text
0,1,The solution should provide detailed context-s...
1,2,The solution should provide detailed context-s...
2,1,The solution should provide an interface for t...
3,2,"The solution should send alerts (e.g., email, ..."
4,3,The solution should enable the user to track t...
5,4,The solution should enable the help-desk user ...
6,5,The support solution should be accessible to t...
7,1,An audit trail is a record of actions taken by...
8,2,Once the audit trail functionality has been ac...
9,3,The System must maintain the audit trail for a...



Successfully saved the parsed data to: /content/parsed_requirements.csv
You can see this file in the file browser on the left.

Triggering download of the processed CSV file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>