### ContentPDFClass: Stores the extracted content from each PDF file

In [8]:
! pip3 install lxml

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [9]:
import pandas as pd
from lxml import etree

# File paths
txt_files = [
    "../Datasets/PyPDF/PyPDF_RR_2024_l1_combined.txt",
    "../Datasets/PyPDF/PyPDF_RR_2024_l2_combined.txt",
    "../Datasets/PyPDF/PyPDF_RR_2024_l3_combined.txt"
]

xml_files = [
    "../Datasets/Grobid/Grobid_RR_2024_l1_combined.xml",
    "../Datasets/Grobid/Grobid_RR_2024_l2_combined.xml",
    "../Datasets/Grobid/Grobid_RR_2024_l3_combined.xml"
]

# Read text file content up to the first newline character
def read_first_line(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readline().strip()

# Extract data from XML based on provided structure and requirements
def extract_data_from_xml(file_path):
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    topics = []
    heading =[]
    head_with_p = []  # Correct variable name used
    learning_outcomes_found = False

    with open(file_path, 'rb') as file:
        tree = etree.parse(file)
        divs = tree.findall('.//tei:div', namespaces=ns)
        subtopics = []
        for div in divs:
            head = div.find('tei:head', namespaces=ns)
            if head is not None and head.text:
                subtopics.append(head.text.strip())
                if head.text.strip() == "LEARNING OUTCOMES":
                    subtopics.pop(-1)
                    heading.append(subtopics[-1])
                    subtopics.pop(-1)
                    topics.append(subtopics)
                    subtopics = []
                    continue
    
    topics.append(subtopics)
    # Correctly use head_with_p variable
    return topics, heading



# Process files and collect data
data = []
for txt_file, xml_file in zip(txt_files, xml_files):
    first_line = read_first_line(txt_file)
    topics, heading = extract_data_from_xml(xml_file)
    heading.insert(0,first_line)
    for i in range(len(topics)):
        data.append([xml_file.split('/')[-1], heading[i],'|'.join(topics[i]),len(topics[i])])
    

# Convert to DataFrame
df = pd.DataFrame(data, columns=['File Name','Headings', 'Topics', 'Topics Count'])

# Write to CSV
output_csv_path = "../Datasets/final_output.csv"
df.to_csv(output_csv_path, index=False)


In [10]:
! pip3 install pydantic

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


Step 1: Define Pydantic Models

In [11]:
from pydantic import BaseModel, Field, validator
from typing import List
import re
class ContentPDFClass(BaseModel):
    file_name: str = Field(..., alias='File Name')
    headings: str = Field(..., alias='Headings')
    topics: str = Field(..., alias='Topics')
    topics_count: int = Field(..., alias='Topics Count')
    
    @validator('file_name')
    def file_name_must_be_valid(cls, v):
        if not v.endswith(".xml"):
            raise ValueError("File name must end with .xml")
        return v

    @validator('topics_count')
    def topics_count_must_be_positive(cls, v):
        if v <= 0:
            raise ValueError("Topics count must be positive")
        return v
    


/var/folders/g6/6zt2_rp57dq8l45n6hd1gd_80000gn/T/ipykernel_33831/3652341833.py:10: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('file_name')
/var/folders/g6/6zt2_rp57dq8l45n6hd1gd_80000gn/T/ipykernel_33831/3652341833.py:16: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('topics_count')


Step 2: Validate and Clean Data

In [12]:
import pandas as pd

#df is your DataFrame
valid_data = []
invalid_data = []

for index, row in df.iterrows():
    try:
        # Convert row to dictionary and validate
        topic = ContentPDFClass(**row.to_dict())
        valid_data.append(topic.dict(by_alias=True))
    except Exception as e:
        print(f"Invalid data at row {index}: {e}")
        invalid_data.append(row.to_dict())

# Convert valid data back to DataFrame
clean_df = pd.DataFrame(valid_data)

# Optionally, save invalid data for review
invalid_df = pd.DataFrame(invalid_data)


Step 3: Save the Clean DataFrame to CSV

In [13]:
clean_output_csv_path = "/Users/riyasingh/Downloads/Datasets/clean_final_output.csv"
clean_df.to_csv(clean_output_csv_path, index=False)

In [16]:
! pip install ipytest
! pip3 install ipytest

[0mDefaulting to user installation because normal site-packages is not writeable
Collecting ipytest
  Using cached ipytest-0.14.0-py3-none-any.whl (14 kB)
Collecting pytest>=5.4
  Downloading pytest-8.0.2-py3-none-any.whl (333 kB)
[K     |████████████████████████████████| 333 kB 4.8 MB/s eta 0:00:01
Collecting iniconfig
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting tomli>=1.0.0
  Downloading tomli-2.0.1-py3-none-any.whl (12 kB)
Collecting pluggy<2.0,>=1.3.0
  Downloading pluggy-1.4.0-py3-none-any.whl (20 kB)
Installing collected packages: tomli, pluggy, iniconfig, pytest, ipytest
Successfully installed iniconfig-2.0.0 ipytest-0.14.0 pluggy-1.4.0 pytest-8.0.2 tomli-2.0.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


Test Cases Using Pytest

In [17]:
import ipytest
ipytest.autoconfig()
import pytest

#Positive Test Cases
def test_valid_file_name():
    assert ContentPDFClass(**{'File Name' : 'valid_file.xml', 'Headings' : 'Heading', 'Topics' : 'Topics1|Topic2' , 'Topics Count' : 2})

def test_positive_topics_count():
    assert ContentPDFClass(**{'File Name' : 'valid_file.xml', 'Headings' : 'Heading', 'Topics' : 'Topics1|Topic2' , 'Topics Count' : 2})

def test_valid_topics_string():
    """Test that a valid topics string passes validation."""
    model = ContentPDFClass(**{'File Name': "good_file.xml", 'Headings': "Valid Heading", 'Topics': 'Topic1|Topic2|Topic3', 'Topics Count':3})
    assert model.topics == "Topic1|Topic2|Topic3"

def test_maximum_topics_count():
    """Test that the maximum expected topics count is considered valid."""
    model = ContentPDFClass(**{'File Name':'max_topics.xml', 'Headings': 'Max Heading', 'Topics':'T1|T2', 'Topics Count': 2})
    assert model.topics_count == 2

def test_single_topic():
    """Test that a single topic is valid."""
    model = ContentPDFClass(**{'File Name':'single_topic.xml', 'Headings': 'Single Heading', 'Topics':'OnlyOneTopic', 'Topics Count': 1})
    assert model.topics == "OnlyOneTopic"


def test_headings_with_special_characters():
    """Test that headings with special characters are valid."""
    model = ContentPDFClass(**{'File Name' : 'special_char_heading.xml', 'Headings' : 'Heading & Heading', 'Topics' : 'Topic1' , 'Topics Count' : 1})
    assert '&' in model.headings



#Negative Test Cases

def test_invalid_file_name():
    with pytest.raises(ValueError):
        ContentPDFClass(**{'File Name' : 'invalid_file.txt', 'Headings' : 'Heading', 'Topics' : 'Topics1|Topic2' , 'Topics Count' : 2})


def test_negative_topics_count():
    with pytest.raises(ValueError):
        ContentPDFClass(**{'File Name' : 'file.xml', 'Headings' : 'Heading', 'Topics' : 'Topics1|Topic2' , 'Topics Count' : -1})

def test_invalid_file_extension():
    """Test that an invalid file extension raises a ValueError."""
    with pytest.raises(ValueError):
        ContentPDFClass(**{'File Name' : 'invalid_file.txt', 'Headings' : 'Invalid Extension', 'Topics' : 'Topics1' , 'Topics Count' : 1})


def test_zero_topics_count_with_non_empty_topics():
    """Test that a topics count of zero with non-empty topics string raises a ValueError."""
    with pytest.raises(ValueError):
        ContentPDFClass(**{'File Name' : 'zero_topics.xml', 'Headings' : 'Zero Count', 'Topics' : 'Topics1' , 'Topics Count' : 0})

        
def test_empty_file_name():
    """Test that an empty file name raises a ValueError."""
    with pytest.raises(ValueError):
        ContentPDFClass(**{'File Name' : '', 'Headings' : 'Empty File Name', 'Topics' : 'Topic1' , 'Topics Count' : 1})


ipytest.run('-v')

platform darwin -- Python 3.9.6, pytest-8.0.2, pluggy-1.4.0
rootdir: /Users/riyasingh/Desktop/BigData/Assignment3/notebooks
collected 11 items

t_9963a3b871cf48ceb69ef03108221496.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                            [100%][0m



<ExitCode.OK: 0>