## MetaDataPDFClass

In [2]:
import pandas as pd

# Load the metadata CSV to inspect its headers and a few rows
file_path = '../input-data/metadata-grobid.csv'

# Read with pandas' default comma delimiter. A read failure is deliberately
# not caught: printing it and carrying on would leave `data` undefined (or
# stale from an earlier run), and the `data.head()` below would then fail
# with an unrelated NameError or silently show the wrong frame.
data = pd.read_csv(file_path)

# Display the first few rows to understand the schema
data.head()

Unnamed: 0,file_name,language,version,encoding,file_size,s3_url
0,Grobid_RR_2024_l3_combined.txt,en,1.0,UTF-8,32602,https://bigdata-group3-assignment2.s3.amazonaw...
1,Grobid_RR_2024_l1_combined.txt,en,1.0,UTF-8,52255,https://bigdata-group3-assignment2.s3.amazonaw...
2,Grobid_RR_2024_l2_combined.txt,en,1.0,UTF-8,50250,https://bigdata-group3-assignment2.s3.amazonaw...


In [9]:
from pydantic import BaseModel, HttpUrl, Field, validator, constr, ValidationError
import re
from typing import Optional
from datetime import datetime

class MetaDataPDFClass(BaseModel):
    """Schema for one row of the Grobid metadata CSV.

    The ``s3_url`` validator reads the already-validated ``file_name`` from
    ``values``, so ``file_name`` must stay declared before ``s3_url``.

    NOTE: the V1-style ``@validator`` decorator is kept because it is the
    only validator decorator this notebook imports; Pydantic reports it as
    deprecated in favour of ``@field_validator``.
    """

    # Must look like "Grobid_RR_<year>_l<level>_combined.txt" (see validate_file_name).
    file_name: str
    # Language code; restricted to the allow-list in language_must_be_valid.
    language: str
    # Strictly positive version number.
    version: float
    # Not validated beyond being a string.
    encoding: str
    # Non-negative integer (unit not stated in this notebook; presumably bytes).
    file_size: int
    # Must start with the bucket's Grobid prefix followed by file_name.
    s3_url: HttpUrl

    @validator('language')
    def language_must_be_valid(cls, v):
        """Accept only languages in the allow-list."""
        allowed_languages = ['en']  # Extend this list based on your requirements
        if v not in allowed_languages:
            raise ValueError(f'language must be one of {allowed_languages}')
        return v

    @validator('version')
    def version_must_be_positive(cls, v):
        """Reject zero and negative versions."""
        if v <= 0:
            raise ValueError('version must be positive')
        return v

    @validator('file_size')
    def file_size_must_be_non_negative(cls, v):
        """Reject negative sizes; zero is allowed."""
        if v < 0:
            raise ValueError('file_size must be non-negative')
        return v

    @validator('file_name')
    def validate_file_name(cls, v):
        """Check the file name's shape and the year/level encoded in it.

        Raises:
            ValueError: if the whole name does not match the expected
                pattern, or the year is not 2024, or the level is not 1-3.
        """
        # Pattern to extract year and level_number from file_name
        pattern = r"Grobid_RR_(?P<year>\d{4})_l(?P<level_number>\d+)_combined\.txt"
        # fullmatch, not match: match only anchors the start, so a name with
        # trailing junk such as "..._combined.txt.exe" would be accepted.
        match = re.fullmatch(pattern, v)
        if not match:
            raise ValueError(f"file_name does not match required pattern: {v}")

        # Extract year and level_number from the match object
        year = int(match.group('year'))
        level_number = int(match.group('level_number'))

        # Only the 2024 curriculum, levels 1-3, is accepted.
        if year != 2024 or level_number not in [1, 2, 3]:
            raise ValueError("file_name contains invalid year or level_number")

        return v

    @validator('s3_url', pre=True)
    def validate_s3_url(cls, v, values):
        """Check that the URL points at this row's file under the Grobid prefix.

        Runs with ``pre=True``, i.e. on the raw input before it is coerced to
        ``HttpUrl``.

        Raises:
            ValueError: if ``file_name`` failed its own validation, if the
                raw value is not a string, or if it does not start with the
                expected bucket URL for ``file_name``.
        """
        if 'file_name' not in values:
            raise ValueError("file_name must be validated before s3_url")

        # The raw value may be anything (e.g. None for a short CSV row).
        # Reject non-strings here so they are reported as a validation error
        # instead of crashing with AttributeError on .startswith below.
        if not isinstance(v, str):
            raise ValueError(
                f"s3_url must be a string, got {type(v).__name__}")

        file_name = values['file_name'].strip()  # Ensure no trailing spaces
        expected_url_start = f"https://bigdata-group3-assignment2.s3.amazonaws.com/Grobid/{file_name}"

        # Prefix check only: anything after the file name is tolerated.
        if not v.startswith(expected_url_start):
            raise ValueError(f"s3_url does not match expected pattern: {v}")

        return v


/var/folders/_r/t0b63cp179v0bgf_vxmy6brh0000gp/T/ipykernel_86562/927597858.py:14: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('language')
/var/folders/_r/t0b63cp179v0bgf_vxmy6brh0000gp/T/ipykernel_86562/927597858.py:21: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('version')
/var/folders/_r/t0b63cp179v0bgf_vxmy6brh0000gp/T/ipykernel_86562/927597858.py:27: PydanticDeprecatedSince20: Pydantic V1 style `@vali

In [10]:
import csv
from pydantic import ValidationError

def validate_csv(file_path, model):
    """Validate every data row of a comma-delimited CSV file against a model.

    Args:
        file_path: Path of a UTF-8 CSV file whose first line is the header.
        model: Callable (normally a Pydantic model class) invoked as
            ``model(**row)``, where ``row`` maps header names to the row's
            string cell values.

    Returns:
        A ``(valid_rows, errors)`` tuple. ``valid_rows`` holds the objects
        returned by ``model`` for rows that validated. ``errors`` holds one
        ``{'row': n, 'error': message}`` dict per rejected row, where ``n``
        is the 1-based data-row number (the header is not counted).
    """
    valid_rows = []
    errors = []

    # newline='' hands newline handling to the csv module, which it needs to
    # read quoted fields containing line breaks correctly.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        # Assuming the delimiter is a comma, adjust if necessary
        reader = csv.DictReader(csvfile)

        for row_number, row in enumerate(reader, start=1):
            # DictReader collects cells beyond the header under the key None.
            # Such a row cannot be passed as keyword arguments (TypeError),
            # so report it as a bad row instead of aborting the whole run.
            if None in row:
                errors.append({
                    'row': row_number,
                    'error': f"row has {len(row[None])} more field(s) than the header",
                })
                continue

            try:
                # Convert the row to the Pydantic model
                valid_rows.append(model(**row))
            except ValidationError as e:
                errors.append({'row': row_number, 'error': str(e)})

    return valid_rows, errors

In [11]:
# Validate the metadata CSV loaded above against MetaDataPDFClass and
# summarise the result (adjust file_path in the first cell if necessary).
valid_data, validation_errors = validate_csv(file_path, MetaDataPDFClass)

print(f"Valid Rows Count: {len(valid_data)}")
if not validation_errors:
    print("No Validation Errors Found.")
else:
    print("Errors Found:")
    for error in validation_errors:
        row_no, message = error['row'], error['error']
        print(f"Row {row_no}: {message}")

Valid Rows Count: 3
No Validation Errors Found.


## ContentPDFClass

In [68]:
import pandas as pd
from bs4 import BeautifulSoup

# List of XML files to process
xml_files = [
    '../input-data/Grobid_RR_2024_l1_combined.xml',
    '../input-data/Grobid_RR_2024_l2_combined.xml',
    '../input-data/Grobid_RR_2024_l3_combined.xml'
]

# Column order of the output CSV
content_columns = ["Title", "Idno", "Abstract", "Chapter_Name", "Learning_Outcomes"]

# One tuple per chapter, across all files. The DataFrame is built once at the
# end: concatenating inside the loop re-copies every earlier row on each pass.
# Named `records` so it does not overwrite the `data` frame from the first cell.
records = []

for tei_doc in xml_files:
    # Explicit encoding so reading does not depend on the platform default.
    # NOTE(review): assumes the TEI files are UTF-8 — confirm.
    with open(tei_doc, 'r', encoding='utf-8') as tei:
        # NOTE(review): the 'lxml' (HTML) parser is kept because the text
        # splitting below was developed against its output; bs4 appears to
        # warn about this line (see cell output). Confirm the extracted rows
        # are unchanged before switching to an XML parser.
        soup = BeautifulSoup(tei, 'lxml')

    # Document-level fields, with "N/A" when the element is absent
    doc_title = soup.title.getText() if soup.title else "N/A"
    idno_tag = soup.find('idno', type='MD5')  # look up once, reuse below
    idno = idno_tag.getText() if idno_tag else "N/A"
    abstract = soup.abstract.getText() if soup.abstract else "N/A"

    # Text of every untyped <div> in the body; '\n' as separator preserves
    # line breaks, which the title extraction below depends on.
    divs_text = [
        div.get_text(separator='\n', strip=True)
        for div in soup.body.find_all("div")
        if not div.get("type")
    ]

    plain_text = "\n".join(divs_text)
    parts = plain_text.split("LEARNING OUTCOMES")

    # Each "LEARNING OUTCOMES" marker separates a chapter heading (the last
    # line of the text before it) from that chapter's outcomes (the text
    # after it, up to the next marker).
    for prev_part, next_part in zip(parts, parts[1:]):
        title = prev_part.rstrip().rsplit('\n', 1)[-1]
        content = next_part.strip()
        records.append((doc_title, idno, abstract, title, content))

# Build the combined DataFrame in one step and save it to CSV
all_data = pd.DataFrame(records, columns=content_columns)
all_data.to_csv('ContentPDFClass.csv', index=False)

  soup = BeautifulSoup(tei, 'lxml')


In [69]:
from pydantic import BaseModel, HttpUrl, Field, validator, constr, ValidationError
import re
from typing import Optional
from datetime import datetime

class ContentPDFClass(BaseModel):
    """Schema for one chapter row of the extracted content CSV."""

    # Document title; may be omitted.
    Title: Optional[str] = None
    # Exactly 32 hexadecimal characters.
    Idno: str = Field(..., pattern="^[a-fA-F0-9]{32}$")
    Abstract: str
    Chapter_Name: str
    Learning_Outcomes: str

    @validator('Learning_Outcomes')
    def learning_outcome_must_contain_pattern(cls, v):
        """Require the standard lead-in phrase somewhere in the outcomes text."""
        required_phrase = "The candidate should be able to:"
        if required_phrase in v:
            return v
        raise ValueError(f"LearningOutcome must contain '{required_phrase}'")

/var/folders/_r/t0b63cp179v0bgf_vxmy6brh0000gp/T/ipykernel_47678/2683677914.py:14: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('Learning_Outcomes')


In [70]:
import csv
from pydantic import ValidationError

def validate_csv(file_path, model):
    """Validate every data row of a comma-delimited CSV file against a model.

    Args:
        file_path: Path of a UTF-8 CSV file whose first line is the header.
        model: Callable (normally a Pydantic model class) invoked as
            ``model(**row)``, where ``row`` maps header names to the row's
            string cell values.

    Returns:
        A ``(valid_rows, errors)`` tuple. ``valid_rows`` holds the objects
        returned by ``model`` for rows that validated. ``errors`` holds one
        ``{'row': n, 'error': message}`` dict per rejected row, where ``n``
        is the 1-based data-row number (the header is not counted).
    """
    valid_rows = []
    errors = []

    # newline='' hands newline handling to the csv module, which it needs to
    # read quoted fields containing line breaks correctly.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        # Assuming the delimiter is a comma, adjust if necessary
        reader = csv.DictReader(csvfile)

        for row_number, row in enumerate(reader, start=1):
            # DictReader collects cells beyond the header under the key None.
            # Such a row cannot be passed as keyword arguments (TypeError),
            # so report it as a bad row instead of aborting the whole run.
            if None in row:
                errors.append({
                    'row': row_number,
                    'error': f"row has {len(row[None])} more field(s) than the header",
                })
                continue

            try:
                # Convert the row to the Pydantic model
                valid_rows.append(model(**row))
            except ValidationError as e:
                errors.append({'row': row_number, 'error': str(e)})

    return valid_rows, errors

In [71]:
# Validate the extracted content CSV against ContentPDFClass and summarise
# the result (adjust the file path as necessary).
file_path = 'ContentPDFClass.csv'
valid_data, validation_errors = validate_csv(file_path, ContentPDFClass)

print(f"Valid Rows Count: {len(valid_data)}")
if not validation_errors:
    print("No Validation Errors Found.")
else:
    print("Errors Found:")
    for error in validation_errors:
        row_no, message = error['row'], error['error']
        print(f"Row {row_no}: {message}")

Valid Rows Count: 24
No Validation Errors Found.
