In [1]:
from pydantic import BaseModel, HttpUrl, ValidationError, Field, field_validator
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse
import re

# Earliest year accepted by URLModel.validate_year.
_MIN_YEAR = 2010
# Only links hosted on this domain are accepted by the URL validators.
_EXPECTED_DOMAIN = "www.cfainstitute.org"
# Roman numerals recognised by URLModel.validate_level and their integer values.
_ROMAN_TO_INT = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5}


def _collapse_whitespace(value):
    """Strip `value` and collapse every whitespace run into a single space.

    Non-string input (including None) is returned unchanged so that pydantic's
    own type validation can accept or reject it.
    """
    if not isinstance(value, str):
        return value
    return re.sub(r'\s+', ' ', value.strip())


def _parse_expected_domain_url(value):
    """Parse `value` as a URL and require it to be hosted on _EXPECTED_DOMAIN.

    Returns the `urlparse` result so callers can run further checks on it.
    Raises ValueError when the network location differs from the expected one.
    """
    # str() lets already-constructed URL objects (or other non-str input)
    # go through urlparse without raising AttributeError.
    parsed_url = urlparse(str(value))
    if parsed_url.netloc.lower() != _EXPECTED_DOMAIN:
        raise ValueError(f"URL must be from {_EXPECTED_DOMAIN}")
    return parsed_url


#Pydantic Class for URLModel with the schema provided for extraction
class URLModel(BaseModel):
    """Validated record for one extracted topic entry.

    Input is keyed by the human-readable aliases (e.g. "Name of the topic").
    The Optional fields accept None but declare no default value.
    """

    Name_of_the_topic: str = Field(alias="Name of the topic")
    Year: Optional[int] = Field(alias="Year")
    Level: int = Field(alias="Level")
    Introduction_Summary: Optional[str] = Field(alias="Introduction Summary")
    Learning_Outcomes: Optional[str] = Field(alias="Learning Outcomes")
    Link_to_the_Summary_Page: HttpUrl = Field(alias="Link to the Summary Page")
    Link_to_the_PDF_File: Optional[HttpUrl] = Field(alias="Link to the PDF File")

    #field validation for Year to check int/none input and check the year is appropriate
    @field_validator('Year', mode='before')
    @classmethod
    def validate_year(cls, value):
        """Normalise Year to an int between 2010 and the current year.

        Args:
            value: None, an int, or a string containing digits
                (e.g. "2024 Curriculum"); all digits in the string are joined.

        Returns:
            None when `value` is None, otherwise the year as an int.

        Raises:
            ValueError: the string holds no digits or the year is out of range.
            TypeError: `value` is neither None, a string nor an int.
                NOTE(review): pydantic only wraps ValueError/AssertionError
                raised by validators, so this TypeError reaches the caller
                unwrapped; it is kept for backward compatibility.
        """
        if value is None:
            return value

        if isinstance(value, str):
            cleaned_year = ''.join(filter(str.isdigit, value))
            if not cleaned_year:
                raise ValueError("Year must contain digits")
            value = int(cleaned_year)

        if not isinstance(value, int):
            raise TypeError("Year must be provided as an integer or string containing digits.")

        #extract current year for comparison
        current_year = datetime.now().year
        if not (_MIN_YEAR <= value <= current_year):
            raise ValueError(f"Invalid Year {value}.")

        return value

    #field validation for Level ; map the roman numerals to integers and pass through the int value
    @field_validator('Level', mode='before')
    @classmethod
    def validate_level(cls, v):
        """Convert a Roman-numeral level to its integer value.

        Args:
            v: a string such as "Level II" or "II", or an int that is already
                one of the mapped values (1-5).

        Returns:
            The level as an int.

        Raises:
            ValueError: the numeral is unknown, the int is outside 1-5, or the
                input is neither a string nor an int.
        """
        # Integers are passed through; bool is excluded because it is an int
        # subclass and True would otherwise be accepted as level 1.
        if isinstance(v, int) and not isinstance(v, bool):
            if v in _ROMAN_TO_INT.values():
                return v
            raise ValueError(f"Invalid Level {v}.")

        if not isinstance(v, str):
            raise ValueError("Level must be a Roman numeral string or an integer")

        roman_numeral = v.replace("Level ", "").strip()
        if roman_numeral in _ROMAN_TO_INT:
            return _ROMAN_TO_INT[roman_numeral]
        raise ValueError("Invalid Roman numeral")

    #field validation for summary page link to check if the link is a valid link
    @field_validator('Link_to_the_Summary_Page', mode='before')
    @classmethod
    def validate_url_domain(cls, v):
        """Require the summary-page link to be hosted on www.cfainstitute.org.

        Returns `v` unchanged; raises ValueError for any other domain.
        """
        _parse_expected_domain_url(v)
        return v

    #field validation for pdf link to check if the link is a valid link and has valid extension
    @field_validator('Link_to_the_PDF_File', mode='before')
    @classmethod
    def validate_url_domainpdf(cls, v):
        """Require the optional PDF link to be on the expected domain and end in .pdf.

        Returns `v` unchanged (None is passed through); raises ValueError when
        the domain differs or the URL path does not end with ".pdf".
        """
        if v is None:
            return v

        parsed_url = _parse_expected_domain_url(v)
        if not parsed_url.path.lower().endswith('.pdf'):
            raise ValueError("URL must end with .pdf")
        return v

    #clean the learning outcomes
    @field_validator('Learning_Outcomes', mode='before')
    @classmethod
    def clean_learning_outcomes(cls, v):
        """Trim the text and collapse whitespace runs; None is passed through."""
        return _collapse_whitespace(v)

    #clean the introduction summary
    @field_validator('Introduction_Summary', mode='before')
    @classmethod
    def clean_intro_summary(cls, v):
        """Trim the text and collapse whitespace runs; None is passed through."""
        return _collapse_whitespace(v)




In [2]:
import json
import csv
from pydantic import ValidationError

#Use the data in JSON for validation
json_file_path = 'CFA.json'
# Explicit encoding: do not depend on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

#Validate each item in the dataset; if cleared save it to csv, if not then omit
validated_data = []
for item in json_data:
    try:
        validated_item = URLModel.model_validate(item)
        validated_data.append(validated_item.model_dump())
    # TypeError is caught as well because the Year validator raises it directly.
    except (ValidationError, TypeError) as e:
        print(f"Error for item {item}: {e}")
        continue


csv_file_path = 'validated_CFA.csv'
# Check for data before opening the file: mode 'w' truncates, so opening first
# would wipe a previous export even when there is nothing new to write.
if validated_data:
    fieldnames = list(validated_data[0].keys())
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(validated_data)
else:
    print("No validated data to save.")


In [3]:
import ipytest
# Set up ipytest before any tests are defined in this cell.
ipytest.autoconfig()
import pytest

## Positive Scenarios Test Cases

#Validate Year
def test_year_positive():
    """Digits embedded in a label string are extracted and returned as an int."""
    # Compare against the exact value: a bare truthiness check would pass for
    # any non-zero return.
    assert URLModel.validate_year("2024 Curriculum") == 2024

#Validate Level    
def test_level_positive():
    """A "Level <Roman numeral>" string maps to the matching integer."""
    # Pin the mapped value rather than relying on truthiness.
    assert URLModel.validate_level("Level II") == 2

#Validate URL for valid Domain    
def test_url_positive():
    """A summary-page URL on the expected domain is returned unchanged."""
    summary_url = "https://www.cfainstitute.org/membership/professional-development/refresher-readings/time-series-analysis"
    assert URLModel.validate_url_domain(summary_url) == summary_url

#Validate URL for valid Domain for PDF link    
def test_urlpdf_positive():
    """A .pdf URL on the expected domain is returned unchanged."""
    pdf_url = "https://www.cfainstitute.org/-/media/documents/protected/refresher-reading/2024/level2/level2a/RR_2024_L2V1R5_time_series_analysis.pdf"
    assert URLModel.validate_url_domainpdf(pdf_url) == pdf_url
 
#Validate optional input for PDF link     
def test_url_pdf():
    """The PDF link is optional: None must come back as None."""
    result = URLModel.validate_url_domainpdf(None)
    assert result is None


## Negative Scenarios Test Cases

#Validate Error for invalid Year
def test_year_negative():
    """A year below the accepted range is rejected with ValueError."""
    out_of_range_year = 1800
    with pytest.raises(ValueError):
        URLModel.validate_year(out_of_range_year)
    
#Validate Error for invalid Level
def test_level_negative():
    """A level whose numeral is not in the Roman-numeral map is rejected."""
    unknown_level = "Level M"
    with pytest.raises(ValueError):
        URLModel.validate_level(unknown_level)

#Validate Error for invalid PDF URL 
def test_url_pdf_invalid_domain():
    """A PDF link on a foreign domain fails with the domain error message."""
    foreign_pdf_url = "https://www.example.com/somepath/document.pdf"
    with pytest.raises(ValueError) as raised:
        URLModel.validate_url_domainpdf(foreign_pdf_url)
    message = str(raised.value)
    assert "URL must be from www.cfainstitute.org" in message

#Validate Error for invalid PDF URL extension    
def test_url_pdf_invalid_extension():
    """A link on the right domain but without a .pdf path is rejected."""
    text_file_url = "https://www.cfainstitute.org/somepath/document.txt"
    with pytest.raises(ValueError) as raised:
        URLModel.validate_url_domainpdf(text_file_url)
    message = str(raised.value)
    assert "URL must end with .pdf" in message

#Validate Error for invalid URL     
def test_url_invalid_domain():
    """A summary-page link on a foreign domain fails with the domain error."""
    foreign_url = "https://www.example.com/somepath"
    with pytest.raises(ValueError) as raised:
        URLModel.validate_url_domain(foreign_url)
    message = str(raised.value)
    assert "URL must be from www.cfainstitute.org" in message
    
                
    
    
# Run the tests defined above; '-v' is passed through as a pytest option.
ipytest.run('-v')
    

platform darwin -- Python 3.11.4, pytest-8.0.2, pluggy-1.4.0
rootdir: /Users/nidhikulkarni
plugins: anyio-4.3.0
collected 10 items

t_f9bd2c2a0cea4f65903ac5367bd25a2e.py ..........                                             [100%]



<ExitCode.OK: 0>