### Pydantic


###### Importing Libraries

In [None]:
import pandas as pd
from pydantic import BaseModel, ValidationError, validator
from pydantic.networks import HttpUrl
from typing import Optional
import re


##### Creating URL Class

In [None]:
class URLClass(BaseModel):
    """Schema for one worksheet row: topic metadata plus two links.

    Every field is annotated ``Optional[...]``, so ``None`` is an accepted
    value for each of them.

    NOTE(review): no field declares a default. The error output recorded in
    this notebook references pydantic 2.6, where an ``Optional`` annotation
    without a default presumably still makes the field required (``None`` is
    allowed, omitting the key is not) -- confirm that this is intended.
    """

    ID: Optional[int]  # integer or None
    TOPIC_NAME: Optional[str]
    YEAR: Optional[str]  # held as a string, not an int
    LEVEL: Optional[str]
    INTRODUCTION_SUMMARY: Optional[str]
    LEARNING_OUTCOMES: Optional[str]
    SUMMARY_PAGE_LINK: Optional[HttpUrl]  # parsed as an HTTP(S) URL
    PDF_FILE_LINK: Optional[HttpUrl]  # parsed as an HTTP(S) URL


##### Validation for ID

In [None]:
@validator('ID')
    def validate_id(cls, v):
        """Reject negative ``ID`` values.

        Args:
            v: The ``ID`` value to check, or ``None``.

        Returns:
            ``v`` unchanged when it is ``None`` or greater than or equal to 0.

        Raises:
            ValueError: If ``v`` is negative.
        """
        # Zero is accepted by this check, so the message says "non-negative"
        # rather than "positive" to describe what is actually enforced.
        if v is not None and v < 0:
            raise ValueError('ID must be a non-negative integer')
        return v

##### Validation for Year

In [None]:
@validator('YEAR')
    def validate_year(cls, v):
        """Normalise a ``YEAR`` value to a plain year string.

        ``None`` and the placeholder ``'unkn'`` (any letter case, surrounding
        whitespace ignored) both yield ``None``. For any other value every
        non-digit character is dropped and the remaining digits are read as
        one integer.

        Args:
            v: The ``YEAR`` string to normalise, or ``None``.

        Returns:
            The year as a string of digits, or ``None``.

        Raises:
            ValueError: If no digits remain after stripping, or the resulting
                year lies outside 2000-2050 inclusive.
        """
        if v is None:
            return None
        if v.strip().lower() == 'unkn':
            return None  # placeholder for an unknown year

        digits_only = re.sub(r'\D', '', v)
        if not digits_only.isdigit():
            # Only reachable when stripping non-digits left nothing behind.
            raise ValueError(f'Invalid year format: {v}. Must contain numeric characters only.')

        year = int(digits_only)
        if not 2000 <= year <= 2050:
            raise ValueError('Year must be between 2000 and 2050')
        return str(year)

##### Validation for URL

In [None]:
@validator('SUMMARY_PAGE_LINK', 'PDF_FILE_LINK')
    def validate_url(cls, v):
        """Require an https scheme on a link value.

        Args:
            v: A URL object exposing a ``scheme`` attribute, or ``None``.

        Returns:
            ``v`` unchanged when it is ``None`` or its scheme starts with
            ``'https'``.

        Raises:
            ValueError: If the scheme does not start with ``'https'``.
        """
        if v is None:
            return v
        if v.scheme.startswith('https'):
            return v
        raise ValueError('URL must start with "https://"')

##### Validation for Level

In [None]:
@validator('LEVEL')
    def validate_level(cls, v):
        """Validate a ``LEVEL`` value of the form ``"Level <Roman numeral>"``.

        ``None`` is passed through. The placeholder ``'unknown'`` (any letter
        case, surrounding whitespace ignored) is converted to ``None``. Any
        other value must consist of the word ``Level``, at least one
        whitespace character, and a Roman numeral from I to IX or a run of
        one or more ``X``.

        Args:
            v: The ``LEVEL`` string to check, or ``None``.

        Returns:
            ``v`` unchanged when it is ``None`` or well formed, or ``None``
            for the ``'unknown'`` placeholder.

        Raises:
            ValueError: If the value does not have the expected format.
        """
        if v is None:
            return v
        if v.strip().lower() == 'unknown':
            return None  # Treat 'Unknown' as None

        # v is a string at this point (strip() succeeded), so no NaN check is
        # needed. fullmatch is used because '$' with re.match would also
        # accept a value that ends in a newline.
        pattern = r'Level\s+(I|II|III|IV|V|VI|VII|VIII|IX|X+)'
        if not re.fullmatch(pattern, v):
            raise ValueError(f'Invalid format for Level: {v}. It should be "Level" followed by a Roman numeral')
        return v


##### Validation for content 

In [None]:
@validator('INTRODUCTION_SUMMARY', 'TOPIC_NAME', 'LEARNING_OUTCOMES')
    def validate_string_not_empty(cls, v):
        """Replace blank text with the literal marker ``"NULL"``.

        Args:
            v: The text value to check, or ``None``.

        Returns:
            ``"NULL"`` when ``v`` is a string that is empty or contains only
            whitespace; otherwise ``v`` unchanged (including ``None``).
        """
        if v is None or v.strip():
            return v
        return "NULL"

##### Going through each row and creating a clean CSV

In [None]:
# Read the input worksheet, validate every row with URLClass, and append the
# rows that pass (with the values as normalised by the model) to the output CSV.
input_file_path = 'C:\\Users\\Client\\Desktop\\updated_worksheet.csv'
output_file_path = 'C:\\Users\\Client\\Desktop\\worksheet_2.csv'

df = pd.read_csv(input_file_path)

# Validate each row of the DataFrame
validated_data = []
for index, row in df.iterrows():
    # pandas represents missing cells as NaN; hand them to the model as None.
    data = {
        key: (None if pd.isna(value) else value)
        for key, value in row.to_dict().items()
    }

    try:
        url_instance = URLClass(**data)
    except ValidationError as e:
        print(f"Row {index + 1}: Validation error(s): {e.errors()}")
    else:
        # Overlay the model's field values on the raw row so that whatever the
        # validators normalised reaches the output file, while the original
        # column order and any columns the model does not declare are kept.
        validated_data.append({**data, **dict(url_instance)})
        print(f"Row {index + 1}: Data is valid!")

# Create a new DataFrame from the validated data
validated_df = pd.DataFrame(validated_data)

if validated_df.empty:
    # Writing an empty frame would add a stray line to the file and make
    # later runs skip the header, so leave the output untouched.
    print("No valid rows found; nothing was written.")
else:
    # Append the validated data to the output CSV file. newline='' is what
    # pandas asks for when it is given an open file object; without it the
    # csv writer's line endings get translated a second time on Windows.
    with open(output_file_path, 'a', encoding='utf-8', newline='') as f:
        # In append mode the position is the current file size, so the header
        # is written only when the file is still empty.
        validated_df.to_csv(f, index=False, header=f.tell() == 0)

    print(f"Validated data has been appended to {output_file_path}")

In [None]:
# import pandas as pd
# from pydantic import BaseModel, ValidationError, validator
# from pydantic.networks import HttpUrl
# from typing import Optional
# import re

# class URLClass(BaseModel):
#     ID: Optional[int]
#     TOPIC_NAME: Optional[str]
#     YEAR: Optional[str]
#     LEVEL: Optional[str]
#     INTRODUCTION_SUMMARY: Optional[str]
#     LEARNING_OUTCOMES: Optional[str]
#     SUMMARY_PAGE_LINK: Optional[HttpUrl]
#     PDF_FILE_LINK: Optional[HttpUrl]

#     @validator('ID')
#     def validate_id(cls, v):
#         if v is not None and v < 0:
#             raise ValueError('ID must be a positive integer')
#         return v

#     @validator('YEAR')
#     def validate_year(cls, v):
#         if v is not None:
#             if v.strip().lower() == 'unkn':
#                 return None  # Treat 'Unkn' as None
#             year_str = re.sub(r'\D', '', v)  # Extract digits
#             if not year_str.isdigit():
#                 raise ValueError(f'Invalid year format: {v}. Must contain numeric characters only.')
#             year_int = int(year_str)
#             if year_int < 2000 or year_int > 2050:
#                 raise ValueError('Year must be between 2000 and 2050')
#             return str(year_int)
#         else:
#             return None


#     @validator('SUMMARY_PAGE_LINK', 'PDF_FILE_LINK')
#     def validate_url(cls, v):
#         if v is not None and not v.scheme.startswith('https'):
#             raise ValueError('URL must start with "https://"')
#         return v

#     @validator('LEVEL')
#     def validate_level(cls, v):
#         if v is not None:
#             if v.strip().lower() == 'unknown':
#                 return None  # Treat 'Unknown' as None
#             if pd.isna(v):
#                 return ''
#             pattern = r'^Level\s+(I|II|III|IV|V|VI|VII|VIII|IX|X+)$'
#             if not re.match(pattern, v):
#                 raise ValueError(f'Invalid format for Level: {v}. It should be "Level" followed by a Roman numeral')
#         return v

#     @validator('INTRODUCTION_SUMMARY', 'TOPIC_NAME', 'LEARNING_OUTCOMES')
#     def validate_string_not_empty(cls, v):
#         if v is not None and not v.strip():
#             return "NULL"
#         return v

# # Read the CSV file into a pandas DataFrame
# input_file_path = 'C:\\Users\\Client\\Desktop\\updated_worksheet.csv'
# output_file_path = 'C:\\Users\\Client\\Desktop\\worksheet_2.csv'

# df = pd.read_csv(input_file_path)

# # Validate each row of the DataFrame
# validated_data = []
# for index, row in df.iterrows():
#     data = row.to_dict()

#     # Replace NaN values with None
#     for key, value in data.items():
#         if pd.isna(value):
#             data[key] = None

#     try:
#         url_instance = URLClass(**data)
#         validated_data.append(data)
#         print(f"Row {index + 1}: Data is valid!")
#     except ValidationError as e:
#         print(f"Row {index + 1}: Validation error(s): {e.errors()}")

# # Create a new DataFrame from the validated data
# validated_df = pd.DataFrame(validated_data)

# # Append the validated data to the output CSV file
# with open(output_file_path, 'a', encoding='utf-8') as f:
#     validated_df.to_csv(f, index=False, header=f.tell() == 0)

# print(f"Validated data has been appended to {output_file_path}")

### Pytest


In [6]:
import pytest
from pydantic import ValidationError

def test_valid_data():
    """Construct URLClass from a fully populated, well-formed payload."""
    payload = dict(
        ID=1,
        TOPIC_NAME="Machine Learning",
        YEAR="2024",
        LEVEL="Level II",
        INTRODUCTION_SUMMARY="Investment firms are increasingly using technology...",
        LEARNING_OUTCOMES="The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        SUMMARY_PAGE_LINK="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning",
        PDF_FILE_LINK="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning.pdf",
    )
    # Construction raises ValidationError on bad input, so reaching the
    # assert already means the payload was accepted.
    instance = URLClass(**payload)
    assert instance
    print("Valid data")

    
@pytest.mark.parametrize("invalid_id", [-1, "invalid"])
def test_invalid_id(invalid_id):
    """Expect a ValidationError when ``ID`` is negative or not an integer."""
    payload = dict(
        ID=invalid_id,
        TOPIC_NAME="Quantitative Methods",
        YEAR="2024",
        LEVEL="Level II",
        INTRODUCTION_SUMMARY="Investment firms are increasingly using technology...",
        LEARNING_OUTCOMES="The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        SUMMARY_PAGE_LINK="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning",
        PDF_FILE_LINK="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning.pdf",
    )
    with pytest.raises(ValidationError) as excinfo:
        URLClass(**payload)
    # Show the captured error text for inspection when run in the notebook.
    message = str(excinfo.value)
    print(message)

@pytest.mark.parametrize("invalid_title", ["", None])
def test_invalid_title(invalid_title):
    """Check how URLClass handles a blank or missing ``TOPIC_NAME``.

    The payload uses the model's own field names, so the result depends on
    the title value itself rather than on unrelated missing fields.
    ``TOPIC_NAME`` is ``Optional[str]`` and blank text is normalised to the
    marker ``"NULL"`` by ``validate_string_not_empty``, so neither case is
    rejected: a blank title becomes ``"NULL"`` and ``None`` stays ``None``.
    The test name is kept so existing references to it still work.

    Args:
        invalid_title: The blank or ``None`` value placed in ``TOPIC_NAME``.
    """
    data = {
        "ID": 1,
        "TOPIC_NAME": invalid_title,
        "YEAR": "2024",
        "LEVEL": "Level II",
        "INTRODUCTION_SUMMARY": "Investment firms are increasingly using technology at every step of the investment management value chain—from improving their understanding of clients to uncovering new sources of alpha and executing trades more efficiently. Machine learning techniques, a central part of that technology, are the subject of this reading. These techniques first appeared in finance in the 1990s and have since flourished with the explosion of data and cheap computing power.This reading provides a high-level view of machine learning (ML).",
        "LEARNING_OUTCOMES": "The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        "SUMMARY_PAGE_LINK": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning",
        "PDF_FILE_LINK": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning.pdf",
    }
    instance = URLClass(**data)
    expected = None if invalid_title is None else "NULL"
    assert instance.TOPIC_NAME == expected
    # Show the normalised value for inspection when run in the notebook.
    print(instance.TOPIC_NAME)


@pytest.mark.parametrize("invalid_year", ["1999", "2051", "invalid_year"])
def test_invalid_year(invalid_year):
    """Expect a ValidationError for out-of-range or non-numeric ``YEAR`` text."""
    summary_link = "https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning"
    payload = {
        "ID": 1,
        "TOPIC_NAME": "Quantitative Methods",
        "YEAR": invalid_year,
        "LEVEL": "Level II",
        "INTRODUCTION_SUMMARY": "Investment firms are increasingly using technology...",
        "LEARNING_OUTCOMES": "The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        "SUMMARY_PAGE_LINK": summary_link,
        # The PDF link is the summary link with a ".pdf" suffix.
        "PDF_FILE_LINK": summary_link + ".pdf",
    }
    with pytest.raises(ValidationError) as excinfo:
        URLClass(**payload)
    error_text = str(excinfo.value)
    print(error_text)


@pytest.mark.parametrize(
    "invalid_link",
    [
        "invalid_link",
        "ftp://example.com",
        # Well-formed HTTP URL: exercises the https-only rule in validate_url,
        # which the two cases above never reach.
        "http://example.com",
    ],
)
def test_invalid_link(invalid_link):
    """Expect a ValidationError when both link fields hold an unacceptable URL.

    Args:
        invalid_link: Value used for both ``SUMMARY_PAGE_LINK`` and
            ``PDF_FILE_LINK``.
    """
    data = {
        "ID": 1,
        "TOPIC_NAME": "Quantitative Methods",
        "YEAR": "2024",
        "LEVEL": "Level II",
        "INTRODUCTION_SUMMARY": "Investment firms are increasingly using technology...",
        "LEARNING_OUTCOMES": "The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        "SUMMARY_PAGE_LINK": invalid_link,
        "PDF_FILE_LINK": invalid_link,
    }
    with pytest.raises(ValidationError) as excinfo:
        URLClass(**data)
    # Show the captured error text for inspection when run in the notebook.
    print(str(excinfo.value))


In [10]:
# Direct call of the happy-path check; it prints "Valid data" on success.
test_valid_data()

Valid data


In [23]:
# Called with the int 1999 rather than a string, so the error recorded below
# is the string-type check on YEAR, not the 2000-2050 range rule.
test_invalid_year(1999)

1 validation error for URLClass
YEAR
  Input should be a valid string [type=string_type, input_value=1999, input_type=int]
    For further information visit https://errors.pydantic.dev/2.6/v/string_type


In [13]:
# An empty string is not one of the parametrized cases; the recorded output
# shows it failing integer parsing for ID.
test_invalid_id("")

1 validation error for URLClass
ID
  Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/int_parsing


In [18]:
# Both link fields receive the ftp URL, so the recorded output lists one
# URL-scheme error per field.
test_invalid_link("ftp://example.com")

2 validation errors for URLClass
SUMMARY_PAGE_LINK
  URL scheme should be 'http' or 'https' [type=url_scheme, input_value='ftp://example.com', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/url_scheme
PDF_FILE_LINK
  URL scheme should be 'http' or 'https' [type=url_scheme, input_value='ftp://example.com', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/url_scheme


In [None]:
# NOTE(review): this is a well-formed https URL, so the pytest.raises block in
# test_invalid_link would presumably fail because nothing is raised; no output
# is recorded for this cell -- confirm whether this call is intentional.
test_invalid_link("https://example.com")