In [2]:
import pandas as pd

# Load the CSV file to check its headers and a few rows
file_path = '../input-data/scraped_data.csv'
try:
    # Attempt to read the CSV with a specified delimiter (e.g., tab)
    data = pd.read_csv(file_path, delimiter='\t')
except Exception as e:
    print("Error reading the CSV file:", e)

# Display the first few rows to understand the schema
data.head()

Unnamed: 0,topic_name,year,level,introduction,learning_outcome,summary,summary_page_link,pdf_file_Link
0,Time-Series Analysis,2024.0,II,"As financial analysts, we often use time-seri...",The member should be able to: calculate and...,The predicted trend value of a time series in...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
1,Credit Analysis Models,2024.0,II,Credit analysis plays an important role in th...,The member should be able to: explain expec...,This reading has covered several important top...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
2,Introduction to Alternative Investments,2023.0,I,"In this section, we explain what alternative ...",The member should be able to: describe type...,This reading provides a comprehensive introduc...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
3,Credit Default Swaps,2024.0,II,Derivative instruments in which the underlyin...,The member should be able to: describe cred...,A credit default swap (CDS) is a contract betw...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
4,Valuation of Contingent Claims,2024.0,II,A contingent claim is a derivative instrument...,The member should be able to: describe and ...,This reading on the valuation of contingent cl...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...


In [46]:
from pydantic import BaseModel, HttpUrl, Field, validator, constr, ValidationError
import re
from typing import Optional
from datetime import datetime

class URLClass(BaseModel):
    topic_name: Optional[str]
    year: Optional[int]
    level: Optional[str]
    introduction: Optional[str]
    learning_outcome: Optional[str]
    summary: Optional[str]
    summary_page_link: Optional[constr(strict=True)]
    pdf_file_Link: Optional[constr(strict=True)]

    @validator('year', always=True)
    def year_must_be_recent_or_none(cls, v):
        if v in [None, '', 'NaN']:
            return None  # Skip validation for None or empty strings
        try:
            v = int(float(v))  # Convert to int, handling strings and floats
        except (ValueError, TypeError):
            raise ValueError('year must be a valid integer')

        current_year = datetime.now().year
        if not (current_year - 10 <= v <= current_year):
            raise ValueError('year must be within the last 10 years')
        return v


    @validator('summary_page_link', 'pdf_file_Link', always=True)
    def urls_must_start_with_specified_pattern_or_none(cls, v):
        if v is not None and v != "" and not v.startswith("https://www.cfainstitute.org"):
            raise ValueError('URL must start with "https://www.cfainstitute.org"')
        return v
    

In [47]:
import csv
from pydantic import ValidationError

def validate_csv(file_path, model):
    with open(file_path, mode='r', encoding='utf-8') as csvfile:
        # Assuming the delimiter is a comma, adjust if necessary
        reader = csv.DictReader(csvfile, delimiter='\t')
        
        valid_rows = []
        errors = []

        for row_number, row in enumerate(reader, start=1):
            try:
                # Convert the row to the Pydantic model
                model_instance = model(**row)
                valid_rows.append(model_instance)
            except ValidationError as e:
                errors.append({'row': row_number, 'error': str(e)})

        return valid_rows, errors

In [49]:
# Adjust the file path as necessary
valid_data, validation_errors = validate_csv(file_path, URLClass)

print(f"Valid Rows Count: {len(valid_data)}")
if validation_errors:
    print("Errors Found:")
    for error in validation_errors:
        print(f"Row {error['row']}: {error['error']}")
else:
    print("No Validation Errors Found.")

Valid Rows Count: 219
Errors Found:
Row 142: 1 validation error for URLClass
year
  value is not a valid integer (type=type_error.integer)
Row 143: 1 validation error for URLClass
year
  value is not a valid integer (type=type_error.integer)
Row 144: 1 validation error for URLClass
year
  value is not a valid integer (type=type_error.integer)
Row 145: 1 validation error for URLClass
year
  value is not a valid integer (type=type_error.integer)
Row 172: 1 validation error for URLClass
year
  value is not a valid integer (type=type_error.integer)
