In [47]:
# Team05 URLClass
# Import necessary libraries
from pydantic import BaseModel, Field, HttpUrl, ValidationError, validator
import pandas as pd  # For handling CSV file operations
import re  # Regular expressions library for text matching

# Define the URLClass model using Pydantic's BaseModel
class URLClass(BaseModel):
    """One row of the Team05 CSV, validated and normalised.

    Field aliases are the CSV column headers, so a row converted with
    ``row.to_dict()`` can be passed straight to ``URLClass.parse_obj``.

    Cells coming from pandas are not always strings: an empty cell arrives as a
    float NaN. Every ``pre=True`` validator therefore type-checks its input
    before calling ``str`` methods on it, so a bad cell is either replaced by a
    placeholder or reported as a validation error instead of escaping as an
    ``AttributeError``.
    """

    # Free-text columns; must be non-blank (see check_non_empty).
    article: str = Field(alias='Article')
    topic: str = Field(alias='Topic')
    # Four-digit year as text, or the placeholder "Year Not Found".
    year: str = Field(alias='Year')
    # Whitespace-normalised level, or the placeholder "Level Not Found".
    level: str = Field(alias='Level')
    introduction: str = Field(alias='Introduction')
    learning_outcomes: str = Field(alias='Learning Outcomes')
    summary: str = Field(alias='Summary')
    link_to_the_summary_page: HttpUrl = Field(alias='Link to the Summary Page')
    link_to_the_pdf_file: HttpUrl = Field(alias='Link to the PDF File')

    @validator('article', 'introduction', 'topic', 'learning_outcomes', 'summary', pre=True)
    def check_non_empty(cls, v):
        """Reject non-string or blank values; return the text stripped of outer whitespace."""
        if not isinstance(v, str) or not v.strip():
            raise ValueError("This field cannot be empty or just whitespace.")
        return v.strip()

    @validator('year', pre=True)
    def extract_year_and_format(cls, v):
        """Return the first run of four digits in the value, else "Year Not Found".

        Non-string input (a numeric year, a NaN cell) is converted to text first,
        so every input yields either a year or the placeholder rather than an
        implicit ``None``. The year is returned as ``str`` to match the field type.
        """
        text = v if isinstance(v, str) else str(v)
        year_match = re.search(r'\d{4}', text)
        if year_match:
            return year_match.group(0)
        return "Year Not Found"

    @validator('level', pre=True)
    def format_level(cls, v):
        """Collapse whitespace runs; map missing or "CFA Program..." values to "Level Not Found"."""
        if not isinstance(v, str):
            return "Level Not Found"
        clean_level = " ".join(v.split())
        if clean_level.startswith("CFA Program"):
            # A value starting with "CFA Program" is treated as "no level given".
            return "Level Not Found"
        return clean_level

    @validator('link_to_the_pdf_file', pre=True)
    def adjust_pdf_link(cls, v):
        """Turn a site-relative PDF path into an absolute cfainstitute.org URL.

        Any other string is passed through unchanged; the ``HttpUrl`` field type
        performs the actual URL validation afterwards.

        Raises:
            ValueError: If the value is not a string (for example a NaN cell).
        """
        if not isinstance(v, str):
            raise ValueError('PDF link must be a string.')
        if v.startswith('/'):
            return f"https://www.cfainstitute.org{v}"
        return v

    @validator('link_to_the_summary_page', pre=True)
    def validate_summary_link(cls, v):
        """Require the summary link to be a string that starts with "http".

        Raises:
            ValueError: If the value is not a string or does not start with "http".
        """
        if not isinstance(v, str) or not v.startswith('http'):
            raise ValueError('Summary page link must start with http or https.')
        return v

# Function to validate data in a CSV file against the URLClass model
def validate_csv(file_path):
    """Validate every row of a CSV file against ``URLClass``.

    Args:
        file_path: Path of the CSV file; handed to ``pandas.read_csv``.

    Returns:
        A ``(valid_rows, invalid_rows_details)`` tuple. ``valid_rows`` is the list
        of ``URLClass`` instances that parsed successfully. Each entry of
        ``invalid_rows_details`` is a dict with the keys ``'Row'`` (DataFrame index
        plus one), ``'Errors'`` (the result of ``ValidationError.errors()``) and
        ``'Value'`` (the raw row as a dict).
    """
    frame = pd.read_csv(file_path)
    accepted, rejected = [], []

    for position, series in frame.iterrows():
        record = series.to_dict()
        try:
            model = URLClass.parse_obj(record)
        except ValidationError as exc:
            # Keep the raw row next to its errors so the report can show the offending value.
            rejected.append({'Row': position + 1, 'Errors': exc.errors(), 'Value': record})
        else:
            accepted.append(model)

    return accepted, rejected

# Function to print detailed error messages for invalid rows
def print_validation_errors(invalid_rows_details):
    """Print a per-row report of validation failures.

    Args:
        invalid_rows_details: List of dicts with the keys ``'Row'`` (row label),
            ``'Errors'`` (list of error dicts carrying ``'loc'`` and ``'msg'``) and
            ``'Value'`` (the raw row as a dict), as produced by ``validate_csv``.

    Prints the number of invalid rows, then one line per error giving the field,
    the message and the offending value. When the error location is not a key of
    the row (for instance the CSV has no such column) the value is shown as
    ``<missing>`` instead of raising ``KeyError``.
    """
    print(f"Invalid Rows Count: {len(invalid_rows_details)}\n")
    for detail in invalid_rows_details:
        print(f"Row {detail['Row']} Errors:")
        for error in detail['Errors']:
            # An error without a usable location must not abort the whole report.
            field = error['loc'][0] if error.get('loc') else '<unknown>'
            value = detail['Value'].get(field, '<missing>')
            print(f"  - Field: {field}, Error: {error['msg']}, Value: {value}")
        print("\n")



# Input CSV; a relative path, so it is resolved against the notebook's working directory.
file_path = 'Team05.csv'

# Run every row through URLClass, then report how many rows passed and, for each
# row that failed, which field was rejected, why, and what value it held.
valid_rows, invalid_rows_details = validate_csv(file_path)
print(f"Valid Rows Count: {len(valid_rows)}")
print_validation_errors(invalid_rows_details)

Valid Rows Count: 145
Invalid Rows Count: 79

Row 19 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 63 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 71 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 72 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 73 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 74 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 81 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 97 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 98 Errors:
  - Field: Summary, Error: This field cannot be empty or just whitespace., Value: nan


Row 102 Errors:
  - Field: 

## Generating the Cleaned CSV File

In [13]:
import pandas as pd
from pydantic import ValidationError

# Requires URLClass from the validation cell above; run that cell first.

# Placeholder written to a CSV column when validation of that column fails.
_COLUMN_FALLBACKS = {
    'Year': "Year Not Found",
    'Link to the Summary Page': "Link Not Found",
    'Link to the PDF File': "Link Not Found",
}

# Model attribute name -> CSV column, so an error location is recognised whether
# it carries the attribute name or the column alias.
_FIELD_TO_COLUMN = {
    'year': 'Year',
    'link_to_the_summary_page': 'Link to the Summary Page',
    'link_to_the_pdf_file': 'Link to the PDF File',
}


def transform_and_validate_data(row):
    """Return a cleaned dict for one CSV row, keyed by CSV column name.

    Args:
        row: A pandas Series for one row, as yielded by ``DataFrame.iterrows()``.

    Returns:
        dict: If the row validates against ``URLClass``, the model's values keyed
        by alias. Otherwise the raw values, with missing cells replaced by
        "Content Not Found" and each column that failed validation replaced by
        its "... Not Found" placeholder. In both cases a site-relative PDF link is
        made absolute and a trailing "Curriculum" is removed from the year.

    The validation report printed earlier in this notebook lists fields by their
    column names (e.g. "Summary"), i.e. error locations carry the alias rather
    than the attribute name. Failed fields are therefore matched by column name,
    with attribute names accepted as well.
    """
    row_dict = row.to_dict()

    try:
        validated_data = URLClass.parse_obj(row_dict)
        # by_alias=True keeps the CSV column headers as keys.
        transformed_data = validated_data.dict(by_alias=True)
    except ValidationError as e:
        # Start from the raw row, making missing cells explicit.
        transformed_data = {k: (v if pd.notna(v) else "Content Not Found") for k, v in row_dict.items()}

        for error in e.errors():
            loc = error.get('loc') or (None,)
            field = loc[0]
            if not field:
                print("Unexpected error structure:", error)
                continue
            column = _FIELD_TO_COLUMN.get(field, field)
            if column in _COLUMN_FALLBACKS:
                transformed_data[column] = _COLUMN_FALLBACKS[column]

    # Clean-ups that apply whether or not validation succeeded.
    pdf_link = transformed_data.get('Link to the PDF File')
    if pd.isna(pdf_link):
        transformed_data['Link to the PDF File'] = "Link Not Found"
    elif isinstance(pdf_link, str) and pdf_link.startswith('/'):
        transformed_data['Link to the PDF File'] = f"https://www.cfainstitute.org{pdf_link}"

    # Remove "Curriculum" suffix from the Year column; tolerate a missing column.
    year_text = str(transformed_data.get('Year', ''))
    if year_text.endswith("Curriculum"):
        transformed_data['Year'] = year_text[:-len("Curriculum")].strip()

    return transformed_data

# Read the raw scrape again so this cell does not depend on earlier cell state.
df = pd.read_csv('Team05.csv')

# Clean each row on its own, then assemble the frame in one step.
cleaned_records = [transform_and_validate_data(record) for _, record in df.iterrows()]
transformed_df = pd.DataFrame(cleaned_records)

# Persist the cleaned table without the DataFrame index column.
transformed_df.to_csv('cleaned.csv', index=False)
