In [1]:
# Import necessary libraries for the script
from pydantic import BaseModel, HttpUrl, validator  # Pydantic for data modeling and validation
from datetime import datetime  # datetime for handling date and time
import csv  # csv for CSV file operations

# Define a Pydantic model for the metadata of PDF files
class MetaDataPDFClass(BaseModel):
    level: str  # The level of the PDF file (e.g., I, II, III)
    file_size_kb: float  # The size of the PDF file in kilobytes
    amazon_storage_class: str  # The Amazon S3 storage class of the PDF file
    s3_text_link: HttpUrl  # The HTTP URL to the PDF file stored in Amazon S3
    file_path: str  # The file path of the PDF file
    content_type: str = "txt"  # Default content type set to text
    date_updated: datetime  # The date when the PDF file was last updated

    # Define a validator to ensure the Amazon storage class is either 'Standard' or 'Glacier'
    @validator('amazon_storage_class')
    def storage_class_must_be_standard_or_glacier(cls, v):
        if v not in ["Standard", "Glacier"]:
            raise ValueError('Amazon_storage_class must be "Standard" or "Glacier"')
        return v

    # Define a validator to ensure the file size is a positive number
    @validator('file_size_kb')
    def file_size_must_be_positive(cls, v):
        if v <= 0:
            raise ValueError('file_size_kb must be a positive number')
        return v

# Metadata for all six PDF files
pdf_data = [
    {"level": "I", "file_size_kb": 35.8, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/Grobid_RR_2024_LevelI_combined.txt", "file_path": "data/Grobid_RR_2024_LevelI_combined.txt", "date_updated": "2024-02-27"},
    {"level": "II", "file_size_kb": 37.5, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/Grobid_RR_2024_LevelII_combined.txt", "file_path": "data/Grobid_RR_2024_LevelII_combined.txt", "date_updated": "2024-02-27"},
    {"level": "III", "file_size_kb": 21.1, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/Grobid_RR_2024_LevelIII_combined.txt", "file_path": "data/Grobid_RR_2024_LevelIII_combined.txt", "date_updated": "2024-02-27"},
    {"level": "I", "file_size_kb": 45, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/PyPDF_2024_l1_combined.txt", "file_path": "data/PyPDF_2024_l1_combined.txt", "date_updated": "2024-02-27"},
    {"level": "II", "file_size_kb": 46, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/PyPDF_2024_l2_combined.txt", "file_path": "data/PyPDF_2024_l2_combined.txt", "date_updated": "2024-02-27"},
    {"level": "III", "file_size_kb": 30, "amazon_storage_class": "Standard", "s3_text_link": "https://cfa-assignment2.s3.us-east-2.amazonaws.com/data/PyPDF_2024_l3_combined.txt", "file_path": "data/PyPDF_2024_l3_combined.txt", "date_updated": "2024-02-27"}
]

# Create instances of MetaDataPDFClass for each item in `pdf_data`
pdf_instances = [MetaDataPDFClass(**data) for data in pdf_data]

# Specify the output CSV file name
csv_file = "/Users/shubh/Desktop/Aww/MetaData_Cleaned.csv"

# Open the CSV file and write the metadata of each PDF instance
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row to the CSV file
    writer.writerow(['Level', 'File Size (KB)', 'Amazon Storage Class', 'S3 Text Link', 'File Path', 'Content Type', 'Date Updated'])
    # Iterate through each PDF instance and write its metadata to the CSV
    for instance in pdf_instances:
        writer.writerow([
            instance.level, 
            instance.file_size_kb, 
            instance.amazon_storage_class, 
            instance.s3_text_link, 
            instance.file_path, 
            instance.content_type, 
            instance.date_updated.strftime('%Y-%m-%d')  # Format the date_updated field
        ])

# Print a confirmation message after successfully creating the CSV file
print(f"CSV file '{csv_file}' created with metadata for all.")


CSV file '/Users/shubh/Desktop/Aww/MetaData_Cleaned.csv' created with metadata for all.


/var/folders/ks/khrkjrdn6ms4yvtd6kj2hn2m0000gn/T/ipykernel_599/195703345.py:17: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('amazon_storage_class')
/var/folders/ks/khrkjrdn6ms4yvtd6kj2hn2m0000gn/T/ipykernel_599/195703345.py:24: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('file_size_kb')
