In [1]:
import os
from lxml import etree
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
import os
load_dotenv()


True

In [2]:
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from pydantic import ValidationError
from sqlalchemy import Sequence


In [3]:
Base = declarative_base()

class LearningOutcomesOrm(Base):
    __tablename__ = 'learning_outcomes_table'

    id = Column(Integer, Sequence('learning_outcomes_table_id_seq'), primary_key=True, autoincrement=True)
    topic = Column(String)
    outcomes = Column(String, default=2023)


  Base = declarative_base()


In [4]:
# Load environment variables
user = os.getenv('SNOWFLAKE_USER')
password = os.getenv('SNOWFLAKE_PASSWORD')
account = os.getenv('SNOWFLAKE_ACCOUNT')
warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
database = os.getenv('SNOWFLAKE_DATABASE')
schema = os.getenv('SNOWFLAKE_SCHEMA')

In [5]:
import csv
import xml.etree.ElementTree as ET
def create_outcomes(csv_filename, xml_file_paths):
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Write header
        csv_writer.writerow(['Topic', 'Learning_Outcomes'])
        
        # Iterate through each XML file
        for xml_file_path in xml_file_paths:
            print(xml_file_path)
            # Read the XML data from the file
            with open(xml_file_path, 'r') as file:
                xml_data = file.read()
            # Parse the XML data
            root = ET.fromstring(xml_data)

            for div_element in root.findall('.//{http://www.tei-c.org/ns/1.0}div'):
                    head_element = div_element.find('.//{http://www.tei-c.org/ns/1.0}head')

                    p_elements = div_element.findall('.//{http://www.tei-c.org/ns/1.0}p')

                    combined_p_text = ' '.join(p_element.text for p_element in p_elements if p_element.text)
                    if combined_p_text != '':
                        csv_writer.writerow([head_element.text if head_element is not None else '', combined_p_text])


In [6]:
raw_file_path = '../resources/grobid_xml_data/raw_pdf_content.csv'
create_outcomes(csv_filename=raw_file_path, xml_file_paths=['../resources/grobid_xml_data/Level1_combined.grobid.tei.xml',
         '../resources/grobid_xml_data/Level2_combined.grobid.tei.xml',
         '../resources/grobid_xml_data/Level3_combined.grobid.tei.xml',])

../resources/grobid_xml_data/Level1_combined.grobid.tei.xml
../resources/grobid_xml_data/Level2_combined.grobid.tei.xml
../resources/grobid_xml_data/Level3_combined.grobid.tei.xml


In [7]:
import pandas as pd
def remove_extra_whitespaces(value):
    if isinstance(value, str):
        return ' '.join(value.split())
    else:
        return value

def pre_process_text(path, output_path):
    df = pd.read_csv(path, dtype=str)
    df=df.fillna('Not Available')
    df = df.map(remove_extra_whitespaces)
    df.to_csv(output_path, index=False)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
processed_file_path = '../resources/clean_csv/processed_pdf_content.csv'

pre_process_text(raw_file_path, processed_file_path)

In [9]:
def create_orm_instances(df):
    orm_instances = []
    for _, row in df.iterrows():
        orm_instance = LearningOutcomesOrm(
                topic=row['Topic'], 
                outcomes=row['Learning_Outcomes']
        )
        orm_instances.append(orm_instance)
    return orm_instances

In [10]:
from pydantic import BaseModel, Field, validator, ValidationError
from typing_extensions import Annotated
from pydantic import BaseModel, ConfigDict, Field, HttpUrl, constr
from typing import Optional, Any
from pydantic.functional_validators import field_validator
from typing_extensions import Annotated
class LearningOutcomes(BaseModel):
    topic: str = Field(..., alias='topic', min_length=2)  
    outcomes: str = Field(default="Not defined", min_length=2)

    @validator('topic', 'outcomes', pre=True, each_item=False)
    def strip_whitespace(cls, v):
        if isinstance(v, str):
            ValueError('empty not allowed')
        return v


/var/folders/6q/q891flcj0r375hpjwrt2wtbm0000gn/T/ipykernel_13089/2748408219.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('topic', 'outcomes', pre=True, each_item=False)


In [11]:
from pydantic import BaseModel, Field, validator, ValidationError

def orm_instance_to_pydantic(orm_instance):
    # print(orm_instance.__dict__)
    return LearningOutcomes(**orm_instance.__dict__)
def convert_to_pydantic_instances(orm_instances):
        return [orm_instance_to_pydantic(orm_instance) for orm_instance in orm_instances]


In [12]:
def create_orm_instances_and_validate_using_pydantic(path):
    try:
        df = pd.read_csv(path)
        orm_instances = create_orm_instances(df)
        pydantic_instances = convert_to_pydantic_instances(orm_instances)
        print(f"{len(pydantic_instances)} are validated")
        return orm_instances
    except Exception as e:
        print(str(e))
        print("Error in validation")


In [13]:
orm_instances = create_orm_instances_and_validate_using_pydantic(processed_file_path)

170 are validated


In [14]:
def create_database_if_not_exists(engine, database):
    connection = engine.connect()
    connection.execute("CREATE DATABASE IF NOT EXISTS {}".format(database))
    connection.close()
def upload_to_snowflake(engine, orm_instances, database):
        create_database_if_not_exists(engine, database)
        Base.metadata.bind = engine
        if not engine.dialect.has_table(engine, LearningOutcomesOrm.__tablename__):
            LearningOutcomesOrm.__table__.create(bind=engine)
        else:
            print(f"Table '{LearningOutcomesOrm.__tablename__}' already exists.")
        
        SessionClass = sessionmaker(bind= engine)
        session = SessionClass()

        # Get the total number of ORM instances to insert
        total_instances = len(orm_instances)

        # Set a threshold for printing progress updates (adjust as needed)
        progress_threshold = 10

        for idx, orm_instance in enumerate(orm_instances, start=1):
            session.add(orm_instance)
            # Commit the changes periodically to the database
            if idx % progress_threshold == 0:
                session.commit()
                print(f"Inserted {idx}/{total_instances} records.")

        # Commit any remaining changes
        session.commit()
        print(f"Inserted {total_instances}/{total_instances} records.")


In [15]:
engine = create_engine(
    f'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'
)
upload_to_snowflake(engine=engine, orm_instances=orm_instances, database=database)

Inserted 10/170 records.
Inserted 20/170 records.
Inserted 30/170 records.
Inserted 40/170 records.
Inserted 50/170 records.
Inserted 60/170 records.
Inserted 70/170 records.
Inserted 80/170 records.
Inserted 90/170 records.
Inserted 100/170 records.
Inserted 110/170 records.
Inserted 120/170 records.
Inserted 130/170 records.
Inserted 140/170 records.
Inserted 150/170 records.
Inserted 160/170 records.
Inserted 170/170 records.
Inserted 170/170 records.
