## Import Langchain, Kor, OpenAI

In [None]:
#importing openai and secret key
import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Read the OpenAI secret from the process environment.
# os.getenv is a function, so it has to be called (subscripting it raises
# TypeError); it evaluates to None when the variable is not set.
openai_key = os.getenv("OPEN_API_KEY")


In [None]:
#importing module for langchain and kor
import re
from typing import List, Optional

import openai
import pandas as pd
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field, validator

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from kor import extract_from_documents, from_pydantic, create_extraction_chain

In [None]:
#importing llm model

#from langchain.llms import OpenAI

# Chat model handed to the extraction chains built later in this notebook.
llm = ChatOpenAI(
    openai_api_key=openai_key,
    temperature=0,  # deterministic output: don't be creative and make up answers
    model_name="gpt-3.5-turbo",
)


## Building Schema (Example)

In [None]:
# Example Kor schema: describes the "personal_info" object to extract.
# Each attribute carries its own (input text, expected value) examples, and the
# object-level example shows one text yielding several records.
schema = Object(
    id="personal_info",
    description="Personal information about a given person.",
    attributes=[
        # Free-text attributes.
        Text(
            id="first_name",
            description="The first name of the person",
            examples=[("John Smith went to the store", "John")],
        ),
        Text(
            id="last_name",
            description="The last name of the person",
            examples=[("John Smith went to the store", "Smith")],
        ),
        # Numeric attribute.
        Number(
            id="age",
            description="The age of the person in years.",
            examples=[("23 years old", "23"), ("I turned three on sunday", "3")],
        ),
    ],
    # Object-level example: a single passage mapped to two extracted records.
    examples=[
        (
            "John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.",
            [
                {"first_name": "John", "last_name": "Smith", "age": 23},
                {"first_name": "Jane", "last_name": "Doe", "age": 5},
            ],
        )
    ],
    # many=True: the object may occur more than once in a text (as in the example above).
    many=True,
)


In [None]:
# Create the extraction chain from the LLM and the schema, using the default encoder (CSVEncoder).
chain = create_extraction_chain(llm, schema)

# Alternative: create the chain with the JSON encoder instead.
#chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json", input_formatter=None)

In [None]:
# Print the actual prompt that is sent to the LLM, with "[user input]" standing in for the text to analyse.

print(chain.prompt.format_prompt(text="[user input]").to_string())

Ways of seeing the raw data

In [None]:
# Run the chain on one sample sentence; the notebook displays the returned value.
chain.run("David Jones was 34 years old a long time ago.")

In [None]:
# Inspect the value returned by predict() to see what kind of data structure the chain produces.
result = chain.predict(text="David Jones was 34 years old a long time ago.")
print(result)


In [None]:
# Run the same sentence again and keep only the entries stored under the schema id ("personal_info").
# NOTE(review): this assumes predict() returns a dict with a "data" key — confirm against the installed kor/langchain versions.
result = chain.predict(text="David Jones was 34 years old a long time ago.")
parsed_data = result["data"]["personal_info"]
print(parsed_data)


## Load Document

In [None]:
def import_document(filename):
    """Read a UTF-8 text file and return its whole content.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a string, or ``None`` when the file does not
        exist or cannot be read (an error message is printed in that case).
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        # Report the path that was actually requested.
        print(f"Error: File '{filename}' not found.")
    except Exception as e:
        # Deliberate best-effort: any other read/decoding problem is reported
        # and signalled to the caller through the None return value.
        print(f"Error occurred while importing the document: {e}")
    return None


# Load the FCC order text and, when loading succeeded, dump it for a quick visual check.
filename = "data/Kuiper_FCC-20-102A1.txt"
document = import_document(filename)
if document is not None:
    print("Document content:", document, sep="\n")

## Exploring Document

In [None]:
# Length of the loaded document (number of characters when `document` is the string returned by import_document).
len(document)

In [None]:
import textacy
# Turn the raw text into a spaCy Doc with the "en_core_web_sm" pipeline and print textacy's preview of it.
doc = textacy.make_spacy_doc(document, lang="en_core_web_sm")
print(doc._.preview)

In [None]:
from textacy import text_stats as ts

# Number of words and number of unique words
print("Number of words: ", ts.n_words(doc))
print("Number of unique words: ", ts.n_unique_words(doc))

# Entropy of the words in the document: measures how much information is produced, on average, per word
print("Entropy: ", ts.entropy(doc))

# Type-Token Ratio (TTR): the direct ratio of the number of unique words (types) to the number of all words (tokens)
print("Diversity: ", ts.diversity.ttr(doc))

# Flesch-Kincaid grade level: a readability test designed to indicate how difficult a passage is to read
print("Flesch Kincaid: ",ts.flesch_kincaid_grade_level(doc))


## Splitting Document

In [None]:
# Wrap the raw text in a langchain Document (this rebinds `doc`, which held the spaCy Doc before)
# and split it into chunks with the splitter's default settings.
doc = Document(page_content = document)
split_docs = RecursiveCharacterTextSplitter().split_documents([doc])

In [None]:
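# TEMPLATE (disabled, kept as a string): building a regular expression to split the document into its top-level sections, e.g.:
# NOTE(review): the import paths inside this template differ from the ones used at the top of this notebook
# (langchain.schema / langchain.text_splitter) — verify them before enabling the template.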
""" I. INTRODUCTION
II. BACKGROUND
III. DISCUSSION
IV. ORDERING CLAUSES


import re
from langchain.document import Document
from langchain.splitter import RecursiveCharacterTextSplitter

def section_level_split_with_titles(text):
    # Define a regular expression to match section titles with Roman numerals
    section_pattern = r'^([IVXLCDM]+)\. ([A-Z\s]+)$'
    sections = []

    # Find all matches for section titles and their corresponding content
    for match in re.finditer(section_pattern, text, re.MULTILINE):
        section_title = match.group(0)
        section_content_start = match.end()
        next_match = re.search(section_pattern, text[match.end():], re.MULTILINE)
        section_content_end = next_match.start() if next_match else None
        section_content = text[section_content_start:section_content_end].strip()
        sections.append((section_title, section_content))

    return sections

# Example usage:
file_path = "path/to/your/document.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    document_text = file.read()

sections = section_level_split_with_titles(document_text)

# Create the main content by combining all section contents
main_content = "\n\n".join(section_text for _, section_text in sections)

doc = Document(page_content=main_content)
split_docs = RecursiveCharacterTextSplitter().split_documents([doc])

# Now you have the split documents in the split_docs list
# You can process each split document further as needed """


## Building Schema - Orbital Environment

We need to specify the object description and give examples to get good extraction. Giving only the object description, or only the examples, also works, but the results produced are not as good.

In [None]:
# Kor schema for the "orbital_environment" object extracted from FCC authorization orders.
# Every attribute has a description plus (input text, expected value) examples; the
# object-level example at the end shows one passage producing one record per
# altitude/inclination shell.
schema = Object(
    id="orbital_environment",
    description="Orbital Environment of satellite constellation authorized by Federal Communications Commission (FCC)",
    attributes=[
        # --- Constellation-level attributes (repeated on every record) ---
        Text(
            id="company_name",
            description="The name of the company that sent the application to deploy or operate the satellite constellation",
            examples=[("In this Order and Declaratory Ruling, we grant the request of ViaSat, Inc.", "ViaSat, Inc.")],
        ),
        Text(
            id="orbit_type",
            description="The orbit type into which the satellites will be launched",
            examples=[("For the proposed of non-geostationary orbit (NGSO) satellite system.", "non-geostationary orbit (NGSO)")],
        ),
        Text(
            id="application",
            description="The application or services that the satellites would provide",
            examples=[("The satellites constellation as amended will provide fixed-satellite service (FSS).", "fixed-satellite service (FSS)"),
                      ("We authorize them to continue and enhance its provision of mobile-satellite service (MSS)", "mobile-satellite service (MSS)")],
        ),
        Text(
            id="date_50",
            description="Date on which the company was ordered to launch and operate 50 percent of its satellites",
            examples=[("They must launch 50 percent of the maximum number of proposed space stations in accordane with the authorization no later than November 2, 2027.", "2 November 2027")],
        ),
        Text(
            id="date_100",
            description="Date on which the company was ordered to launch and operate the remaining (100 percent) of its satellites",
            examples=[("They also must launch the remaining space stations no later than November 2, 2030.", "2 November 2030")],
        ),
        Number(
            id="total_sat_const",
            # Typo fixed ("TThe" -> "The") so the prompt text matches the OrbitEnv field description.
            description="The concluding total number of satellites that the company has been authorized to deploy and operate for the constellation",
            examples=[("As modified, the constellation will operate with four fewer satellites of 716 satellites", "716")],
        ),
        # --- Per-shell attributes (one value per altitude/inclination combination) ---
        Number(
            id="altitude",
            description="The granted altitudes of the satellites that the company has been authorized to deploy",
            examples=[("These satellites will be placed at an altitudes of 590 kilometers", "590")],
        ),
        Number(
            id="inclination",
            description="The granted inclination of the satellites that the company has been authorized to deploy, respective to the altitudes",
            examples=[("The proposed orbital planes is at an 87.9 degree inclination", "87.9")],
        ),
        Number(
            id="number_orb_plane",
            description="The number of orbital planes, respective to the altitudes and inclination, that the company has been authorized to deploy",
            examples=[("The constellation would have 42 orbital planes", "42")],
        ),
        Number(
            id="total_sat_per_orb_plane",
            description="The specific count of satellites located in each individual orbital plane. This count refers to the total number of satellites within one orbital plane, and it can vary from plane to plane based on the altitude and inclination, and if not mentioned in text, 'total_sat_per_alt_incl' divide by 'number_orb_plane' will give this value",
            examples=[("Each orbital plane would have 36 satellites per plane", "36")],
        ),
        Number(
            id="total_sat_per_alt_incl",
            description="The total number of satellites at a specific altitude and inclination across all orbital planes sharing these characteristics. This count represents the overall number of satellites with the specified altitude and inclination parameters, and if not mentioned in the text, the multiplication of 'number_orb_plane' and 'total_sat_per_orb_plane' will give this value",
            examples=[("A total of 560 satellites at inclination of 33 degree and altitude of 800km will be placed.", "560")],
        ),
        Text(
            id="orbit_shape",
            description="The shape of the orbital plane whether its circular, elliptical or are not mention in the document",
            examples=[("The constellation will consist of satellites in a circular orbit", "circular")],
        ),
    ],
    # Object-level example: one passage, three records (one per altitude shell).
    examples=[
        (

            """In this Order and Authorization, we grant, to the extent set forth below, the request of Kuiper Systems LLC (Kuiper or Amazon) to deploy a non-geostationary satellite orbit (NGSO) system to provide service using certain Fixed-Satellite Service (FSS).
                Operating 3,372 satellites in 98 orbital planes at altitudes of 590 km, 610 km, and 630 km in a circular orbit.
                At 590 km, Kuiper plans 30 orbital planes with 28 satellites per plane for a total of 840 satellites at inclination of 33 degree.
                At 610 km, Kuiper plans 42 orbital planes with 36 satellites per plane for a total of 1512 satellites at inclination of 42 degree.
                At 630 km, Kuiper plans 30 orbital planes with 34 satellites per plane for a total of 1020 satellite at inclination of 51.9 degree.
                It requires Kuiper to launch and operate 50 percent of its satellites no later than July 30, 2026, and Kuiper must launch the remaining space stations necessary to complete its authorized service constellation, place them in their assigned orbits, and operate each of them in accordance with the authorization no later than July 30, 2029.""",

            [
            {"company_name": "Kuiper Systems LLC", "orbit_type": "non-geostationary satellite orbit (NGSO)", "application": "Fixed-Satellite Service (FSS)", "date_50": "30 July 2026", "date_100": "30 July 2029", "total_sat_const": 3372, "altitude": 590, "inclination": 33, "number_orb_plane": 30, "total_sat_per_orb_plane": 28, "total_sat_per_alt_incl": 840, "orbit_shape": "circular"},
            {"company_name": "Kuiper Systems LLC", "orbit_type": "non-geostationary satellite orbit (NGSO)", "application": "Fixed-Satellite Service (FSS)", "date_50": "30 July 2026", "date_100": "30 July 2029", "total_sat_const": 3372, "altitude": 610, "inclination": 42, "number_orb_plane": 42, "total_sat_per_orb_plane": 36, "total_sat_per_alt_incl": 1512,"orbit_shape": "circular"},
            {"company_name": "Kuiper Systems LLC", "orbit_type": "non-geostationary satellite orbit (NGSO)", "application": "Fixed-Satellite Service (FSS)", "date_50": "30 July 2026", "date_100": "30 July 2029", "total_sat_const": 3372, "altitude": 630, "inclination": 51.9, "number_orb_plane": 30, "total_sat_per_orb_plane": 34, "total_sat_per_alt_incl": 1020, "orbit_shape": "circular"}
            ]
        )
    ],
    # many=True: a single document yields several records (see the three-record example above).
    many=True,
)

TODO: elaborate on why it is important to use `many=True`.

It is important to get the examples right and make them very specific to each particular attribute. Don't combine them or mix them up with other attributes.

In [None]:
# Rebuild the chain so that it uses the current `schema` (the orbital_environment object), with the default encoder.
chain = create_extraction_chain(llm, schema)

In [None]:
# Print the prompt generated for the current chain, with "[user input]" standing in for the text to analyse.
print(chain.prompt.format_prompt(text="[user input]").to_string())

In [None]:
# Spot check: the two milestone dates (date_50 and date_100).
chain.run("Boeing must launch 50 percent of the maximum number of proposed space stations, place them in the assigned orbits, and operate them in accordance with this grant no later than November 12 2028, and must launch the remaining space stations necessary to complete its authorized service constellation, place them in their assigned orbits, and operate them in accordance with the authorization no later than May 16 2030.")

In [None]:
# Spot check: company_name and the total number of satellites.
chain.run("In this Order and Declaratory Ruling, we grant in part and defer in part the petition for declaratory ruling of WorldVu Satellites Limited (OneWeb) for modification of its grant of U.S. market access for a 720-satellite constellation authorized by the United Kingdom. As modified, the constellation will operate with four fewer satellites, reduced from 720 to 716 satellites.")

In [None]:
# Spot check: orbit_type and application.
chain.run("In April 2016, OneWeb sought Commission approval for a non-geostationary satellite orbit (NGSO), fixed-satellite service (FSS) system designed “to provide high-speed, affordable broadband connectivity to anyone, anywhere” in the United States.")

In [None]:
# Spot check: altitude, inclination, number of orbital planes and satellites per plane.
chain.run("The proposed Telesat system consists of a constellation of 117 satellites in 11 orbital planes. 59 of the satellites will be place at 6 orbital planes, which are inclined 99.5 degrees, satellites will be at an approximate altitude of 1,000 kilometers. 58 of the satellites will be at other 5 planes, which are inclined 37.4 degrees will be at an approximate altitude of 1,248km. All of the satellites will be in a circular orbit.")

CSVEncoder produces more robust extraction than JSONEncoder. However, if we want to extract nested objects or nested lists, we need to use JSONEncoder.

## Spare Part

This version also has the `maneuverable` and `spin_stabilized` fields.

In [None]:
# Pydantic model for one extracted orbital-environment record.
# The field descriptions mirror the Kor schema of the same domain; the validators
# below reject values whose type or character set does not fit the field.
# Optional fields carry an explicit None default so that they can be omitted.
class OrbitEnv(BaseModel):
    # --- Constellation-level fields ---
    company_name: str = Field(
        description="The name of the company that sent the application to deploy or operate the satellite constellation",
    )
    orbit_type: str = Field(
        description="The orbit type into which the satellites will be launched"
    )
    application: str = Field(
        description="The application or services that the satellites would provide"
    )
    date_50: str = Field(
        description="Date on which the company was ordered to launch and operate 50 percent of its satellites."
    )
    date_100: str = Field(
        description="Date on which the company was ordered to launch and operate the remaining (100 percent) of its satellites"
    )
    total_sat_const: int = Field(
        description="The concluding total number of satellites that the company has been authorized to deploy and operate for the constellation"
    )
    # --- Per-shell fields (lists: one entry per altitude/inclination combination) ---
    altitude: Optional[List[float]] = Field(
        default=None,
        description="The granted altitudes of the satellites that the company has been authorized to deploy"
    )
    inclination: Optional[List[float]] = Field(
        default=None,
        description="The granted inclination of the satellites that the company has been authorized to deploy, respective to the altitudes"
    )
    number_orb_plane: Optional[List[int]] = Field(
        default=None,
        description="The number of orbital planes, respective to the altitudes and inclination, that the company has been authorized to deploy"
    )
    total_sat_per_orb_plane: Optional[List[int]] = Field(
        default=None,
        description="The specific count of satellites located in each individual orbital plane. This count refers to the total number of satellites within one orbital plane, and it can vary from plane to plane based on the altitude and inclination, and if not mentioned in text, 'total_sat_per_alt_incl' divide by 'number_orb_plane' will give this value"
    )
    total_sat_per_alt_incl: Optional[List[int]] = Field(
        default=None,
        description="The total number of satellites at a specific altitude and inclination across all orbital planes sharing these characteristics. This count represents the overall number of satellites with the specified altitude and inclination parameters, and if not mentioned in the text, the multiplication of 'number_orb_plane' and 'total_sat_per_orb_plane' will give this value"
    )
    orbit_shape: Optional[str] = Field(
        default=None,
        description="The shape of the orbital plane whether its circular, elliptical or are not mention in the document"
    )
    # Typed as an integer: the description asks for years and validate_whole_number
    # only accepts ints, so a string-typed value could never pass validation.
    operational_lifetime: Optional[int] = Field(
        default=None,
        description="The operational lifetime of the satellite in the constellation in years"
    )
    maneuverable: Optional[str] = Field(
        default=None,
        description="The satellite having propulsion or can be maneuver. Return 'y' only if the satellite authorized have propulsion or are maneuverable"
    )
    spin_stabilized: Optional[str] = Field(
        default=None,
        description="The satellites are spin-stabilized. Return 'y' only if satellite authorized have spin-stabilizer"
    )

    @validator("company_name", "orbit_type", "application", "maneuverable", "spin_stabilized")
    def validate_name(cls, v):
        """Accept only letters, whitespace, parentheses, periods, commas and hyphens.

        ``None`` is passed through unchanged, in line with the numeric validators.

        Raises:
            ValueError: if the string contains any other character.
        """
        # Raw string: "\s" is a regex class, not a Python string escape.
        if v is not None and not re.match(r"^[a-zA-Z\s().,-]*$", v):
            raise ValueError("The field can only contain alphabetic characters, spaces, parentheses, periods, commas and hyphen.")
        return v

    @validator("total_sat_const", "number_orb_plane", "total_sat_per_orb_plane", "total_sat_per_alt_incl", "operational_lifetime")
    def validate_whole_number(cls, v):
        """Accept an int, a list of ints, or ``None``.

        Raises:
            ValueError: if the value (or any list element) is not an int.
        """
        if isinstance(v, list):
            if not all(isinstance(i, int) for i in v):
                raise ValueError("All elements of the list must be whole numbers.")
        elif v is not None and not isinstance(v, int):
            raise ValueError("The field must be a whole number.")
        return v

    @validator("altitude", "inclination")
    def validate_number(cls, v):
        """Accept an int/float, a list of ints/floats, or ``None``.

        Raises:
            ValueError: if the value (or any list element) is not a number.
        """
        if isinstance(v, list):
            if not all(isinstance(i, (int, float)) for i in v):
                raise ValueError("All elements of the list must be numbers (integer or decimal).")
        elif v is not None and not isinstance(v, (int, float)):
            raise ValueError("The field must be a number (integer or decimal).")
        return v

    @validator("orbit_shape")
    def validate_orbit_shape(cls, v):
        """Accept only letters and whitespace; ``None`` is passed through unchanged.

        Raises:
            ValueError: if the string contains any other character.
        """
        if v is not None and not re.match(r"^[a-zA-Z\s]*$", v):
            raise ValueError("orbit_shape can only contain alphabetic characters and spaces.")
        return v


In [None]:
"""     @validator("date_50", "date_100")
    def validate_date(cls, v):
        if not re.match("^[A-Za-z]+\s[0-3]?[0-9],?\s[0-9]{4}$", v):
            raise ValueError("The field must be a date in the format 'Month DD YYYY' or 'Month DD, YYYY'")
        return v """

In [None]:
# This is an attempt to keep only the rows whose data passes validation (disabled, kept as a string).

""" # Validate the extracted data
valid_data = []
for item in document_extraction_results:
    if isinstance(item, dict) and 'data' in item and 'orbitenv' in item['data']:
        for data in item['data']['orbitenv']:
            try:
                valid_data.append(OrbitEnv(**data))
            except ValidationError:
                pass

valid_data
 """