In [1]:
import os
from openai import OpenAI

token = os.environ["GITHUB_TOKEN"]
# print(token)
# print(os.environ["OPENAI_API_KEY"])
endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o-mini"

client = OpenAI(
    base_url=endpoint,
    api_key=token,
)

response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "What is the capital of England?",
        }
    ],
    temperature=0,
    top_p=1.0,
    max_tokens=1000,
    model=model_name
)


In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [9]:
chunk_size = 26 
chunk_overlap = 4

In [24]:
text1 = "abcdefghijklmnopqrstuvwxyz"
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
c_splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                    chunk_overlap=chunk_overlap)
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap)

In [26]:
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [28]:
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [29]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [30]:
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [31]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [32]:
c_splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                chunk_overlap=chunk_overlap,
                                separator=" ")
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

## Recursive splitting details

`RecursiveCharacterTextSplitter` is recommended for generic text. 

In [33]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [34]:
len(some_text)

496

In [35]:
c_splitter = CharacterTextSplitter(chunk_size=450,
                                chunk_overlap=0,
                                separator=" ")

r_splitter = RecursiveCharacterTextSplitter(chunk_size=450,
                                            chunk_overlap=0,
                                            separators=["\n\n","\n", " ",""])

In [36]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [37]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [47]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related",
 '. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns',
 '. Carriage returns are the "backslash n" you see embedded in this string',
 '. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [65]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [None]:
from langchain.document_loaders import PyPDFLoader

In [70]:
loader = PyPDFLoader("motivation.pdf")
pages = loader.load()

In [77]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separators=["\n\n", "\n", "\n \n" "(?<=\. )", " ", ""],
    length_function=len
)

In [78]:
r_splitter.split_documents(pages)

[Document(metadata={'source': 'motivation.pdf', 'page': 0}, page_content='I  am  Mohamed  Fares  Landoulsi,  AI  engineer  within  Novobit,  a  software  company  located  in  \nBraunschweig,\n \nGermany.\n \nI\n \ngraduated\n \nin\n \nDecember\n \n2023\n \nas\n \na\n \nsoftware\n \nengineer\n \nfrom\n \nthe\n \nHigher\n \nInstitute\n \nof\n \nComputer\n \nScience\n \nAriana,\n \nin\n \nTunisia,\n \nafter\n \na\n \nsuccessful\n \nend\n \nof\n \nstudies\n \ninternship\n \nin\n \nNovobit\n \nin\n \nGermany\n \nthat\n \nled\n \nto\n \nbeing\n \nhired.\n  My  end  of  studies  internship  lasted  about  6  months  from  May  2023  to  November  2023,  in  which  \nI\n \napplied\n \nmy\n \nknowledge\n \nin\n \ndeep\n \nlearning\n \nto\n \nbuild\n \nsmart\n \nglasses\n \nfor\n \nvisually\n \nimpaired\n \npersons,\n \nin\n \na\n \nteam\n \nof\n \n8\n \npersons.\n \nMy\n \nresponsibility\n \nwas\n \nto\n \nbuild\n \nan\n \nobstacle\n \ndetection\n \nand\n \navoidance\n \nsystem\n \nthat\n \nwo

In [81]:
from langchain.document_loaders import NotionDirectoryLoader

loader = NotionDirectoryLoader("notion")
docs = loader.load()

In [83]:
r_splitter.split_documents(docs)

[Document(metadata={'source': 'notion\\Self study & Improvement.md'}, page_content='# Self study & Improvement\n\nI want to outline my study plans to grow in artificial intelligence as an AI engineer. \n\nThis plan is more focused on long term growth with solid understanding of the basics and competence in translating foundational knowledge into clean and professional code. \n\n### **Goals of the self study plans :**\n\n- Become proficient at Pytorch while implementing the state of art breakthrough in AI.\n- Get deep insights and foundational knowledge about NLPs, Transformers, finetuning, LLMs\n- Get so familiar with vector databases and RAG ( Retrieval augmented generation)\n\n### **Strategy :**\n\nTo be able to achieve the aforementioned goals, a strategy will come with measurable actions and deeds that will be exercised on daily/weekly basis. \n\nThis strategy consists of varying the resources of learning, from courses, tutorials, articles to youtube videos, and personal projects.'