# RAG

In [4]:
from langchain_ollama import OllamaLLM
import pandas as pd
from langchain.document_loaders import DataFrameLoader

## Loaders

### 1. Pandas

In [2]:
df = pd.read_csv("youtube.csv")
df.head()

Unnamed: 0,RANK,GRADE,NAME,VIDEOS,SUBSCRIBERS,VIEWES
0,1,A++,T-Series,13629,105783888,76945588449
1,2,A,PewDiePie,3898,97853589,22298927681
2,3,A+,5-Minute Crafts,3341,58629572,14860695079
3,4,A++,Cocomelon - Nursery Rhymes,441,53163816,33519273951
4,5,A++,SET India,31923,51784081,36464793233


In [3]:
df = df.iloc[:,2:]
df.head()

Unnamed: 0,NAME,VIDEOS,SUBSCRIBERS,VIEWES
0,T-Series,13629,105783888,76945588449
1,PewDiePie,3898,97853589,22298927681
2,5-Minute Crafts,3341,58629572,14860695079
3,Cocomelon - Nursery Rhymes,441,53163816,33519273951
4,SET India,31923,51784081,36464793233


In [None]:
llm_data = llm_loader = DataFrameLoader(
    data_frame=df,
    page_content_column="NAME"
).load()
llm_data

[Document(metadata={'VIDEOS': 13629, 'SUBSCRIBERS': 105783888, 'VIEWES': 76945588449}, page_content='T-Series'),
 Document(metadata={'VIDEOS': 3898, 'SUBSCRIBERS': 97853589, 'VIEWES': 22298927681}, page_content='PewDiePie'),
 Document(metadata={'VIDEOS': 3341, 'SUBSCRIBERS': 58629572, 'VIEWES': 14860695079}, page_content='5-Minute Crafts'),
 Document(metadata={'VIDEOS': 441, 'SUBSCRIBERS': 53163816, 'VIEWES': 33519273951}, page_content='Cocomelon - Nursery Rhymes'),
 Document(metadata={'VIDEOS': 31923, 'SUBSCRIBERS': 51784081, 'VIEWES': 36464793233}, page_content='SET India'),
 Document(metadata={'VIDEOS': 1100, 'SUBSCRIBERS': 50560964, 'VIEWES': 25446405744}, page_content='Canal KondZilla'),
 Document(metadata={'VIDEOS': 42404, 'SUBSCRIBERS': 46098586, 'VIEWES': 34085586984}, page_content='WWE'),
 Document(metadata={'VIDEOS': 134, 'SUBSCRIBERS': 45873439, 'VIEWES': 625649566}, page_content='Justin Bieber'),
 Document(metadata={'VIDEOS': 209, 'SUBSCRIBERS': 43796634, 'VIEWES': 83543218

In [10]:
llm_data[0].model_dump()

{'id': None,
 'metadata': {'VIDEOS': 13629,
  'SUBSCRIBERS': 105783888,
  'VIEWES': 76945588449},
 'page_content': 'T-Series',
 'type': 'Document'}

### 2. URLs

In [13]:
from langchain.document_loaders import UnstructuredURLLoader
urls = [
    "https://www.apple.com/in/support/products/faqs.html",
    "https://www.apple.com/legal/sales-support/",
]

llm_loader = UnstructuredURLLoader(urls=urls)
llm_data = llm_loader.load()

In [15]:
print(llm_data[0])

page_content='Frequently Asked Questions

Where can I purchase the AppleCare Protection Plan?

The AppleCare Protection Plan is available at the Apple Online Store and many Apple-authorised resellers and wireless service providers.

How do I initiate repair service under the AppleCare Protection Plan?

Carry-in service. Carry your product into an Apple Authorised Service Provider.

Onsite service. Contact us and we’ll help you arrange an Apple-authorised repair for your desktop Mac at your location, at no additional charge.

Do-It-Yourself service. Contact us and we may be able to send you what you need to service your own product, such as accessories.

The AppleCare Protection Plan for Mac, for iPod and for Apple TV provide global repair coverage. Service will be limited to the options available in the country where service is requested. Service options, parts availability and response times vary by country.

All repairs will be completed using genuine Apple parts for repair service. 

### 3. WikiPedia

In [16]:
from langchain.document_loaders import WikipediaLoader

query = "Elon Musk"
llm_data = WikipediaLoader(query=query,load_max_docs=3,doc_content_chars_max=20_000).load()

In [18]:
print(llm_data[0])

page_content='Elon Reeve Musk  ( EE-lon; born June 28, 1971) is a businessman, known for his leadership of Tesla, SpaceX, X (formerly Twitter), and the Department of Government Efficiency (DOGE). Musk has been the wealthiest person in the world since 2021; as of May 2025, Forbes estimates his net worth to be US$424.7 billion.
Born to a wealthy family in Pretoria, South Africa, Musk emigrated in 1989 to Canada. He received bachelor's degrees from the University of Pennsylvania in 1997 before moving to California, United States, to pursue business ventures. In 1995, Musk co-founded the software company Zip2. Following its sale in 1999, he co-founded X.com, an online payment company that later merged to form PayPal, which was acquired by eBay in 2002. That year, Musk also became an American citizen.
In 2002, Musk founded the space technology company SpaceX, becoming its CEO and chief engineer; the company has since led innovations in reusable rockets and commercial spaceflight. Musk joine

### 4.PDF

In [22]:
from langchain.document_loaders import PyPDFLoader
pages = PyPDFLoader("CV.pdf").load_and_split()

In [23]:
print(pages[0])

page_content='MOAAZ ANWAR SOLIMAN
AI ENGINEER
moaazanwarsoliman@gmail.com
 
+201116823857 +201014397578
 
Egypt, Menofia
 
github.com/Animo-GD
 
linkedin.com/in/moaaz solomon/
 
OBJECTIVE
I am motivated to engage in a new experience in the field of Al and Machine Learning, expand my horizons of 
knowledge, and gain experience from experts on the ground. I am looking for a training opportunity as a fresh 
graduated to gain experience from experts in this field and work on real-world projects.
EDUCATION
Faculty of Electronic Engineering – Menoufia University
Major: Computer Science and Engineering
2019 – 2025
National Telecommunication Institute (NTI)
Completed 120 hours of intensive training in artificial intelligence, covering 
machine learning, deep learning, computer vision, and natural language processing.
09/2023 – 11/2023
Information Technology Institute (ITI)
Gained hands-on experience in Internet of Things concepts, including sensor 
integration, data collection, communication p

In [None]:
print(pages[0])

[Document(metadata={'source': 'https://www.apple.com/in/support/products/faqs.html'}, page_content='Frequently Asked Questions\n\nWhere can I purchase the AppleCare Protection Plan?\n\nThe AppleCare Protection Plan is available at the Apple Online Store and many Apple-authorised resellers and wireless service providers.\n\nHow do I initiate repair service under the AppleCare Protection Plan?\n\nCarry-in service. Carry your product into an Apple Authorised Service Provider.\n\nOnsite service. Contact us and we’ll help you arrange an Apple-authorised repair for your desktop Mac at your location, at no additional charge.\n\nDo-It-Yourself service. Contact us and we may be able to send you what you need to service your own product, such as accessories.\n\nThe AppleCare Protection Plan for Mac, for iPod and for Apple TV provide global repair coverage. Service will be limited to the options available in the country where service is requested. Service options, parts availability and response 

### 5. PDF Contains Images

In [30]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain.schema import Document
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
class OCRPDFLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        doc = fitz.open(self.file_path)
        documents = []
        
        for page_num in range(len(doc)):
            # Render page as image
            pix = doc[page_num].get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            
            # OCR
            text = pytesseract.image_to_string(img)
            
            # Wrap into LangChain Document
            documents.append(
                Document(
                    page_content=text,
                    metadata={"source": self.file_path, "page": page_num+1}
                )
            )
        return documents


# ---- USAGE ----
loader = OCRPDFLoader("ImagePDF.pdf")
docs = loader.load()

# Now you can send `docs` to any LangChain text splitter or vector store
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

print(chunks[0].page_content)  # preview


@ Machine Learning Pipeline Cheat Sheet

1. Data Collection
© Ed what: Gather raw data from files, APIs, sensors, etc.

© R Tools: pandas, requests, SQL, scrapy, Seautifulsoup

F = pd. read_csv(‘data.csv')

2. Data Preprocessing
a.Cleaning

‘© Handle missing values ( aw )
+ Fixdata types

# Remove duplicates

# Normalize formats

4€( peice’ |-astype(+ioat)

Text Cleanup (NLP-specific)
© Remove punctuation, stopwords, URLS
# Lowercase conversion

* Tokenization

from sklearn feature extraction text inport ENGLISM_STOP_WORDS
from nitk.tokenize inport word tokentize

3. Exploratory Data Analysis (EDA)


### 6. Directory

In [47]:
from langchain.document_loaders import DirectoryLoader
llm_loader = DirectoryLoader("md-data",glob="*.txt",show_progress=True)

llm_data = llm_loader.load()

  0%|          | 0/3 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 3/3 [00:00<00:00, 200.45it/s]


In [49]:
print(llm_data[0])

page_content='If you want, I can set this up so LangChain downloads → checks PDF type → uses OCR if needed, so it works even for screenshot PDFs.

Do you want me to make that robust version?' metadata={'source': 'md-data\\1.txt'}


In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
llm_loader = PyPDFDirectoryLoader("pdf-files")
llm_data = llm_loader.load()

-----
## Text Splitter

In [50]:
from langchain.document_loaders import WikipediaLoader
query_1 = "Yahya Sinwar"
query_2 = "mohamed el deif"

docs_1 = WikipediaLoader(query=query_1,load_max_docs=1,doc_content_chars_max=20_000).load()
docs_2 = WikipediaLoader(query=query_2,load_max_docs=1,doc_content_chars_max=20_000).load()

In [51]:
text_document1 = docs_1[0].page_content
text_document2 = docs_2[0].page_content

In [53]:
print(text_document1)

Yahya Ibrahim Hassan Sinwar (Arabic: يحيى إبراهيم حسن السنوار, romanized: Yaḥyá Ibrāhīm Ḥasan al-Sinwār; 29 October 1962 – 16 October 2024) was a Palestinian militant and politician who served as fourth chairman of the Hamas Political Bureau from August 2024, and as the second leader of Hamas in the Gaza Strip from February 2017, succeeding Ismail Haniyeh in both roles. He was killed in a clash with the Israel Defense Forces (IDF) in October 2024.
Sinwar was born in the Khan Yunis refugee camp in Egyptian-occupied Gaza in 1962 to a family who had been expelled or fled from Majdal 'Asqalan during the 1948 Palestine War. He finished his studies at the Islamic University of Gaza, where he received a bachelor's degree in Arabic studies. In 1989, Sinwar was sentenced to four life sentences in Israel for orchestrating the abduction and killing of two Israeli soldiers and four Palestinians he considered to be collaborators. He spent 22 years in prison until his release among 1,026 others in a

In [54]:
print(text_document2)

On 19 August 2014, the Israel Defense Forces carried out an airstrike at the home of Mohammed Deif, leader of the Al-Qassam Brigades. Deif was unharmed, but his wife, Widad Asfura, and two of their children were killed.


== Background ==
Deif, born Mohammed Diab Ibrahim al-Masri, joined Hamas in 1987, weeks after its establishment during the First Intifada. He was arrested by Israeli authorities in 1989 for his involvement with the organization. After 16 months of detention, he was released in a prisoner exchange. Soon after his release, he helped establish the Ezzedeen Al-Qassam Brigades, the armed wing of Hamas. Deif became the head of the Qassam Brigades after Israel assassinated Salah Shehade in July 2002. Between July 2006 and November 2012, effective command was exercised by Deif's deputy, Ahmed Jabari, after Deif was seriously wounded in an Israeli assassination attempt.
Mohammed Deif married Widad Asfura (Arabic: وداد عصفورة, romanized: Widad Asfoura), sometimes referred to as

In [55]:
documents = [text_document1,text_document2]
metadata = [{"document":query_1},{"document":query_2}]

### 1. Characters

In [57]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=100,
    chunk_overlap=0,
    length_function = len
)

In [58]:
normal_chunks = text_splitter.create_documents(documents,metadatas=metadata)

In [59]:
len(normal_chunks)

241

In [61]:
print(normal_chunks[0])

page_content='Yahya Ibrahim Hassan Sinwar (Arabic: يحيى إبراهيم حسن السنوار, romanized: Yaḥyá Ibrāhīm Ḥasan al-Sin' metadata={'document': 'Yahya Sinwar'}
