In [1]:
import os
import glob
from dotenv import load_dotenv

In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
# imports for langchain, plotly and Chroma

from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.vectorstores import Chroma

In [4]:
MODEL = Ollama(model="llama3.2")
db_name = "vector-database"

  MODEL = Ollama(model="llama3.2")


In [7]:
folders = glob.glob("articles/*")

In [28]:
text_loader_kwargs = {'encoding': 'utf-8'}

In [6]:
import jsonlines
with jsonlines.open("articles.json","r") as documents:
    print(next(iter(documents)))

{'url': 'https://arxiv.org/pdf/2405.10825', 'title': 'Large Language Model (LLM) for Telecommunications: A Comprehensive Survey on Principles, Key Techniques, and Opportunities', 'time': '2023-10-17T14:00:32.000Z', 'tags': ['Large Language Model', 'Telecom', 'Article', '5G'], 'authors': ['Hao Zhou', 'Chengming Hu', 'Ye Yuan', 'Yufei Cui', 'Yili Jin']}


In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain_core.documents import Document
folders = glob.glob("articles/*")

In [9]:
langchain_documents = []

with jsonlines.open("articles.json", "r") as metadata_file:
    for file_path, metadata in zip(folders, metadata_file):
        try:
            pdf_loader = PyPDFLoader(file_path)
            pdf_documents = pdf_loader.load()

            for pdf_doc in pdf_documents:
            # Metadata'yı dökümana ekle
                pdf_doc.metadata.update({
                    "url": metadata.get("url"),
                    "tags": metadata.get("tags"),
                    "title": metadata.get("title"),
                    "authors": metadata.get("authors")
                })

            # Dökümanı listeye ekle
                langchain_documents.append( Document(
                        page_content=pdf_doc.page_content,
                        metadata=pdf_doc.metadata
                    )
                )
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")

In [10]:
len(langchain_documents), langchain_documents[:1]

(107,
 [Document(metadata={'source': 'articles/2308.06013v2.pdf', 'page': 0, 'url': 'https://arxiv.org/pdf/2405.10825', 'tags': ['Large Language Model', 'Telecom', 'Article', '5G'], 'title': 'Large Language Model (LLM) for Telecommunications: A Comprehensive Survey on Principles, Key Techniques, and Opportunities', 'authors': ['Hao Zhou', 'Chengming Hu', 'Ye Yuan', 'Yufei Cui', 'Yili Jin']}, page_content='Large Language Models for Telecom:\nForthcoming Impact on the Industry\nAli Maatouk∗, Nicola Piovesan ∗, Fadhel Ayed∗, Antonio De Domenico ∗, and Merouane Debbah †\n∗Paris Research Center, Huawei Technologies, Boulogne-Billancourt, France\n†Khalifa University of Science and Technology, Abu Dhabi, UAE\nAbstract—Large Language Models (LLMs), AI-driven models\nthat can achieve general-purpose language understanding and\ngeneration, have emerged as a transformative force, revolution-\nizing fields well beyond Natural Language Processing (NLP)\nand garnering unprecedented attention. As LLM

In [11]:
from langchain.vectorstores import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

In [12]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="llama3.2",
)

In [13]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(langchain_documents)

In [14]:

vectorstore = FAISS.from_documents(chunks, embedding=embeddings)

In [46]:
store = Qdrant.from_documents(
    langchain_documents,
    embeddings,
    path="/tmp/ai_qdrant",
    collection_name="AI-Embeddings",
)  

In [47]:
store.similarity_search_with_score(
    query="AI and authors",
    k=1
)

[(Document(metadata={'source': 'articles/2405.10825v2.pdf', 'page': 6, 'url': 'https://arxiv.org/pdf/2305.13102', 'tags': ['Large Language Model', 'Telecom', 'Article', 'Industry'], 'title': 'OBSERVATIONS ON LLMS FOR TELECOM DOMAIN: CAPABILITIES AND LIMITATIONS', 'authors': ['Sumit Soman', 'Ranjani H G'], '_id': '1bee23b66b75410da6b5618dd734c7ff', '_collection_name': 'AI-Embeddings'}, page_content='understanding. Meanwhile, literary works also serve as a rich\nreservoir of formal and lengthy texts [63]. These materi-\nals are crucial for teaching LLMs complex linguistic con-\nstructs, facilitating the modelling of long-range dependencies.\nSpecialized data involves scientific texts and programming-\nrelated data. For example, scientific literature comprises a\nwealth of formal writing imbued with domain-specific knowl-\nedge, encompassing academic papers and textbooks. On the\nother hand, programming data drawn from online question-\nanswering platforms like Stack Exchange [64], along 

In [48]:
store.similarity_search_with_score(
    query="AI and authors",
    filter={"authors": "Rex Ying"},
    k=1
)  

[(Document(metadata={'source': 'articles/2306.10249v2.pdf', 'page': 6, 'url': 'https://arxiv.org/pdf/2409.05314', 'tags': ['Large Language Model', 'Telecom', 'Article', 'Industry'], 'title': 'Tele-LLMs: A Series of Specialized Large Language Models for Telecommunications', 'authors': ['Ali Maatouk', 'Rex Ying', 'Kenny Chirino Ampudia', 'Leandros Tassiulas'], '_id': '8fc5d95dd1ac405fb61aca34157f5189', '_collection_name': 'AI-Embeddings'}, page_content='7\nFig. 3: AGI-empowered wireless networks.\nVI. C ONCLUSION\nIn this article, we explored how Large-GenAI-Models can\nbe an essential tool in designing, configuring, and operating\nfuture wireless networks. In particular, we identified the key\nopportunities, with respect to sensing and communication,\nthat can be acquired when employing Large-GenAI-Models\nin wireless networks, and we overviewed the role of wire-\nless networks in enabling machines to communicate using\nLarge-GenAI-Models. Moreover, we laid down the foundation\nfor the 

In [1]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm = Ollama(
    model="llama3.2",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
) 

  llm = Ollama(
  llm = Ollama(


In [50]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate

In [51]:
def retrieval_chain_with_filter(llm, filter={}):
    template = """You are a bot that answers user questions using only the context provided.
    If you don't know the answer, simply state that you don't know.
    {context}
    Question: {input}"""

    prompt = PromptTemplate(template=template, input_variables=["context", "input"])
    retriever = store.as_retriever(search_kwargs={'filter': filter})
    llm_with_prompt = create_stuff_documents_chain(llm, prompt)
    return create_retrieval_chain(retriever, llm_with_prompt)  

In [53]:
result = retrieval_chain_with_filter(llm).invoke({
    "input": ""
})

ValueError: Ollama call failed with status code 500. Details: {"error":"model requires more system memory (3.4 GiB) than is available (2.9 GiB)"}

In [54]:
!pip uninstall fastembed

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: fastembed 0.4.2
Uninstalling fastembed-0.4.2:
  Would remove:
    /home/yasar/anaconda3/envs/llms/lib/python3.11/site-packages/fastembed-0.4.2.dist-info/*
    /home/yasar/anaconda3/envs/llms/lib/python3.11/site-packages/fastembed/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [55]:
y

NameError: name 'y' is not defined