In [5]:
#importing global dependencies
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from uuid import uuid4
from tqdm.autonotebook import tqdm
import tiktoken

In [6]:
#importing langchain dependencies
import langchain
import langchain
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

In [None]:
# FREE LLM ALTERNATIVES
# Option 1: Groq (Free tier with rate limits)
from langchain_groq import ChatGroq

"""
# Option 2: HuggingFace Pipeline (Local inference)
from langchain_huggingface import HuggingFacePipeline
# Option 3: Ollama (Local LLM server)
from langchain_community.llms import Ollama
"""

'\n# Option 2: HuggingFace Pipeline (Local inference)\nfrom langchain_huggingface import HuggingFacePipeline\n# Option 3: Ollama (Local LLM server)\nfrom langchain_community.llms import Ollama\n'

In [13]:
# FREE EMBEDDING ALTERNATIVES
# Option 1: HuggingFace Sentence Transformers (Free)

from langchain_huggingface import HuggingFaceEmbeddings

# Option 2: Ollama Embeddings (Free local)
# from langchain_community.embeddings import OllamaEmbeddings

In [16]:
# FREE VECTOR STORE ALTERNATIVES  
# Option 1: ChromaDB (Completely free)
#from langchain_community.vectorstores import Chroma
# Option 2: FAISS (Free, in-memory)
#from langchain_community.vectorstores import FAISS

# Option 3: Keep Pinecone (has free tier, but limited)
from langchain_pinecone import PineconeVectorStore

In [17]:
# FREE SEARCH ALTERNATIVES
# Option 1: DuckDuckGo Search (Free, no API key needed) - generally worse
# from langchain_community.tools import DuckDuckGoSearchResults 
# Option 2: Keep Tavily if you have free tier

from langchain_community.tools.tavily_search import TavilySearchResults

In [18]:
# AGENTS
from langchain.agents import AgentExecutor, Tool, AgentType
from langchain.agents.react.agent import create_react_agent
from langchain import hub

In [19]:
# Load environment variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API keys for the Groq / Pinecone /
# Tavily clients imported above -- confirm against the .env file.
# As the last expression of the cell, the call's boolean return value is displayed.
load_dotenv()

True

In [20]:
# Load the TEDx transcripts dataset; CSVLoader produces one Document per CSV row.
# NOTE(review): source_column points at the transcript text itself, so each
# document's `source` metadata presumably holds the full transcript -- confirm
# this is intended (a short identifier such as `url` is more typical).
loader = CSVLoader(
    file_path="./Datasets/tedx-transcripts.csv",
    source_column="transcript",
    metadata_columns=[
        "main_speaker",
        "name",
        "speaker_occupation",
        "title",
        "url",
        "description",
    ],
    encoding="utf-8",
)

data = loader.load()

# Last expression of the cell: displays how many documents were loaded.
len(data)

2467

In [22]:
# Tokenization
# Encode a string with a tiktoken encoding and report its word, character
# and token counts.

def encode_text(text, encoding_name):
    """Encode ``text`` with the named tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``
            (for example ``"cl100k_base"``).

    Returns:
        A ``(encoding, tokens)`` tuple: the encoding object and the result of
        ``encoding.encode(text)``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    return encoding, encoding.encode(text)


question = "How many TEDx talk transcripts are on the transcripts-dataset?"

# The helper has its own name (`encode_text`) so that binding the result to
# `num_tokens` no longer overwrites the function with its return value.
# `num_tokens` is kept as the name of the token list because the decoding
# cell further down reads it.
encoding, num_tokens = encode_text(question, "cl100k_base")

print(f'Number of Words: {len(question.split())}')
print(f'Number of Characters: {len(question)}')
print(f'List of Tokens: {num_tokens}')
print(f'Nr of Tokens: {len(num_tokens)}')


Number of Words: 9
Number of Characters: 62
List of Tokens: [4438, 1690, 84296, 87, 3137, 61412, 527, 389, 279, 61412, 1773, 8534, 30]
Nr of Tokens: 13


In [23]:
#decoding tokenizer
def decode_tokens(tokens, encoding):
    return encoding.decode(tokens)

# Round trip: decode the token list produced by the tokenization cell and print it.
decoded_question = decode_tokens(tokens=num_tokens, encoding=encoding)
print(f'Decoded Question: {decoded_question}')

Decoded Question: How many TEDx talk transcripts are on the transcripts-dataset?
