In [27]:
import os
import re
import csv
import math
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, Dataset, DatasetDict

In [3]:
# Pull the key out of Colab's secret storage, trimming stray whitespace,
# and publish it through the process environment.
openai_api_key = userdata.get('OPENAI_API_KEY').strip()
os.environ.update({'OPENAI_API_KEY': openai_api_key})

In [4]:
# Report whether a non-empty key was obtained.
print("OpenAI API key is set" if openai_api_key else "OpenAI API key is not set")

OpenAI API key is set


In [6]:
# Download the sample articles CSV; it is read back below as ./mini-llama-articles.csv.
!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv

--2025-10-01 09:08:57--  https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 173646 (170K) [text/plain]
Saving to: ‘mini-llama-articles.csv’


2025-10-01 09:08:57 (3.12 MB/s) - ‘mini-llama-articles.csv’ saved [173646/173646]



In [7]:
def split_into_chunks(text,chunk_size=1024):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    Slices are returned in their original order; only the final slice can be
    shorter than ``chunk_size``. An empty ``text`` yields an empty list.
    """
    starts = range(0, len(text), chunk_size)
    return [text[begin:begin + chunk_size] for begin in starts]

In [9]:
# Read the articles CSV and break the text of every data row into chunks.
chunks = []
# Count data rows explicitly: relying on the loop index after the loop only
# works by accident of the header offset and fails on an empty file.
article_count = 0
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8") as file:
  csv_reader = csv.reader(file)
  next(csv_reader, None)  # skip the header row; harmless if the file is empty
  for row in csv_reader:
    # row[1] is the field that gets chunked (presumably the article body --
    # confirm against the CSV header).
    chunks.extend(split_into_chunks(row[1]))
    article_count += 1

print("number of articles:", article_count)
print("number of chunks:", len(chunks))

number of articles: 14
number of chunks: 174


In [12]:
# One row per text chunk, held in a single 'chunk' column.
df = pd.DataFrame({'chunk': chunks}, columns=['chunk'])
print(df.columns)

Index(['chunk'], dtype='object')


In [30]:
# Shared OpenAI client used by the embedding and chat-completion cells.
# NOTE(review): built with no arguments -- presumably it picks up the
# OPENAI_API_KEY environment variable set earlier; confirm.
client = OpenAI()

In [14]:
def get_embedding(text, model="text-embedding-3-small"):
  """Return the embedding vector for ``text``, or ``None`` when the request fails.

  Args:
    text: String to embed. Newlines are replaced with spaces before sending.
    model: Name of the embedding model passed to the embeddings endpoint.

  Returns:
    The ``embedding`` attribute of the first item in the response, or ``None``
    if anything went wrong (the failure is printed rather than raised so a
    long embedding loop keeps going).
  """
  try:
    text = text.replace("\n", " ")
    res = client.embeddings.create(input=[text], model=model)
    return res.data[0].embedding
  # Catch Exception, not everything: a bare except would also swallow
  # KeyboardInterrupt/SystemExit and make the loop impossible to interrupt.
  except Exception as e:
    print(f"Embedding request failed: {e}")
    return None

In [16]:
# Embed each chunk in row order; tqdm reports progress while the requests run.
embeddings = [get_embedding(record['chunk']) for _, record in tqdm(df.iterrows())]




174it [00:53,  3.24it/s]


In [17]:
# Attach the embeddings to the dataframe as an 'embedding' column.
# Plain assignment is used instead of DataFrame.insert so the cell can be
# re-run: insert() raises ValueError once the column already exists.
embeddings_values = pd.Series(embeddings)
df['embedding'] = embeddings_values

In [18]:
# Embed the question plus one unrelated and one relevant sentence, in that
# order, for a quick similarity sanity check.
QUESTION ="How many parameters LLaMA2 model has?"
QUESTION_emb, BAD_SOURCE_emb, GOOD_SOURCE_emb = [
    get_embedding(sentence)
    for sentence in (
        QUESTION,
        "The sky is blue.",
        "LLaMA2 model has a total of 2B parameters.",
    )
]

In [19]:
# Number of components in the question's embedding vector.
print(len(QUESTION_emb))

1536


In [23]:
# Cosine similarity of the question against the unrelated and the relevant sentence.
bad_score = cosine_similarity([QUESTION_emb], [BAD_SOURCE_emb])
good_score = cosine_similarity([QUESTION_emb], [GOOD_SOURCE_emb])
print("> Bad Response Score: ", bad_score)
print("> Good Response Score:", good_score)

> Bad Response Score:  [[0.02578727]]
> Good Response Score: [[0.83154609]]


In [24]:
# Score the question against every stored chunk embedding (one row, one column per chunk).
QUESTION ="How many parameters LLaMA2 model has?"
QUESTION_emb = get_embedding(QUESTION)
chunk_vectors = df['embedding'].tolist()
cosine_similarities = cosine_similarity([QUESTION_emb], chunk_vectors)
print(cosine_similarities)

[[0.46767499 0.46912464 0.25982343 0.29393922 0.319654   0.40157167
  0.41500898 0.4525136  0.45935869 0.1259955  0.11750504 0.01348838
  0.22602134 0.21423916 0.10145219 0.33064027 0.1074138  0.34682608
  0.16311555 0.08726645 0.3482437  0.22839007 0.19203919 0.26471736
  0.24928956 0.34824073 0.24828999 0.32761311 0.41416043 0.41337977
  0.46363194 0.38341214 0.46851769 0.35636739 0.35398223 0.3027117
  0.29929401 0.29252605 0.40035147 0.4646832  0.39473083 0.41042047
  0.4470362  0.43173664 0.35909244 0.33965997 0.51344046 0.20932135
  0.40206751 0.32829097 0.42863159 0.48270619 0.45036044 0.34256287
  0.32083244 0.42588004 0.24622426 0.18089188 0.23648678 0.34271678
  0.3437286  0.20476358 0.19768159 0.22446578 0.21110849 0.42281591
  0.26382997 0.30438172 0.33609101 0.38368357 0.23536253 0.24351588
  0.37074498 0.28025883 0.49052816 0.53044055 0.37853176 0.43770825
  0.37750013 0.39255233 0.30081934 0.41710617 0.4674553  0.45420047
  0.35169137 0.21222866 0.4262131  0.31603804 0.4

In [25]:
# Positions of the highest-scoring chunks, best match first.
number_of_chunks_to_retrieve = 3
ascending_order = np.argsort(cosine_similarities[0])
descending_order = ascending_order[::-1]
indices = descending_order[:number_of_chunks_to_retrieve]
print(indices)

[114  75  89]


In [26]:
# Show the retrieved chunks in ranked order, numbered from 1.
for rank, chunk_text in enumerate(df.chunk[indices], start=1):
  print(f"> Chunk {rank}", chunk_text, "----", sep="\n")

> Chunk 1
by Meta that ventures into both the AI and academic spaces. The model aims to help researchers, scientists, and engineers advance their work in exploring AI applications. It will be released under a non-commercial license to prevent misuse, and access will be granted to academic researchers, individuals, and organizations affiliated with the government, civil society, academia, and industry research facilities on a selective case-by-case basis. The sharing of codes and weights allows other researchers to test new approaches in LLMs. The LLaMA models have a range of 7 billion to 65 billion parameters. LLaMA-65B can be compared to DeepMind's Chinchilla and Google's PaLM. Publicly available unlabeled data was used to train these models, and training smaller foundational models require less computing power and resources. LLaMA 65B and 33B have been trained on 1.4 trillion tokens in 20 different languages, and according to the Facebook Artificial Intelligence Research (FAIR) team,

In [36]:
# Ask gpt-4o-mini to answer QUESTION using only the retrieved chunks as context.
# The whole step stays inside one try block so a failure is printed instead of
# stopping the notebook.
try:
  system_prompt = (
      "You are an assistant and expert in answering questions from a chunks of content. "
      "Only answer AI-related question, else say that you cannot answer this question.")
  prompt = (
      "Read the following informations that might contain the context you require to answer the question. You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag. Here is the content:\n\n<START_OF_CONTEXT>\n{}\n<END_OF_CONTEXT>\n\n"
      "Please provide an informative and accurate answer to the following question based on the avaiable context. Be concise and take your time. \nQuestion: {}\nAnswer:"
  )
  # The retrieved chunks are fixed-size character slices from different places
  # in the corpus. Join them with a blank line; joining with "" would fuse the
  # tail of one slice onto the head of the next, mid-word.
  context = "\n\n".join(df.chunk[indices])
  # Fill the two placeholders: retrieved context first, then the question.
  prompt = prompt.format(context, QUESTION)
  response = client.chat.completions.create(
      model="gpt-4o-mini",
      messages = [
          {"role":"system","content":system_prompt},
          {"role":"user","content":prompt}
      ],
      max_tokens = 500,
      temperature=0.7
  )
  res = response.choices[0].message.content
  print(res)
except Exception as e:
  print(f"An error has occured: {e}")

The LLaMA 2 model has four different sizes with the following parameters: 7 billion, 13 billion, 34 billion, and 70 billion.
