In [10]:
from dotenv import load_dotenv
import os

# Read the variables defined in the project-specific .env file into the process environment.
load_dotenv(dotenv_path="keys.env")


def _require_env_key(name):
    """Return the value of environment variable `name` and re-export it via os.environ.

    Raises:
        ValueError: if the variable is not set.
    """
    value = os.getenv(name)
    if value is None:
        raise ValueError(f"{name} not found in environment variables")
    os.environ[name] = value
    return value


# Both keys are mandatory: stop here with a clear message if either one is missing.
openai_key = _require_env_key("OPENAI_API_KEY")
google_key = _require_env_key("GOOGLE_API_KEY")

In [None]:
# Selects where the chunk embeddings come from:
#   False: Generate the embedding for the dataset. (Associated cost with using OpenAI endpoint)
#   True: Load the dataset that already has the embedding vectors.
load_embedding = False

LOAD DATASETS

In [12]:
# !wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv
# !wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles-with_embeddings.csv

In [13]:
def splitInputText(text, max_chunk_size=1024):
    """
    Split the input text into consecutive chunks of at most `max_chunk_size` characters.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings in their original order. Every chunk except possibly
        the last one is exactly `max_chunk_size` characters long; joining the
        chunks reproduces `text`. An empty `text` yields an empty list.

    Raises:
        ValueError: if `max_chunk_size` is not positive (a negative size would
            otherwise silently produce no chunks at all).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    # Slicing past the end of the string is safe, so the last chunk may be shorter.
    return [
        text[start:start + max_chunk_size]
        for start in range(0, len(text), max_chunk_size)
    ]
    

In [14]:
import csv

chunks = []

# Load the articles file as a CSV.
# newline="" hands line-ending handling to the csv module, which is what it needs
# to parse quoted fields that contain embedded newlines as a single field.
with open("mini-llama-articles.csv", "r", encoding="utf-8", newline="") as f:
    csv_reader = csv.reader(f)

    for idx, row in enumerate(csv_reader):
        print(f"Row {idx}: {row}")
        if idx == 0:
            # Row 0 is the header row; it carries no article text.
            continue
        # Column 1 is the article text; split it into fixed-size chunks.
        chunks.extend(splitInputText(row[1]))

print(f"Total number of chunks: {len(chunks)}")


Row 0: ['title', 'content', 'url', 'source']
Row 1: ["Beyond GPT-4: What's New?", 'LLM Variants and Meta\'s Open Source Before shedding light on four major trends, I\'d share the latest Meta\'s Llama 2 and Code Llama. Meta\'s Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2\'s superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized rigorous fine-tuning methodologies. Meta\'s transparent delineation of these processes aims to catalyze community-driven advancements in LLMs, underscoring a commitment to collaborative and responsi

In [15]:
import pandas as pd

# Build a one-column DataFrame with one text chunk per row.
df = pd.DataFrame(chunks, columns=["chunk"])

# Preview the first rows (only the last expression of a cell is displayed).
df.head()

Unnamed: 0,chunk
0,LLM Variants and Meta's Open Source Before she...
1,ational code model;Codel Llama - Python specia...
2,"erm ""multimodal"" refers to their ability to pr..."
3,"es it matter? LLM connections, like the LlamaI..."
4,understand data in the AI-driven future. Fro...


EMBEDDINGS


In [16]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load the variables defined in keys.env into the process environment
load_dotenv(dotenv_path="keys.env")


# Initialize the OpenAI client with the key read from OPENAI_API_KEY.
# os.getenv returns None when the variable is unset; no check is made here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Helper that turns a piece of text into its embedding vector
def get_embedding(text):
    """Request the embedding of `text` using the module-level `client`.

    The "text-embedding-3-small" model is used and the embedding of the first
    item in the response is returned.

    This is best-effort: any exception raised along the way is printed and
    None is returned instead, so callers have to cope with a None result.
    """
    try:
        result = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",  # Use the appropriate model
        )
        first_item = result.data[0]
        return first_item.embedding
    except Exception as err:
        print(f"Error getting embedding: {err}")
        return None


In [19]:
from tqdm.notebook import tqdm
import numpy as np

import ast

# Both branches leave df with:
#   "embeddings_created" - the raw embedding (a list of floats, or its string form when read from CSV)
#   "embedding"          - the same vector as a NumPy array, which the similarity cells read
if not load_embedding:
    # Generate the embedding for the dataset: one get_embedding() call per chunk.
    print("Generating embeddings for the dataset...")
    embeddings = [
        get_embedding(chunk)
        for chunk in tqdm(df["chunk"], total=len(df))
    ]

    # Plain column assignment keeps this cell re-runnable; DataFrame.insert raises
    # ValueError when the column already exists.
    df["embeddings_created"] = pd.Series(embeddings, index=df.index)
    # get_embedding returns None on failure; keep those rows as None instead of
    # wrapping None in an array.
    df["embedding"] = df["embeddings_created"].apply(
        lambda emb: None if emb is None else np.array(emb)
    )

# Or, load the embedding from the file.
else:
    # Load the file as a CSV
    df = pd.read_csv("mini-llama-articles-with_embeddings_created.csv")
    # Convert the stringified vectors back to arrays. ast.literal_eval only accepts
    # Python literals, whereas eval() would execute any code found in the file.
    df["embedding"] = df["embeddings_created"].apply(
        lambda raw: np.array(ast.literal_eval(raw))
    )
    print("Loaded the embedding file.")

Loaded the embedding file.


In [9]:
# Persist the chunks with their raw embeddings so a later run can load them
# instead of regenerating them.
# - index=False: otherwise the row index is written as an extra column, which
#   comes back as "Unnamed: 0" when the file is read again.
# - The derived "embedding" column (NumPy arrays) is left out: it would be stored
#   as the arrays' string form, and it is rebuilt from "embeddings_created" on load.
df.drop(columns=["embedding"], errors="ignore").to_csv(
    'mini-llama-articles-with_embeddings_created.csv', index=False
)

User Question

In [21]:
# The user question and its embedding vector.
QUESTION = "HOW MANY PARAMETERS DOES LLAMA 2 HAVE?"
QUESTION_emb = get_embedding(QUESTION)
# Sanity check: character count of the question and length of its embedding vector.
# NOTE: get_embedding returns None on failure, in which case the second len() raises TypeError.
print(f"Question: {len(QUESTION)}")
print(f"Question embedding: {len(QUESTION_emb)}")

Question: 38
Question embedding: 1536


TEST COSINE SIMILARITY

Calculating the cosine similarity between embedding representations helps us find pieces of text that are close to each other. In the following sample we will see how cosine similarity can identify which sentence is a plausible answer to the given user question. As expected, the unrelated sentence scores lower.

In [22]:
# Examples for cosine similarity: one sentence unrelated to the question
# and one sentence on the same topic as the question.
Bad_source_emb = get_embedding("the sky is blue")
Good_source_emb = get_embedding("Llama 2 model has 2B parameters")

In [25]:
# Show the raw embedding vector, wrapped in a list as it is passed to cosine_similarity.
print([Bad_source_emb])

[[0.015619354322552681, -0.026077529415488243, -0.021805359050631523, -0.002915509743615985, 0.01666887477040291, -0.025200869888067245, -0.013248668983578682, 0.018842002376914024, -0.04588262364268303, -0.05408124253153801, 0.030818898230791092, -0.0050840070471167564, -0.002538916654884815, 0.0296829454600811, -0.005226001143455505, 0.021768316626548767, -0.02118799276649952, -0.01857036165893078, -0.017891259863972664, -0.004358602222055197, -0.0012887510238215327, 0.002245668089017272, 0.03210302069783211, 0.04084491729736328, -0.01928650587797165, 0.018545666709542274, 0.0350169874727726, -0.005386516451835632, -0.027707375586032867, -0.01011862512677908, 0.01708868518471718, -0.047216132283210754, -0.011575608514249325, -0.05783482640981674, 0.02457115799188614, -0.03716541826725006, 0.03763461858034134, -0.010970589704811573, -0.023348772898316383, -0.04343785345554352, -0.013890729285776615, -0.047043271362781525, 0.02802840620279312, -0.03353530913591385, 0.01649601384997368,

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Sample showing how a relevant piece of text achieves a high similarity score compared to completely unrelated text.
# cosine_similarity expects 2-D inputs, hence each single vector is wrapped in a list.
print("bad respnse score: ", cosine_similarity([QUESTION_emb], [Bad_source_emb]))
print("good response score: ", cosine_similarity([QUESTION_emb], [Good_source_emb]))


bad respnse score:  [[0.06699473]]
good response score:  [[0.68333342]]


Calculating Cosine Similarities

In [39]:
# Sanity check: .tolist() turns the "embedding" column into a plain Python list.
type(df["embedding"].tolist())


list

In [41]:
# The cosine similarity of the question and each chunk of the essays.
# One call scores the question against every chunk embedding; the result is
# 2-D with a single row (the question), as the nested brackets in the output show.
# Per-chunk alternative:
# cosine_similarties = [cosine_similarity([QUESTION_emb],[emb]) for emb in df["embedding"].tolist()]
cosine_similarties = cosine_similarity([QUESTION_emb], df["embedding"].tolist())
print(cosine_similarties)

[[ 0.54192945  0.52594417  0.24331534  0.3300685   0.31164356  0.38935919
   0.3793556   0.47170285  0.5145466   0.11844389  0.1126919  -0.00080107
   0.21021863  0.21266593  0.11786954  0.28907358  0.12356457  0.3735306
   0.13711413  0.13183514  0.3429402   0.24216189  0.23163203  0.23697913
   0.19785134  0.30201776  0.18961446  0.281385    0.34500204  0.37105055
   0.42685896  0.32740101  0.50980917  0.39222082  0.35711672  0.25891578
   0.23953615  0.27728078  0.32786947  0.3953107   0.36628344  0.38063905
   0.47461513  0.45208432  0.38730138  0.31422229  0.56546322  0.19789381
   0.36209526  0.28351501  0.40035846  0.48349244  0.49700034  0.3453213
   0.30777701  0.36534134  0.21508867  0.16248172  0.22558267  0.31713596
   0.32948148  0.19227367  0.17204432  0.16868371  0.15728668  0.36110047
   0.24260014  0.27020994  0.27260996  0.4058374   0.20498193  0.22110487
   0.365291    0.22686626  0.52756315  0.44405977  0.3121069   0.33526483
   0.37191631  0.32389031  0.25448387  0

In [45]:
import numpy as np

# How many of the best-scoring chunks to keep.
number_of_chunks_to_retrieve = 3

# Position of the single highest score (argmax works on the flattened array;
# with one row of scores that equals the column position) and its value.
highest_index = np.argmax(cosine_similarties)
print(cosine_similarties[0][highest_index])
# Pick the N highest scored chunks: argsort is ascending, so reverse it and keep the first N.
indices = np.argsort(cosine_similarties[0])[::-1][:number_of_chunks_to_retrieve]
indices

0.6142183702213526


array([89, 46,  0])

In [46]:
# Let's see the chunks of text that are most relevant to the question, best match first.
# NOTE(review): the text comes from the `chunks` list while the scores were computed
# from df["embedding"]; this assumes both are in the same row order — confirm when
# the embeddings were loaded from the CSV file.
for i in indices:
    print(f"Chunk {i}: {chunks[i]}")
    print(f"Score: {cosine_similarties[0][i]}")
    print("\n")

Chunk 89: I. Llama 2: Revolutionizing Commercial Use Unlike its predecessor Llama 1, which was limited to research use, Llama 2 represents a major advancement as an open-source commercial model. Businesses can now integrate Llama 2 into products to create AI-powered applications. Availability on Azure and AWS facilitates fine-tuning and adoption. However, restrictions apply to prevent exploitation. Companies with over 700 million active daily users cannot use Llama 2. Additionally, its output cannot be used to improve other language models.  II. Llama 2 Model Flavors Llama 2 is available in four different model sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. While 7B, 13B, and 70B have already been released, the 34B model is still awaited. The pretrained variant, trained on a whopping 2 trillion tokens, boasts a context window of 4096 tokens, twice the size of its predecessor Llama 1. Meta also released a Llama 2 fine-tuned model for chat applications that was trai