## Set Up

In [121]:
import os
from dotenv import load_dotenv,find_dotenv
_=load_dotenv(find_dotenv())


In [122]:
import cohere
# Cohere client used for the embedding calls in this notebook. The API key is
# read from the environment; a missing COHERE_API_KEY raises KeyError here.
co=cohere.Client(os.environ["COHERE_API_KEY"])

In [123]:
import pandas as pd
import numpy as np

In [124]:
import weaviate

# Authenticate with the admin API key taken from the environment.
auth_config = weaviate.auth.AuthApiKey(api_key=os.environ['WEAVIATE_ADMIN_API_KEY'])

# Connection settings, all read from environment variables (KeyError if unset).
weaviate_url = os.environ['WEAVIATE_REST_END_POINT_URL']
cohere_headers = {"X-Cohere-Api-Key": os.environ["COHERE_API_KEY"]}

# NOTE: `weaviate.Client` is the v3 client API and emits a deprecation warning.
# Either pin `weaviate-client>=3.26.7;<4.0.0` or migrate to the v4
# `weaviate.WeaviateClient` API.
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers=cohere_headers,
)

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client=weaviate.Client(
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-librar

### Loading the Wikipedia DataFrame

In [129]:
# NOTE(review): unpickling can execute arbitrary code — only load
# 'wikipedia.pkl' if it comes from a trusted source.
# Load the articles and discard the stored "emb" column (an "emb" column is
# recomputed with Cohere further down in this notebook).
wiki_articles = pd.read_pickle('wikipedia.pkl').drop(columns=["emb"])

In [130]:
# Quick sanity check of the loaded frame: its (rows, columns) shape, then its column labels.
headers = wiki_articles.columns
print(wiki_articles.shape, headers, sep="\n")

(2000, 8)
Index(['id', 'title', 'text', 'url', 'wiki_id', 'views', 'paragraph_id',
       'langs'],
      dtype='object')


## Use Cohere to embed
#### Of all the properties of the page, let's embed the text attribute into a vector

In [22]:
# Ensure the first row of the 'text' column is accessed correctly
# def count_tokens(text):
#     response = co.tokenize(text)
#     return len(response['tokens'])

# # Access the text of the first row using iloc
# print(count_tokens(wiki_articles.iloc[0]["text"]))


In [84]:
def estimate_tokens(text):
    """Approximate the token count of ``text`` as its number of whitespace-separated words.

    Non-string values (for example ``None`` or ``NaN`` from missing data)
    count as 0 tokens instead of raising ``AttributeError``.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())  # Approximate token count


def calculate_article_token(wiki):
    """Add a ``token_count`` column to ``wiki`` and print the total over all rows.

    Args:
        wiki: DataFrame with a ``text`` column. It is modified in place: a
            ``token_count`` column holding ``estimate_tokens`` of each text
            is added (or overwritten).

    Returns:
        None. The total is printed, not returned.
    """
    wiki["token_count"] = wiki["text"].apply(estimate_tokens)
    # Sum over the frame that was passed in, not the notebook-level
    # `wiki_articles` global, so the function works for any frame.
    total_tokens = wiki["token_count"].sum()

    print(f"Total tokens for all articles: {total_tokens}")



calculate_article_token(wiki_articles)


Total tokens for all articles: 133397


### Let's keep only the first 200 rows (dropping the last 1,800) to reduce the token count and bring it below 100,000

In [131]:
# Keep only the first 200 articles (the loaded frame has 2000 rows).
# `.head()` is idempotent, so re-running this cell does not shrink the frame
# any further, whereas slicing off the last 1800 rows again would empty it.
# `.copy()` makes the result an independent frame, so later column
# assignments do not raise SettingWithCopyWarning.
MAX_ARTICLES = 200
wiki_articles = wiki_articles.head(MAX_ARTICLES).copy()

print(wiki_articles.shape)

(200, 8)


### Truncate the text in the text column to reduce the number of tokens per entry

In [132]:
def truncate_text(text, max_tokens=100):
    """Return ``text`` cut down to at most ``max_tokens`` whitespace-separated words.

    Words are obtained with ``str.split()`` and re-joined with single spaces,
    so runs of whitespace in the input collapse to one space in the result.
    """
    return " ".join(text.split()[:max_tokens])

# Truncate every article's text. `assign` builds a new frame, so this never
# writes into a slice of another frame (no SettingWithCopyWarning).
wiki_articles = wiki_articles.assign(text=wiki_articles["text"].apply(truncate_text))

# Recompute and print the (now smaller) token total. The helper prints the
# total itself and returns None, so it is not wrapped in print().
calculate_article_token(wiki_articles)


Total tokens for all articles: 13159
None
nahom1


In [133]:
# Embed the `text` column with Cohere, then store the returned embeddings
# in a new `emb` column.
embed_response = co.embed(
    model="embed-english-v2.0",
    texts=wiki_articles["text"].tolist(),
)
wiki_articles["emb"] = embed_response.embeddings

In [134]:
# Show the frame's column labels (the cell's last expression is displayed).
wiki_articles.columns

Index(['id', 'title', 'text', 'url', 'wiki_id', 'views', 'paragraph_id',
       'langs', 'token_count', 'emb'],
      dtype='object')

## Define the Weaviate Schema and Upload Objects to Weaviate

In [136]:
# NOTE: the schema setup below is commented out, so the only statement this
# cell executes is `client.is_ready()`.
# NOTE(review): presumably the class was created in an earlier run, or is left
# to Weaviate's auto-schema — confirm before running against a fresh instance.
# client.schema.delete_class("TurncatedWikipediaSemanticSearch")

# schema = {
#     "class": "TurncatedWikipediaSemanticSearch",
#     "description": "A collection of Wikipedia articles with Cohere embeddings",
#     "properties": [
#         {"name": "title", "dataType": ["string"], "description": "The title of the article"},
#         {"name": "text", "dataType": ["string"], "description": "The text content of the article"},
#         {"name": "url", "dataType": ["string"], "description": "The URL of the article"},
#         {"name": "wiki_id", "dataType": ["number"], "description": "The Wikipedia ID of the article"},
#         {"name": "views", "dataType": ["number"], "description": "The number of views of the article"},
#         {"name": "paragraph_id", "dataType": ["number"], "description": "The paragraph ID within the article"},
#     ],
#     "vectorizer": "none"
# }


# client.schema.create_class(schema)
client.is_ready()

True

In [137]:
# Show the frame's (rows, columns) shape.
wiki_articles.shape

(200, 10)

In [138]:
from weaviate.util import generate_uuid5

className="TurncatedWikipediaSemanticSearch"
# DataFrame columns that are stored as object properties in Weaviate.
PROPERTY_COLUMNS = ["title", "text", "url", "wiki_id", "views", "paragraph_id"]

# Send the objects in batches instead of one HTTP request per row.
# NOTE: with batching, per-object failures are reported through the batch
# callback rather than raised as exceptions.
client.batch.configure(batch_size=100)
with client.batch as batch:
    for _, row in wiki_articles.iterrows():
        data_obj = {column: row[column] for column in PROPERTY_COLUMNS}
        batch.add_data_object(
            data_object=data_obj,
            class_name=className,
            # Deterministic UUID derived from the object's content, so
            # re-running this cell overwrites the same objects instead of
            # inserting duplicates.
            uuid=generate_uuid5(data_obj),
            # Bring-your-own vector: the Cohere embedding computed for this row.
            vector=row["emb"],
        )

## Visualizing the data along with its embeddings


In [140]:
from my_utils import umap_plot_big

# Collect the per-row embeddings into a single NumPy array.
data_embed = np.array(wiki_articles["emb"].tolist())
# Only the title and text columns are passed along with the embeddings.
data = wiki_articles[["title", "text"]]

data_chart = umap_plot_big(data, data_embed)


In [141]:
# Display the chart returned by umap_plot_big via its `.interactive()` method.
# NOTE(review): assumes `data_chart` is an Altair chart — confirm in my_utils.
data_chart.interactive()

## Semantic Search

In [168]:
def dense_retrieval(query,
                    results_lang="en",
                    properties=None,
                    num_results=10,
                    class_name="TurncatedWikipediaSemanticSearch"):
    """Return the objects whose stored vectors are nearest to the embedded ``query``.

    The query is embedded with Cohere (``embed-english-v2.0``) and sent to
    Weaviate as a ``nearVector`` search.

    Args:
        query: Natural-language search string.
        results_lang: Currently unused. It is kept for interface compatibility;
            no language filter is applied (the objects uploaded in this
            notebook carry no language property).
        properties: Properties to return for each hit. Defaults to
            ``["text", "title", "url", "_additional {distance}"]``.
        num_results: Maximum number of hits to return.
        class_name: Weaviate class to search.

    Returns:
        A list of result dicts, or an empty list if the embedding call or the
        query failed (the error is printed).
    """
    if properties is None:
        # Build the default per call rather than sharing one mutable default list.
        properties = ["text", "title", "url", "_additional {distance}"]

    try:
        query_embedding = co.embed(
            texts=[query],
            model="embed-english-v2.0",
        ).embeddings[0]

        response = (
            client.query
            .get(class_name, properties=properties)
            .with_near_vector({"vector": query_embedding})
            .with_limit(num_results)
            .do()
        )

        # GraphQL-level failures come back as an "errors" entry rather than an
        # exception; report them instead of failing with an opaque KeyError
        # on "data" below.
        errors = response.get("errors")
        if errors:
            print(f"error {errors}")
            return []

        return response["data"]["Get"][class_name]

    except Exception as e:
        # Best-effort helper for interactive use: report the problem and
        # return no results rather than aborting the notebook.
        print(f"error {e}")
        return []


        

In [180]:

# Example query: run a dense retrieval with the default settings.
query="What is ethiopian time zone?"
result=dense_retrieval(query)


# Raw list of hits exactly as returned by dense_retrieval.
print(result)

[{'_additional': {'distance': 0.40590203}, 'text': 'Eastern Time Zone (ET) is the time zone for the eastern part of the Americas including the United States, Canada and parts of South America and the Caribbean. ET is five hours behind UTC in winter, which is called Eastern Standard Time (EST). It is four hours behind UTC during summer Daylight saving time, when it is called Eastern Daylight Time (EDT).', 'title': 'Eastern Time Zone', 'url': 'https://simple.wikipedia.org/wiki?curid=63489'}, {'_additional': {'distance': 0.5194272}, 'text': 'The Central Time Zone subtracts six hours from UTC during standard time (UTC−6) and five hours during daylight saving time (UTC−5).', 'title': 'Central Time Zone', 'url': 'https://simple.wikipedia.org/wiki?curid=50536'}, {'_additional': {'distance': 0.66636634}, 'text': 'The 24-hour clock is a way of telling the time in which the day runs from midnight to midnight and is divided into 24 hours, numbered from 0 to 23. It does not use a.m. or p.m. This s

In [181]:
# print_result comes from the project-local dense_retrieval_utils module (not shown here).
from dense_retrieval_utils import print_result
print_result(result=result)

item 0
_additional:{'distance': 0.40590203}

text:Eastern Time Zone (ET) is the time zone for the eastern part of the Americas including the United States, Canada and parts of South America and the Caribbean. ET is five hours behind UTC in winter, which is called Eastern Standard Time (EST). It is four hours behind UTC during summer Daylight saving time, when it is called Eastern Daylight Time (EDT).

title:Eastern Time Zone

url:https://simple.wikipedia.org/wiki?curid=63489


item 1
_additional:{'distance': 0.5194272}

text:The Central Time Zone subtracts six hours from UTC during standard time (UTC−6) and five hours during daylight saving time (UTC−5).

title:Central Time Zone

url:https://simple.wikipedia.org/wiki?curid=50536


item 2
_additional:{'distance': 0.66636634}

text:The 24-hour clock is a way of telling the time in which the day runs from midnight to midnight and is divided into 24 hours, numbered from 0 to 23. It does not use a.m. or p.m. This system is also referred to 