# Embed tweet data and store in a ChromaDB vector database

The goal of this notebook is to store all the tweet data in the vector database. This data will then be used by the RAG system to answer customers' tweets.

## Import libraries

In [None]:
import uuid
import os
import configparser
import pandas as pd

import chromadb
import chromadb.utils.embedding_functions as embedding_functions


# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check and removes the gap between checking and creating.
os.makedirs('../output', exist_ok=True)

## Get data

In [None]:
# Load the cleaned tweet dataset from the data directory into a DataFrame
df = pd.read_csv("../data/twitter_data_clean.csv")

Let's suppose we are building a tool for the AmazonHelp customer service team, so we keep only the AmazonHelp tweets.

In [None]:
# Keep only the AmazonHelp rows. The explicit copy makes `df` an independent
# DataFrame, so adding a column to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[df.company == 'AmazonHelp'].copy()

In [None]:
# Show the (rows, columns) dimensions of the filtered DataFrame
df.shape

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Generate a unique id (a random UUID4 string) for each tweet row.
# Only the row count matters here, so there is no need to build a list of tweets.
df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]

In [None]:
# Sanity check: list the rows whose customer tweet is missing or empty.
# Empty CSV fields are read by pandas as NaN, which `.str.len() == 0` alone
# does not match, so missing values are checked explicitly as well.
df[df.customer_tweet.isna() | (df.customer_tweet.str.len() == 0)]

In [None]:
# Save the data used for this project (the filtered rows plus the generated ids)
# to the output directory; index=False keeps the DataFrame index out of the file.
df.to_csv('../output/twitter_data_clean_Amazon.csv', index=False)

## Initialize ChromaDB

In [None]:
# Reload the AmazonHelp subset from the output directory, which is where the
# notebook writes twitter_data_clean_Amazon.csv.
df = pd.read_csv('../output/twitter_data_clean_Amazon.csv')

In [None]:
# Load the OpenAI API key from config.ini
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so an empty result means config.ini was not found.
# Failing here gives a clearer error than the KeyError the lookup would raise.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")
openai_api_key = config['OPENAI_API']['OPENAI_KEY']

In [None]:
# Build the embedding function that ChromaDB will use when storing data.
# The embedding function takes text as input, and performs tokenization and embedding.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-small",
)

In [None]:
# Initialize the ChromaDB client with on-disk persistence rooted at ../chromadb,
# so the stored collections are kept between sessions.
client = chromadb.PersistentClient(path="../chromadb")

In [None]:
# Start from a clean slate: drop the collection left over from a previous run.
# delete_collection raises when the collection does not exist (e.g. on the very
# first run), so it is only called when the collection is actually present.
# list_collections() yields Collection objects or plain names depending on the
# chromadb version; getattr(..., "name", c) handles both.
existing_collections = {getattr(c, "name", c) for c in client.list_collections()}
if "tweet_amazon_collection" in existing_collections:
    client.delete_collection(name="tweet_amazon_collection")

In [None]:
# Fetch the Amazon tweet collection, creating it if it does not exist yet.
# It embeds with the OpenAI embedding function and uses cosine distance.
collection = client.get_or_create_collection(
    name="tweet_amazon_collection",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},
)

## Embedding and storing into ChromaDB

In [None]:
"""
This cell embeds all the tweets and stores them in the vector database.
The data is processed in chunks to avoid hitting the OpenAI API rate limit for the embedding model.
"""
def chunk_dataframe(df: pd.DataFrame, chunk_size: int):
    """Yield successive row-wise chunks of ``df``.

    Args:
        df: DataFrame to split. It is sliced positionally and not modified.
        chunk_size: Maximum number of rows per chunk; must be positive.

    Yields:
        Consecutive slices of ``df`` holding ``chunk_size`` rows each, except
        the last one, which holds the remaining rows. An empty ``df`` yields
        nothing.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently skip every row,
    # so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start:start + chunk_size]
        
# Number of tweets sent to the collection per call
chunk_size = 1000

# Walk through the DataFrame one chunk at a time and add each chunk to the collection.
for i, chunk in enumerate(chunk_dataframe(df, chunk_size)):
    print(f"Processing chunk {i}")

    # Customer tweets are passed as the documents; the matching company reply
    # is attached to each one as metadata.
    customer_tweets = chunk["customer_tweet"].to_list()
    company_tweets = [{"company_tweet": reply} for reply in chunk["company_tweet"]]
    ids_chunk = chunk["id"].to_list()

    # Store the current chunk
    collection.add(documents=customer_tweets, metadatas=company_tweets, ids=ids_chunk)

In [None]:
# Check how many entries the collection now holds
collection.count()

In [None]:
# Look at a small sample of the stored entries as a sanity check
collection.peek()