In [104]:
import os

import numpy as np
import openai
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from pinecone import Pinecone
from sklearn.metrics.pairwise import cosine_similarity




In [105]:
# Load the lyrics table. Set the LYRICS_CSV_PATH environment variable to point at
# the file on another machine; when it is unset the original local path is used.
df = pd.read_csv(os.environ.get('LYRICS_CSV_PATH', '/Users/shuai/Desktop/clear.csv'))

In [106]:
# Render each row as "<title>: <lyrics>" (row-wise apply, one string per song)
formatted_series = df.apply(
    lambda song: f"{song['title']}: {song['lyrics']}",
    axis=1,
)

# Stitch the per-song strings together, one per line, and report the total size
data = '\n'.join(formatted_series)
total_characters = len(data)
print(total_characters)


557284


In [109]:
def get_chunks(text, chunk_size=1250, chunk_overlap=200):
    """
    Split raw text into overlapping chunks along newline boundaries.

    Args:
        text (str): The raw text to split.
        chunk_size (int): Chunk length, in characters, passed to the splitter
            (default 1250).
        chunk_overlap (int): Number of characters of overlap requested between
            consecutive chunks (default 200).

    Returns:
        chunks (list): The list of chunks of text
    """

    # Newline is the only separator, so chunk boundaries always fall between lines.
    # NOTE(review): presumably a single line longer than chunk_size is kept whole
    # by the splitter -- confirm against the langchain version in use.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure chunk length in characters
    )

    return splitter.split_text(text)

# Split the combined lyrics text into overlapping chunks; these are what get embedded
chunk_data = get_chunks(data)


def get_embeddings(chunk_data, model="text-embedding-3-small"):
    """
    Get the embedding vectors for the chunk data

    Arg:
    - chunk_data: a list of chunk data (text strings)
    - model: name of the OpenAI embedding model to use

    Return:
    - Embedded vectors, one per response item, in the order the API returned them;
      an empty list when chunk_data is empty

    Raises:
    - KeyError: if the OPENAI_API_KEY environment variable is not set

    """
    # Nothing to embed: skip the network call entirely.
    if len(chunk_data) == 0:
        return []

    # The key is read from the environment so it never lives in the notebook.
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.embeddings.create(
        input=chunk_data,
        model=model,
    )

    return [item.embedding for item in response.data]

# Embed every chunk; vectors_list[i] is the embedding of chunk_data[i]
vectors_list = get_embeddings(chunk_data)

# Connect to the Pinecone index. The API key is read from the PINECONE_API_KEY
# environment variable so that it is never stored in the notebook.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("openai153")

# Store vectors in vector database
def vector_store(vectors_list, texts=None, target_index=None, batch_size=100):
    """
    Upsert embedding vectors, with their source text as metadata, in batches.

    Each record gets the id 'vec_<i>', where i is the vector's position in
    vectors_list, and metadata {"text": texts[i]}.

    Args:
    - vectors_list: list of embedding vectors
    - texts: chunk texts aligned with vectors_list; defaults to the module-level
      chunk_data
    - target_index: object exposing upsert(vectors=...); defaults to the
      module-level index
    - batch_size: maximum number of records sent per upsert call

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Fall back to the notebook-level globals so existing one-argument calls still work.
    if texts is None:
        texts = chunk_data
    if target_index is None:
        target_index = index

    records = [
        {
            'id': f'vec_{i}',
            'values': vector,
            'metadata': {"text": texts[i]}
        }
        for i, vector in enumerate(vectors_list)
    ]

    # One request per batch instead of one request per vector.
    for start in range(0, len(records), batch_size):
        target_index.upsert(vectors=records[start:start + batch_size])

# Upload every embedding, with its chunk text as metadata, to the Pinecone index
vector_store(vectors_list)





# if __name__ == "__main__":
#     chunk_data = get_chunks(data)
#     vectors_list = get_embeddings(chunk_data)
#     pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
#     index = pc.Index("openai")
#     vector_store(vectors_list)
#     context = 

In [121]:
query = 'create a poem for me in the style of Bob Dylan in this topic: Rivers of Time'

# Create the client in this cell: no module-level `client` exists before this point
# when the notebook is run top to bottom, so the call below would raise NameError.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
query_embedding = client.embeddings.create(
    input=query,
    model="text-embedding-3-small"
)
# Kept as a list of vectors (2-D): semantic_search passes it straight to cosine_similarity
query_vector = [item.embedding for item in query_embedding.data]

def retrieve_embedding(index, num_embed):
    """
    Fetch the stored vectors 'vec_0' .. 'vec_<num_embed - 1>' and tabulate them.

    Args:
    - index: vector database handle exposing fetch(ids)
    - num_embed: total number of vectors to fetch from the vector database

    Return:
    - a pandas DataFrame with columns 'id' (integer position), 'values'
      (the embedded vector) and 'text' (the chunk text stored as metadata)
    """
    vector_ids = [f'vec_{position}' for position in range(num_embed)]

    # A single fetch call retrieves every requested record
    fetched = index.fetch(vector_ids)

    texts = []
    vectors = []
    for vector_id in vector_ids:
        record = fetched['vectors'][vector_id]
        texts.append(record['metadata']['text'])
        vectors.append(record['values'])

    # Column order matches the original layout: id, values, text
    table = {"id": list(range(num_embed)), "values": vectors, "text": texts}
    return pd.DataFrame(table)


# Pull back one stored record per embedding in vectors_list as a DataFrame (id, values, text)
embedding_data = retrieve_embedding(index,len(vectors_list))

def semantic_search(query_vector, db_embeddings, top_k=10):
    """
    Find the vectors which have the highest cosine similarity with the query vector

    Args:
    - query_vector: embedded vector of user query, either a flat vector or a
      single-row 2-D list/array
    - db_embeddings: embedded vectors from vector database, one vector per row
    - top_k: number of most similar vectors to return (default 10)

    Return:
    - numpy array with the row indices of the top_k most similar vectors, most
      similar first; shorter than top_k when db_embeddings has fewer rows

    Raises:
    - ValueError: if top_k is negative
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # cosine_similarity needs 2-D input; promote a flat query vector to a single row
    query_matrix = np.atleast_2d(query_vector)
    similarities = cosine_similarity(query_matrix, db_embeddings)[0]

    # Reverse the ascending argsort first, then slice: the best match comes first
    # and top_k=0 yields an empty result rather than every index.
    return np.argsort(similarities)[::-1][:top_k]

# Rank the in-memory embeddings against the query; position i in vectors_list
# corresponds to the record stored under the id 'vec_<i>'
top_10_indices = semantic_search(query_vector, vectors_list)


def get_text(embedding_data, top_10_indices):
    """
    Join the 'text' entries of the selected rows into one string.

    Args:
    - embedding_data (DataFrame): DataFrame containing columns 'id', 'values', and 'text'.
    - top_10_indices (list or array): Index labels of the rows whose text is wanted.

    Returns:
    - combined_text (str): The selected texts, in the given order, separated by single spaces.
    """
    # Label-based lookup keeps the texts in the order the labels were supplied
    chosen_rows = embedding_data.loc[top_10_indices, 'text']
    return ' '.join(chosen_rows.tolist())

# Text of the top-ranked chunks joined into one string; response_rag reads this global
context = get_text(embedding_data, top_10_indices)


In [122]:
# Shared OpenAI client for the chat-completion helpers below. The API key is read
# from the OPENAI_API_KEY environment variable rather than stored in the notebook.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def response_rag(user_input):
    """
    Answer a user request with the retrieved context included in the conversation.

    Reads the module-level `client` and `context`.

    Args:
    - user_input: question or request given by the user

    Returns:
    - content of the first completion choice returned by the chat model
    """
    create_completion = client.chat.completions.create

    # The retrieved text is supplied as an assistant message placed before the user turn
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "assistant", "content": context},
        {"role": "user", "content": user_input},
    ]

    completion = create_completion(model="gpt-3.5-turbo", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Generate the answer with retrieved context; the returned string is shown as the cell output
response_rag('create a poem for me in the style of Bob Dylan in this topic: Rivers of Time')

"Down the rivers of time, we sail and we roam\nThrough the twisting waters, we find our way home\nMemories like echoes, linger in the air\nWhispering secrets of joys and despair\n\nThe current keeps moving, never looking back\nCarrying us forward on an endless track\nThrough valleys of laughter and mountains of tears\nWe navigate bravely, overcoming our fears\n\nEach bend brings a story, a lesson to find\nIn the rivers of time, where the past intertwines\nWith the present unfolding, a mystery untold\nIn the flow of existence, where destinies mold\n\nSo let's navigate softly, with hearts open wide\nEmbrace every moment, let love be our guide\nFor in the rivers of time, where we ebb and we flow\nWe discover the beauty of life's sacred glow\n\nSo sail on, my friend, in the rivers of time\nEmbrace every twist and turn, and make each moment shine\nFor in the depths of the waters, where the soul finds its rhyme\nWe dance with the rhythm of love's infinite chime"

In [123]:
def response_wo_rag(user_input):
    """
    Answer a user request without any retrieved context (baseline for comparison).

    Reads the module-level `client`.

    Args:
    - user_input: question or request given by the user

    Returns:
    - content of the first completion choice returned by the chat model
    """
    create_completion = client.chat.completions.create

    # Only the system prompt and the user's message are sent
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_input},
    ]

    completion = create_completion(model="gpt-3.5-turbo", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Same prompt without retrieved context, for comparison with the response_rag output
response_wo_rag('create a poem for me in the style of Bob Dylan in this topic: Rivers of Time')

"In the rivers of time, we all do flow\nUpon its currents, we come and we go\nA never-ending journey, twisting and turning\nAs the days pass by, the flames keep burning\n\nFrom the cradle to the grave, we journey on\nThrough the trials and tribulations we're drawn\nThe rivers of time, they shape and define\nEach moment cherished, each memory sublime\n\nAs we walk this road, we learn and we grow\nThrough the highs and lows, the joys and woe\nThe rivers of time, they carry us far\nGuiding us gently like a guiding star\n\nSo let us embrace the journey ahead\nWith open hearts and minds, let us tread\nThe rivers of time will always flow\nBut in its depths, our stories will glow"