In [6]:
import os

import numpy as np
import openai
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from pinecone import Pinecone
from sklearn.metrics.pairwise import cosine_similarity




In [7]:
# Location of the lyrics CSV; the following cell reads its 'title' and 'lyrics'
# columns. Set the LYRICS_CSV environment variable to point at the file on
# another machine; the original local path is kept as the fallback.
LYRICS_CSV = os.environ.get('LYRICS_CSV', '/Users/shuai/Desktop/clear.csv')
df = pd.read_csv(LYRICS_CSV)

In [8]:
# Render each song as "<title>: <lyrics>". A plain comprehension over the two
# columns avoids the per-row overhead of DataFrame.apply(axis=1) and always
# produces a Series, whereas apply on a frame with no rows can hand back a
# DataFrame (which would make the join below emit column names instead).
formatted_series = pd.Series(
    [f"{title}: {lyrics}" for title, lyrics in zip(df['title'], df['lyrics'])],
    index=df.index,
    dtype=object,
)

# Combine all formatted strings into a single string, separated by new lines
data = '\n'.join(formatted_series)
print(len(data))


557284


In [9]:
def get_chunks(text, chunk_size=1250, chunk_overlap=200):
    """
    Split raw text into overlapping chunks, breaking only at new lines.

    Args:
        text (str): The raw text to split
        chunk_size (int): Target chunk length in characters, measured with len()
        chunk_overlap (int): Target number of characters repeated between
            consecutive chunks

    Returns:
        chunks (list): The list of chunks of text (empty when text is empty)
    """
    # Nothing to split, so skip building the splitter altogether.
    if not text:
        return []

    # Initialize the text splitter
    splitter = CharacterTextSplitter(
        separator="\n",               # Split the text by new line
        chunk_size=chunk_size,        # Defaults to 1250 characters per chunk
        chunk_overlap=chunk_overlap,  # Defaults to 200 characters of overlap
        length_function=len,          # Lengths are counted in characters
    )

    # Get the chunks of text
    return splitter.split_text(text)

# Split the combined "title: lyrics" text into overlapping chunks.
chunk_data = get_chunks(text=data)


def get_embeddings(chunk_data):
    """
    Get the embedding vectors for the chunk data

    Arg:
    - chunk_data: a list of text chunks to embed

    Return:
    - A list with one embedding vector per item in the API response
      (an empty list when chunk_data is empty)

    Raises:
    - KeyError: if the OPENAI_API_KEY environment variable is not set
    """
    # No chunks means nothing to embed, so do not issue a request at all.
    if not chunk_data:
        return []

    # The key is read from the environment instead of being embedded in the
    # notebook; any key that was previously committed here should be rotated.
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.embeddings.create(
        input=chunk_data,
        model="text-embedding-3-small",
    )

    return [item.embedding for item in response.data]

vectors_list = get_embeddings(chunk_data)

# Read the Pinecone key from the environment instead of embedding it in the
# notebook (a missing PINECONE_API_KEY fails fast with KeyError); any key that
# was previously committed here should be rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("openai153")

# Store vectors in vector database
def vector_store(vectors_list, texts=None, batch_size=100):
    """
    Upsert embedding vectors into the module-level Pinecone `index`.

    The record at position i is stored under the id 'vec_<i>' with its
    embedding as 'values' and {"text": texts[i]} as metadata.

    Args:
    - vectors_list: list of embedding vectors
    - texts: chunk texts aligned with vectors_list; defaults to the
      module-level `chunk_data`
    - batch_size: number of records sent per upsert request

    Raises:
    - ValueError: if batch_size is below 1, or if there are fewer texts than
      vectors (checked before anything is written)
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Nothing to store; return before touching the index or the chunk texts.
    if not vectors_list:
        return

    if texts is None:
        texts = chunk_data

    total = len(vectors_list)
    if len(texts) < total:
        raise ValueError(
            f"got {total} vectors but only {len(texts)} texts to attach as metadata"
        )

    # Send records in batches rather than one request per vector.
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        index.upsert(
            vectors=[
                {
                    'id': f'vec_{i}',
                    'values': vectors_list[i],
                    'metadata': {"text": texts[i]}
                }
                for i in range(start, stop)
            ],
        )

# Write every chunk embedding, with its chunk text as metadata, to the index.
vector_store(vectors_list=vectors_list)




In [10]:
# OpenAI client used by the query-embedding and chat-completion cells.
# The key is read from the environment instead of being embedded in the
# notebook (a missing OPENAI_API_KEY fails fast with KeyError); any key that
# was previously committed here should be rotated.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Topics for the poems to generate; each one becomes a retrieval query below.
poems_title = [
    "Whispers of the Wind",
    "Echoes of the Forgotten",
    "Shadows at Noon",
    "Rivers of Time",
    "Beneath the Harvest Moon",
    "Silent Symphony",
    "Dancing in the Rain",
    "Embers of Yesterday",
    "Paths Untrodden",
    "Midnight's Serenade",
    "Shadows in the Moonlight",
    "Echoes of Forgotten Dreams",
    "Tides of Change",
    "Beneath the Velvet Sky",
    "Dancing with the Stars",
    "In the Heart of the Storm",
    "Serenade of the Sea",
    "Flames of Desire",
    "Through the Eyes of Time",
    "Whispers Among the Ruins",
    "Glimpses of Eternity",
    "Rhythms of the Rain",
    "Veils of Mist",
    "Embers of a Fading Sun",
    "Bridges Over Silent Waters"
]

# One natural-language request per topic, in the same order as poems_title.
queries = [
    f"create a poem for me in the style of Bob Dylan on the topic: {poem_title}"
    for poem_title in poems_title
]

# Embed all queries with a single request instead of one request per query.
query_response = client.embeddings.create(
    input=queries,
    model="text-embedding-3-small"
)

# Each response item carries the position of its input; sort on it so that
# query_vectors[i] always belongs to queries[i]. Every entry is kept as a
# one-row nested list ([embedding]) because semantic_search hands it straight
# to cosine_similarity, which works on 2-D inputs.
query_vectors = [
    [item.embedding]
    for item in sorted(query_response.data, key=lambda item: item.index)
]



In [11]:
def retrieve_embedding(index, num_embed, batch_size=100):
    """
    Convert the information of vectors in the database into a pandas DataFrame

    Args:
    - index: vector database index object exposing fetch(ids) (already set up)
    - num_embed: number of vectors to read; the ids 'vec_0' through
      'vec_<num_embed - 1>' are requested
    - batch_size: number of ids requested per fetch call

    Return:
    - a DataFrame with the columns 'id' (integer position i of 'vec_<i>'),
      'values' (the stored embedding) and 'text' (the 'text' metadata entry),
      one row per vector in increasing id order

    Raises:
    - ValueError: if batch_size is below 1
    - KeyError: if a requested id is absent from a fetch response
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Initialize a dictionary to store embedding data
    embedding_data = {"id": [], "values": [], "text": []}

    # Fetch the embeddings in batches: listing every id in a single request
    # may exceed the service's per-request limits, and with num_embed == 0 no
    # request is needed at all.
    for start in range(0, num_embed, batch_size):
        positions = range(start, min(start + batch_size, num_embed))
        fetched = index.fetch([f'vec_{i}' for i in positions])['vectors']

        for i in positions:
            record = fetched[f"vec_{i}"]
            embedding_data["id"].append(i)
            embedding_data["text"].append(record['metadata']['text'])
            embedding_data["values"].append(record['values'])

    return pd.DataFrame(embedding_data)


# Read every stored vector and its chunk text back from the index as a DataFrame.
embedding_data = retrieve_embedding(index=index, num_embed=len(vectors_list))

def semantic_search(query_vector, db_embeddings, top_k=10):
    """
    Find the top_k vectors which have the highest cosine similarity with the query vector

    Args:
    - query_vector: embedded vector of user query, given as a 2-D (one row)
      array-like; only the first row is scored
    - db_embeddings: embedded vectors to search, one row per vector
    - top_k: how many of the most similar vectors to return (default 10)

    Return:
    - NumPy array with the row positions in db_embeddings of the top_k most
      similar vectors, ordered from most to least similar (fewer than top_k
      entries when db_embeddings has fewer rows)

    Raises:
    - ValueError: if top_k is below 1
    """
    # A slice of [-0:] would silently return every index, so reject it up front.
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    similarities = cosine_similarity(query_vector, db_embeddings)[0]

    # argsort is ascending: keep the last top_k positions, then reverse them so
    # the best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return top_indices

# Rank the locally held chunk embeddings against each query embedding and keep
# the resulting index arrays in query order.
top_10_indices_list = [
    semantic_search(query_embedding, vectors_list)
    for query_embedding in query_vectors
]

def get_text(embedding_data, top_10_indices):
    """
    Join the chunk texts of the selected rows into one context string.

    Args:
    - embedding_data (DataFrame): DataFrame with a 'text' column.
    - top_10_indices (list-like): Row labels of embedding_data to pull, in the
      order the texts should appear.

    Returns:
    - combined_text (str): The selected texts joined by single spaces.
    """
    # Label-based lookup keeps the texts in the order given by top_10_indices.
    picked_texts = embedding_data.loc[top_10_indices, 'text']

    return ' '.join(picked_texts.tolist())

# Build one combined context string per query from its top-ranked chunks.
contexts = [
    get_text(embedding_data, top_indices)
    for top_indices in top_10_indices_list
]



In [12]:
# The system prompt is identical for every poem, so it is built once. It is
# spelled out line by line so its leading and trailing whitespace stays
# visible and survives editors that strip trailing spaces.
system_prompt = (
    "\n"
    "        you are a Bob Dylan poetry generator bot. \n"
    "        Please generate a poem in Bob Dylan's style. The topic is: \n"
    "        "
)

# NOTE(review): the retrieved lyrics are sent with the "assistant" role, i.e.
# as if they were an earlier reply of the model - confirm this is intended
# rather than supplying them as reference material in a system/user message.
rag_response_list = []
for title, context in zip(poems_title, contexts):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": title},
        {"role": "assistant", "content": context},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )

    # Store each poem as a single-entry dict keyed by "<title>: ".
    poem = completion.choices[0].message.content
    rag_response_list.append({title + ": ": poem})



In [13]:
# Show the generated poems as the cell output: a list holding one
# {"<title>: ": poem_text} dict per topic.
rag_response_list

[{'Whispers of the Wind: ': "Whispers of the wind, secrets it brings\n \n Carrying stories of forgotten things\n \n Through the valleys and over the hills\n \n It speaks in whispers that give us chills\n \n \n \n The wind it howls, the wind it cries\n \n Telling tales of love and goodbyes\n \n It carries memories of days gone by\n \n And echoes of dreams that reach the sky\n \n \n \n Listen close to the whispers of the wind\n \n For in its voice, truths you'll find\n \n It speaks of love, loss, and everything in between\n \n Whispering secrets that have always been seen\n \n \n \n So heed the whispers that the wind does send\n \n For in its words, lessons to comprehend\n \n It carries the wisdom of the ages untold\n \n Whispers of the wind, a treasure to behold"},
 {'Echoes of the Forgotten: ': 'In the echoes of the forgotten, stories untold\nWhispers of the past in the wind, gently unfold\nMemories flicker like shadows in the night\nLost in the depths of time, out of sight\n\nIn the q

In [14]:
import json

# Persist the generated poems as JSON text next to the notebook.
with open('poems_rag_gpt35.json', 'w') as out_file:
    out_file.write(json.dumps(rag_response_list))