In [1]:
from openai import OpenAI
import pandas as pd

# The API key is deliberately not written into the notebook. When no api_key
# argument is given, the OpenAI client reads it from the OPENAI_API_KEY
# environment variable, so set that before starting the kernel.
client = OpenAI()

#Get the embedding vectors from OAI API
def fetch_oai_embeddings(input_strings: list,
                         embedding_model: str,
                         oai_client: "OpenAI",
                         batch_size: int = 1) -> list:
    """
    Get the embedding vectors from the OpenAI API.

    This function takes a list of text inputs and retrieves their corresponding
    embedding vectors using the specified OpenAI embedding model. The function
    returns a list of embedding vectors in the same order as the inputs.

    Parameters:
    -----------
    input_strings : list
        A list of text strings for which to generate embeddings. A single bare
        string is treated as a one-element list.
    embedding_model : str
        The name of the OpenAI embedding model to use (e.g., "text-embedding-3-small").
    oai_client : OpenAI
        An authenticated OpenAI client instance.
    batch_size : int, optional
        How many strings to send per API request. The default of 1 issues one
        request per string. Larger values reduce the number of round trips, but
        the API limits how many inputs and tokens a single request may carry,
        so check the embeddings API reference before raising it.

    Returns:
    --------
    list
        A list of embedding vectors, where each vector corresponds to an input string.
        An empty input yields an empty list without calling the API.

    Raises:
    -------
    ValueError
        If batch_size is smaller than 1.

    Example:
    --------
    >>> client = OpenAI(api_key="your_api_key")
    >>> texts = [
    ...     'I love embeddings, they make so much sense',
    ...     'Embeddings are useful for various applications',
    ...     'Understanding embeddings can enhance machine learning models'
    ... ]
    >>> embeddings = fetch_oai_embeddings(texts, "text-embedding-3-small", client)
    >>> print(embeddings)
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A str is itself iterable, so without this guard a bare string would be
    # embedded one character at a time.
    if isinstance(input_strings, str):
        input_strings = [input_strings]
    else:
        # Materialise so that any iterable can be sliced into batches.
        input_strings = list(input_strings)

    embeddings = []

    for start in range(0, len(input_strings), batch_size):
        batch = input_strings[start:start + batch_size]
        response = oai_client.embeddings.create(
            input=batch,
            model=embedding_model)
        # Every returned item records the position of its input within the
        # request; sorting on it keeps the output aligned with the input order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    return embeddings



In [25]:
# Smoke test: embed three short sentences to check that the client and the
# helper function work end to end.
text = [
    'I love embeddings, they make so much sense',
    'Embeddings are useful for various applications',
    'Understanding embeddings can enhance machine learning models',
]

test_embs = fetch_oai_embeddings(
    input_strings=text,
    embedding_model="text-embedding-3-small",
    oai_client=client,
)

In [3]:
import pandas as pd

# NOTE(review): this is an absolute path into one user's Desktop, so the cell
# only runs on that machine as written.
test_set_csv = '/Users/adamfletcher/Desktop/JA_PP - Test Set_220524.csv'
df = pd.read_csv(test_set_csv)

# Preview the first five rows of the labelled test set.
df.head(5)

Unnamed: 0.1,Unnamed: 0,organization,gkg_record_id,document_identifier,company_sentiment,article_sentiment,company_relevance
0,Facebook20231116140000-1898https://slguardian....,Facebook,20231116140000-1898,https://slguardian.org/afghanistan-collapse-of...,neutral,very negative,very weak
1,Marawa Research and Exploration20231116104500-...,Marawa Research and Exploration,20231116104500-493,https://www.mining.com/web/the-promise-and-ris...,negative,negative,weak
2,Rio Tinto20231116104500-493https://www.mining....,Rio Tinto,20231116104500-493,https://www.mining.com/web/the-promise-and-ris...,negative,negative,weak
3,Blue Minerals Jamaica20231116104500-493https:/...,Blue Minerals Jamaica,20231116104500-493,https://www.mining.com/web/the-promise-and-ris...,negative,negative,weak
4,Metals Co20231116104500-493https://www.mining....,Metals Co,20231116104500-493,https://www.mining.com/web/the-promise-and-ris...,negative,negative,weak


In [26]:


def trim_text_to_600_words(text: str, max_words: int = 600) -> str:
    """
    Trim a string to at most its first ``max_words`` whitespace-separated words.

    The text is split on runs of whitespace and re-joined with single spaces,
    so newlines and repeated spaces are collapsed even when nothing is cut off.

    Word count is used here as a cheap stand-in for token count when preparing
    input for models with a maximum input length, such as the OpenAI embedding
    models. As a rough rule of thumb one token is about 0.75 English words, so
    600 words is on the order of 800 tokens. This is an approximation, not a
    guarantee.

    Parameters:
    - text (str): The text to trim.
    - max_words (int): Maximum number of words to keep. Defaults to 600.

    Returns:
    - str: The first ``max_words`` words of ``text`` joined by single spaces.

    Raises:
    - ValueError: If ``max_words`` is negative.

    Example:
    >>> trim_text_to_600_words("one  two\\nthree", max_words=2)
    'one two'
    """
    if max_words < 0:
        # A negative slice bound would silently drop words from the end instead.
        raise ValueError("max_words must be non-negative")

    # Slicing past the end of a list is safe, so no length check is needed.
    return ' '.join(text.split()[:max_words])


In [24]:
import os

# Embed the text of every article referenced in `df`, producing one row per
# gkg_record_id whose article text file exists on disk.
article_dir = "/Users/adamfletcher/Desktop/article_text"

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating inside the loop re-copies the whole frame every iteration.
emb_records = []

for doc_id in df['gkg_record_id'].unique():
    # Skip placeholder ids: the literal string 'nan' and any float value
    # (presumably NaN from empty cells in the CSV — verify against the data).
    if doc_id == 'nan' or isinstance(doc_id, float):
        continue

    file_path = os.path.join(article_dir, f"{doc_id}.txt")
    if not os.path.exists(file_path):
        print(f"{file_path} not found")
        continue

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        content = f.read()

    content = trim_text_to_600_words(content)
    embs = fetch_oai_embeddings([content],
                                "text-embedding-3-small",
                                client)
    # One input string was sent, so the first vector is the only one returned.
    emb_records.append({'gkg_record_id': doc_id, 'oai_embeddings': embs[0]})

# Passing columns= keeps both columns present even when nothing was embedded.
df_embs = pd.DataFrame(emb_records, columns=['gkg_record_id', 'oai_embeddings'])



Unnamed: 0,gkg_record_id,oai_embeddings


/Users/adamfletcher/Desktop/article_text/20231101230000-1352.txt not found
/Users/adamfletcher/Desktop/article_text/20231108194500-2193.txt not found
/Users/adamfletcher/Desktop/article_text/20230926160000-926.txt not found
/Users/adamfletcher/Desktop/article_text/20150321104500-913.txt not found
/Users/adamfletcher/Desktop/article_text/20170905171500-999.txt not found
/Users/adamfletcher/Desktop/article_text/20171115103000-T60.txt not found
/Users/adamfletcher/Desktop/article_text/20200128191500-2296.txt not found
/Users/adamfletcher/Desktop/article_text/20200924150000-T1243.txt not found
/Users/adamfletcher/Desktop/article_text/20191118090000-788.txt not found
/Users/adamfletcher/Desktop/article_text/20161130053000-T2477.txt not found
/Users/adamfletcher/Desktop/article_text/20200128204500-2558.txt not found
/Users/adamfletcher/Desktop/article_text/20221207001500-429.txt not found
/Users/adamfletcher/Desktop/article_text/20160525074500-1453.txt not found
/Users/adamfletcher/Desktop/a

In [19]:
# Attach each article's embedding to every row of `df` that references it.
# The left join keeps rows whose article has no embedding; those rows get NaN
# in the oai_embeddings column.
df_embs = df_embs.reset_index(drop=True)

# validate= makes the merge raise if df_embs ever contains the same
# gkg_record_id twice, which would otherwise silently duplicate rows of df.
df_full = df.merge(df_embs, on='gkg_record_id', how='left', validate='many_to_one')

# index=False: the row index is only a positional counter, and writing it out
# adds another "Unnamed: 0"-style column every time the CSV is re-read and saved.
df_full.to_csv('articles_with_embeddings.csv', index=False)

(50, 2)