In [138]:
import functools
import os
import time

import openai
import openpyxl
import pandas as pd
import requests
from dotenv import load_dotenv
from goose3 import Goose
from newsapi import NewsApiClient

from gather_news import create_company_articles_workbook

In [139]:
# Load variables from a .env file into the process environment; the API keys
# are read from the environment with os.getenv in the next cell.
load_dotenv()

True

In [140]:
# Read both API keys from environment variables instead of hard-coding them.
# os.getenv returns None when a variable is unset, so a missing key is not
# reported here.
openai.api_key = os.getenv('OPENAI_API_KEY')
newsapi = NewsApiClient(api_key=os.getenv("NEWS_API_KEY"))

In [18]:
# Imported from gather_news; its implementation is not part of this notebook.
# NOTE(review): presumably builds the article workbook that is loaded in the
# next cell -- confirm. This cell's execution count is out of sequence with its
# neighbours, so also confirm it is meant to be re-run on every pass.
create_company_articles_workbook()

Publish date Published: Feb. 14, 2023 at 12:10 p.m. ET could not be resolved to UTC
Publish date on Monday January 30, 2023 @05:02PM could not be resolved to UTC
Publish date Last Updated: Jan. 28, 2023 at 9:49 a.m. ET could not be resolved to UTC


In [141]:
# Load the article table from the Excel workbook. get_trimmed_articles reads
# its 'title', 'date_published', 'timeout' and 'content' columns.
articles = pd.read_excel('sample_fine_tune.xlsx')

In [143]:
# wrapper for rate limit exponential delay
def ratelimiterror_exponential_backoff(
    func,
    init_delay=60,
    expo_base=2,
    errors=(openai.error.RateLimitError),
    max_retries=5
):
    """Decorate ``func`` so it is retried with exponential backoff.

    Intended to be used bare (``@ratelimiterror_exponential_backoff``), in
    which case the defaults below apply.

    Args:
        func: The callable to wrap.
        init_delay: Seconds to sleep before the first retry.
        expo_base: Factor the delay is multiplied by after every retry.
        errors: Exception class, or tuple of classes, that triggers a retry.
            The default is a single class: the parentheses around it do not
            make a tuple. Pass a real tuple to retry on several types.
        max_retries: Number of retries allowed before giving up.

    Returns:
        A wrapper with the same call signature as ``func`` that returns
        whatever ``func`` returns.

    Raises:
        Exception: When ``func`` has raised one of ``errors`` more than
            ``max_retries`` times. The last such error is attached as
            ``__cause__``. Any other exception raised by ``func`` propagates
            immediately and is never retried.

    With the defaults the wrapper sleeps 60, 120, 240, 480 and 960 seconds
    (about 31 minutes in total) before giving up.
    """
    # Keep the wrapped function's __name__ and __doc__ so decorated functions
    # stay identifiable in tracebacks and help().
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        retries = 0
        delay = init_delay
        while True:
            try:
                return func(*args, **kwargs)
            except errors as e:
                retries += 1
                if retries > max_retries:
                    # Chain the triggering error so the root cause is not lost.
                    raise Exception(f'Exceeded maximum number of retries ({max_retries})') from e

                print(f'RateLimitError: Sleeping for {delay} seconds')
                time.sleep(delay)
                delay *= expo_base
                print(f'Next error wait time is {delay} seconds. {max_retries - retries} retries remaining')
    return wrapper
                

In [204]:
@ratelimiterror_exponential_backoff
def generate_questions(prompt):
    """Ask text-davinci-003 for three questions about an article.

    Args:
        prompt: Article text to embed in the request.

    Returns:
        A ``(questions, response)`` tuple. ``questions`` is the completion
        text with the leading ``'1.'`` restored, because the request prompt
        already ends with ``1.``. ``response`` is the raw API response.
    """
    request_text = f'Write three questions based on the article below\nArticle: {prompt}\nQuestions:\n1.'
    completion_params = dict(
        model='text-davinci-003',
        prompt=request_text,
        max_tokens=750,
        temperature=1,
        top_p=1,
        presence_penalty=0.5,
        frequency_penalty=0.5,
        # A blank line ends the completion.
        stop=['\n\n'],
    )
    response = openai.Completion.create(**completion_params)
    # Fixed pause after every successful request (presumably to space out
    # calls and stay under the API rate limit).
    time.sleep(3.5)
    completion_text = response['choices'][0]['text']
    return '1.' + completion_text, response

@ratelimiterror_exponential_backoff
def generate_answers(context, questions):
    """Ask text-davinci-003 to answer ``questions`` using ``context``.

    Args:
        context: Article text the answers must be based on.
        questions: Numbered question text, e.g. the first element returned
            by ``generate_questions``.

    Returns:
        An ``(answers, response)`` tuple. ``answers`` is the completion text
        with the leading ``'1.'`` restored, because the request prompt
        already ends with ``1.``. ``response`` is the raw API response.
    """
    wait_time = 3.5
    response = openai.Completion.create(
        model='text-davinci-003',
        # Build the prompt with the shared helper so the text sent here can
        # never drift from what generate_prompt produces.
        prompt=generate_prompt(context, questions),
        presence_penalty=0.5,
        frequency_penalty=0.5,
        temperature=1,
        top_p=1,
        max_tokens=750,
    )
    # Fixed pause after every successful request (presumably to space out
    # calls and stay under the API rate limit).
    time.sleep(wait_time)
    answers = '1.' + response['choices'][0]['text']
    return answers, response

def generate_prompt(context, questions):
    """Build the answer-generation prompt for an article and its questions.

    Args:
        context: Article text.
        questions: Numbered question text.

    Returns:
        The prompt string. It ends with ``'1.'`` so that a completion starts
        directly with the first answer.
    """
    return (
        'Write answers to the questions based on the article below\n'
        f'Questions:\n{questions}\n\n'
        f'Article: {context}\n\n'
        'Answers:\n1.'
    )


In [157]:
def get_trimmed_articles(df, max_chars=4000):
    """Split every article in ``df`` into header-prefixed text chunks.

    Each article's content is cut at newline boundaries into chunks of at
    least ``max_chars`` characters (a chunk runs to the end of the line that
    crosses the limit). Whatever is left over becomes a final, shorter chunk.
    Every chunk is prefixed with ``'{title}\\n{date_published}\\n'``.

    Args:
        df: DataFrame with 'title', 'date_published', 'timeout' and 'content'
            columns. Rows with a truthy 'timeout', or whose 'content' is not
            a string (e.g. NaN from an empty cell), produce no chunks.
        max_chars: Minimum chunk length in characters before a cut is made.

    Returns:
        DataFrame with a single 'content' column, one row per chunk. Rows are
        ordered by chunk position first (all first chunks, then all second
        chunks, ...). The index is the melted row number, so it has gaps where
        shorter articles had no chunk at that position.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.

    Notes:
        Text with no newline after the limit is kept as one oversized chunk
        rather than being cut mid-line.
    """
    if max_chars < 1:
        raise ValueError('max_chars must be a positive integer')

    def split_content(content):
        """Cut ``content`` into newline-aligned pieces of >= max_chars chars."""
        chunks = []
        remaining = content
        while len(remaining) >= max_chars:
            # First newline that makes the chunk at least max_chars long.
            newline_index = remaining.find('\n', max_chars - 1)
            if newline_index == -1:
                # No further line break: keep the rest whole instead of
                # dropping or duplicating characters.
                chunks.append(remaining)
                remaining = ''
                break
            chunks.append(remaining[:newline_index + 1])
            remaining = remaining[newline_index + 1:]
        # Keep the leftover text. An empty leftover is only kept when it is
        # the article's sole chunk, so short articles still yield one row.
        if remaining or not chunks:
            chunks.append(remaining)
        return chunks

    def split_article(title, date, timeout, content):
        """Return the header-prefixed chunks for one article (maybe empty)."""
        if timeout or not isinstance(content, str):
            return []
        header = f'{title}\n{date}\n'
        return [header + chunk for chunk in split_content(content)]

    # Read from the ``df`` argument, never from a notebook-level global.
    prompts_per_article = [
        split_article(title, date, timeout, content)
        for title, date, timeout, content in zip(
            df['title'], df['date_published'], df['timeout'], df['content']
        )
    ]
    if not any(prompts_per_article):
        return pd.DataFrame({'content': pd.Series([], dtype=object)})

    # One row per article, one column per chunk position; short rows are
    # padded with None, which dropna() removes after melting.
    wide = pd.DataFrame(prompts_per_article)
    return wide.melt(value_name='content').drop('variable', axis=1).dropna()

In [158]:
# Split every loaded article into header-prefixed chunks, one chunk per row.
resp = get_trimmed_articles(articles)

In [159]:
# Display the chunked articles (single 'content' column).
resp

Unnamed: 0,content
0,Samsung Galaxy Book 3 Ultra hands-on: NVIDIA R...
1,"US, Netherlands and Japan reportedly agree to ..."
2,"Nvidia CEO says AI will need regulation, socia..."
3,Sweden to upgrade Berzelius supercomputer with...
4,ChatGPT's massive hype has made these 5 artifi...
...,...
544,How Nvidia’s CUDA Monopoly in Machine Learning...
570,GPUs for Deep Learning in 2023 – An In-depth A...
670,GPUs for Deep Learning in 2023 – An In-depth A...
770,GPUs for Deep Learning in 2023 – An In-depth A...


In [195]:
# Use the first chunk as the sample article for the question/answer cells below.
prompt1 = resp['content'].iloc[0]

In [203]:
# Generate questions for the sample chunk; returns the numbered question text
# and the raw API response.
questions, response = generate_questions(prompt1)

In [205]:
# Show the generated question text.
questions

'1. What processor and graphics card does the Galaxy Book 3 Ultra feature? \n2. How light is the Galaxy Book 3 Ultra?\n3. What features does the Samsung Multi Control feature support on PCs and smartphones?'

In [208]:
# Show the raw completion response for the question request.
response

<OpenAIObject text_completion id=cmpl-6k2Cxr1QrjaR7P3wB4y9as1UG07Um at 0x7f33f9546110> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " What processor and graphics card does the Galaxy Book 3 Ultra feature? \n2. How light is the Galaxy Book 3 Ultra?\n3. What features does the Samsung Multi Control feature support on PCs and smartphones?"
    }
  ],
  "created": 1676429595,
  "id": "cmpl-6k2Cxr1QrjaR7P3wB4y9as1UG07Um",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 43,
    "prompt_tokens": 1032,
    "total_tokens": 1075
  }
}

In [206]:
# Answer the generated questions, using the same sample chunk as context.
answers, response_a = generate_answers(prompt1, questions)

In [207]:
# Show the generated answer text.
answers

'1. The Galaxy Book 3 Ultra features 13th-gen Intel Core i5 or i7 processors, with Iris X graphics and up to 32GB of DDR5 RAM, as well as NVIDIA’s RTX 4050 and 4070 graphics cards.\n2. The Galaxy Book 3 Ultra is 1.79kg (3.9 pounds) in weight and 16.5mm (0.64 inches) thin.\n3. The Samsung Multi Control feature supports recent websites, instant hotspot, a keyboard and trackpad across PCs and smartphones.'

In [209]:
# Show the raw completion response for the answer request.
response_a

<OpenAIObject text_completion id=cmpl-6k2DyTMyc07ijLQN4rzLKFacaZHvS at 0x7f33f6329030> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " The Galaxy Book 3 Ultra features 13th-gen Intel Core i5 or i7 processors, with Iris X graphics and up to 32GB of DDR5 RAM, as well as NVIDIA\u2019s RTX 4050 and 4070 graphics cards.\n2. The Galaxy Book 3 Ultra is 1.79kg (3.9 pounds) in weight and 16.5mm (0.64 inches) thin.\n3. The Samsung Multi Control feature supports recent websites, instant hotspot, a keyboard and trackpad across PCs and smartphones."
    }
  ],
  "created": 1676429658,
  "id": "cmpl-6k2DyTMyc07ijLQN4rzLKFacaZHvS",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 109,
    "prompt_tokens": 1085,
    "total_tokens": 1194
  }
}