In [12]:
import functools
import os
import time

import openai
import pandas as pd
from dotenv import load_dotenv

In [13]:
# Load variables from a local .env file (if present) into the environment and
# hand the OPENAI_API_KEY value to the openai client. os.getenv returns None
# when the variable is unset, so a missing key is not reported by this cell.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

In [38]:
# wrapper for rate limit exponential delay
def ratelimiterror_exponential_backoff(
    func,
    init_delay=60,
    expo_base=2,
    errors=None,
    max_retries=5
):
    """Decorate ``func`` so that selected errors are retried with a growing delay.

    Args:
        func: The callable to protect. Can be applied as a bare decorator.
        init_delay: Seconds to sleep before the first retry.
        expo_base: Factor the delay is multiplied by after every retry.
        errors: Exception class, or tuple of classes, that triggers a retry.
            ``None`` (the default) means ``openai.error.RateLimitError``; it is
            looked up only when a call actually fails, not when the decorator
            is defined.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        A wrapper with the same name/docstring as ``func`` that forwards all
        positional and keyword arguments and returns ``func``'s result.

    Raises:
        Exception: When ``func`` has failed ``max_retries + 1`` times with a
            retryable error. The last such error is attached as ``__cause__``.
        Any exception not listed in ``errors`` propagates unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Resolve the default lazily so an explicit `errors` never touches openai.
        retry_on = (openai.error.RateLimitError,) if errors is None else errors
        retries = 0
        delay = init_delay
        while True:
            try:
                return func(*args, **kwargs)
            except retry_on as e:
                retries += 1
                if retries > max_retries:
                    # Chain the original error so the root cause stays visible.
                    raise Exception(f'Exceeded maximum number of retries ({max_retries})') from e

                print(f'RateLimitError: Sleeping for {delay} seconds')
                time.sleep(delay)
                delay *= expo_base
                print(f'Next error wait time is {delay} seconds. {max_retries - retries} retries remaining')
    return wrapper

'''
Uses a GPT-3 model to:
- generate questions based on the articles
- generate answers to the generated questions given the articles
- construct a dataset that contains the articles and questions as prompts,
	and the answers as completions

'''

def get_trimmed_articles(df, chunk_size=4000):
	"""Split every article in ``df`` into header-prefixed chunks, one row per chunk.

	Each chunk starts with ``'{title}\\n{date_published}\\n'`` followed by whole
	lines of the article. Lines are added to a chunk until it holds at least
	``chunk_size`` characters, so a chunk may exceed ``chunk_size``; lines are
	never cut in the middle.

	Args:
		df: DataFrame with 'title', 'date_published', 'timeout' and 'content'
			columns. Rows whose 'timeout' value is truthy are skipped.
		chunk_size: Minimum number of article characters per full chunk.

	Returns:
		DataFrame with a single 'content' column. Rows are ordered with every
		article's first chunk first, then every second chunk, and so on; the
		index label of chunk ``j`` of the article at position ``i`` is
		``j * len(df) + i``.

	Raises:
		ValueError: If ``chunk_size`` is not positive.
	"""
	if chunk_size <= 0:
		raise ValueError('chunk_size must be a positive integer')

	def split_article(title, date, timeout, content):
		# NOTE(review): a missing (NaN) timeout is truthy and would skip the
		# article -- confirm the column is always boolean.
		if timeout:
			return None
		header = f'{title}\n{date}\n'
		prompts = []
		divs = len(content) // chunk_size
		for _ in range(divs):
			div = ''
			while len(div) < chunk_size:
				newline_index = content.find('\n')
				if newline_index == -1:
					# No line break left: the rest of the text closes this chunk.
					# (Slicing with -1 would drop a character and re-use the text.)
					div += content
					content = ''
					break
				div += content[:newline_index + 1]
				content = content[newline_index + 1:]
			prompts.append(header + div)
			if not content:
				break
		# Keep the remainder, but never emit a header-only chunk after real ones.
		if content or not prompts:
			prompts.append(header + content)
		return prompts

	n_rows = len(df)
	placed = {}
	columns = zip(df['title'], df['date_published'], df['timeout'], df['content'])
	for row_pos, (title, date, timeout, content) in enumerate(columns):
		chunks = split_article(title, date, timeout, content)
		if chunks is None:
			continue
		for chunk_pos, chunk in enumerate(chunks):
			# Same label a wide-to-long melt of (articles x chunk number) would give.
			placed[chunk_pos * n_rows + row_pos] = chunk

	labels = sorted(placed)
	return pd.DataFrame({'content': [placed[label] for label in labels]}, index=labels)

@ratelimiterror_exponential_backoff
def generate_questions(prompt):
    """Ask text-davinci-003 for three numbered questions about an article chunk.

    The request text ends with '1.' so the model continues the numbered list;
    generation stops at the first blank line. After the request the function
    sleeps 3.5 seconds, then returns '1.' followed by the text of the first
    choice in the response.
    """
    pause_seconds = 3.5
    request_text = (
        'Write three questions based on the article below\n'
        f'Article: {prompt}\n'
        'Questions:\n1.'
    )
    completion = openai.Completion.create(
        model='text-davinci-003',
        prompt=request_text,
        presence_penalty=0.5,
        frequency_penalty=0.5,
        temperature=1,
        top_p=1,
        max_tokens=750,
        stop=['\n\n']
    )
    # Fixed pause after each request, on top of the decorator's backoff.
    time.sleep(pause_seconds)
    first_choice_text = completion['choices'][0]['text']
    return '1.' + first_choice_text

@ratelimiterror_exponential_backoff
def generate_answers(context, questions):
    """Ask text-davinci-003 to answer ``questions`` using the article in ``context``.

    The request text lists the questions, then the article, and ends with
    '1.' so the model continues a numbered answer list. No stop sequence is
    set. After the request the function sleeps 3.5 seconds, then returns '1.'
    followed by the text of the first choice in the response.
    """
    pause_seconds = 3.5
    request_text = (
        'Write answers to the questions based on the article below\n'
        f'Questions:\n{questions}\n\n'
        f'Article: {context}\n\n'
        'Answers:\n1.'
    )
    completion = openai.Completion.create(
        model='text-davinci-003',
        prompt=request_text,
        presence_penalty=0.5,
        frequency_penalty=0.5,
        temperature=1,
        top_p=1,
        max_tokens=750,
    )
    # Fixed pause after each request, on top of the decorator's backoff.
    time.sleep(pause_seconds)
    first_choice_text = completion['choices'][0]['text']
    return '1.' + first_choice_text

def generate_ft_prompt(context, questions):
    """Build the prompt text: questions, blank line, 'Article: ' + context, then a '###' marker line.

    The trailing '\\n\\n###\\n\\n' is presumably the prompt/completion separator
    expected by the fine-tuning format -- confirm against the training setup.
    """
    question_section = f'{questions}\n\n'
    article_section = f'Article: {context}\n\n'
    return question_section + article_section + '###\n\n'
def generate_ft_completion(answer):
    """Build the completion text: a leading space, the answer, then the '==++==' end marker line."""
    end_marker = '\n==++==\n'
    return ' ' + f'{answer}' + end_marker


In [15]:
# Load the source articles. get_trimmed_articles reads the 'title',
# 'date_published', 'timeout' and 'content' columns of this table.
articles = pd.read_excel('sample_fine_tune.xlsx')

In [16]:
# Split each article into header-prefixed chunks; the result has one row per
# chunk in a single 'content' column.
dataset = get_trimmed_articles(articles)

In [17]:
dataset

Unnamed: 0,content
0,Samsung Galaxy Book 3 Ultra hands-on: NVIDIA R...
1,"US, Netherlands and Japan reportedly agree to ..."
2,"Nvidia CEO says AI will need regulation, socia..."
3,Sweden to upgrade Berzelius supercomputer with...
4,ChatGPT's massive hype has made these 5 artifi...
...,...
544,How Nvidia’s CUDA Monopoly in Machine Learning...
570,GPUs for Deep Learning in 2023 – An In-depth A...
670,GPUs for Deep Learning in 2023 – An In-depth A...
770,GPUs for Deep Learning in 2023 – An In-depth A...


In [18]:
# One completion request per chunk. generate_questions sleeps 3.5 s after every
# request (plus any rate-limit backoff), so this cell takes a while to run.
dataset['questions'] = dataset['content'].apply(generate_questions)

In [19]:
dataset['questions']

0      1. What Intel processors are used in the Samsu...
1      1. What domestic chip industry is the Biden Ad...
2      1. What are the legal and social regulations t...
3      1. What is the purpose of the upgraded Berzeli...
4      1. How has OpenAI's investment from Microsoft ...
                             ...                        
544    1. What are the core components of OpenAI's Tr...
570    1. What benefits does the BrainFloat 16 format...
670    1. What kind of solution does power limiting p...
770    1. How does the Tensor Memory Accelerator (TMA...
870    1. What is the RTX 4070 Ti best suited for?\n2...
Name: questions, Length: 156, dtype: object

In [20]:
dataset['questions'].iloc[0]

'1. What Intel processors are used in the Samsung Galaxy Book 3 line? \n2. What features does the Galaxy Book 3 Ultra offer that other laptops in its class do not? \n3. How much does the Samsung Galaxy Book 3 Ultra cost?'

In [21]:
# Second request per chunk: answer the generated questions from the same chunk
# text. generate_answers also sleeps 3.5 s after every request.
dataset['answers'] = dataset.apply(lambda row: generate_answers(row['content'], row['questions']), axis=1)

In [22]:
dataset['answers']

0      1. The Galaxy Book 3 line is powered by 13th-g...
1      1. The Biden Administration is attempting to f...
2      1. Legal regulations and social norms that nee...
3      1. The purpose of the upgraded Berzelius super...
4      1. OpenAI's investment from Microsoft has cont...
                             ...                        
544    1. The core components of OpenAI's Triton are ...
570    1. The BrainFloat 16 format (BF16) provides gr...
670    1. Power limiting provides a solution for effi...
770    1. The Tensor Memory Accelerator (TMA) helps o...
870    1. The RTX 4070 Ti is best suited for 8-bit an...
Name: answers, Length: 156, dtype: object

In [23]:
dataset['answers'].iloc[0]

'1. The Galaxy Book 3 line is powered by 13th-gen Intel Core i5 or i7 processors. \n2. The Galaxy Book 3 Ultra offers NVIDIA RTX 4050 and 4070 graphics cards, 16GB of DDR5 RAM in the i7 model and 32GB in the i9 configuration, a 400 nit 3K Dynamic AMOLED screen running at 120Hz, dual Thunderbolt 4 USB-C sockets, a USB-A slot, a 3.5mm audio jack, PC-smartphone connectivity tools like Recent Websites and Instant Hotspot via Microsoft’s Phone Link, Multi Control support for Galaxy smartphones. \n3. The Samsung Galaxy Book 3 Ultra costs $2,400.'

In [39]:
# Assemble the fine-tuning pair for every chunk:
#   prompt     = questions + 'Article: ' + chunk text + '###' marker
#   completion = ' ' + answers + '==++==' end marker
dataset['prompt'] = dataset.apply(lambda row: generate_ft_prompt(row['content'], row['questions']), axis=1)
dataset['completion'] = dataset['answers'].apply(generate_ft_completion)

In [40]:
dataset['prompt']

0      1. What Intel processors are used in the Samsu...
1      1. What domestic chip industry is the Biden Ad...
2      1. What are the legal and social regulations t...
3      1. What is the purpose of the upgraded Berzeli...
4      1. How has OpenAI's investment from Microsoft ...
                             ...                        
544    1. What are the core components of OpenAI's Tr...
570    1. What benefits does the BrainFloat 16 format...
670    1. What kind of solution does power limiting p...
770    1. How does the Tensor Memory Accelerator (TMA...
870    1. What is the RTX 4070 Ti best suited for?\n2...
Name: prompt, Length: 156, dtype: object

In [41]:
dataset['prompt'].iloc[0]

"1. What Intel processors are used in the Samsung Galaxy Book 3 line? \n2. What features does the Galaxy Book 3 Ultra offer that other laptops in its class do not? \n3. How much does the Samsung Galaxy Book 3 Ultra cost?\n\nArticle: Samsung Galaxy Book 3 Ultra hands-on: NVIDIA RTX 4070 power in a super slim frame\n2023-02-01T18:00:29Z\nSamsung is ready to take its Ultra branding to the final frontier — at least, as far as its mobile products go. After introducing an Ultra variant of its tablets last year, the company is launching a similarly high-specced model of its Galaxy Book laptops in 2023. Alongside the new S23 series of flagship phones, Samsung launched the Galaxy Book 3 Pro, Galaxy Book 3 Pro 360 and Galaxy Book 3 Ultra at its Unpacked event in San Francisco today.\n\nThe Ultra and the Pro 360 are only available in 16 inches, while the clamshell Galaxy Book 3 Pro comes in 14- and 16-inch sizes. I was able to check out a few of them at a recent hands-on event, though Samsung did

In [42]:
dataset['completion']

0       1. The Galaxy Book 3 line is powered by 13th-...
1       1. The Biden Administration is attempting to ...
2       1. Legal regulations and social norms that ne...
3       1. The purpose of the upgraded Berzelius supe...
4       1. OpenAI's investment from Microsoft has con...
                             ...                        
544     1. The core components of OpenAI's Triton are...
570     1. The BrainFloat 16 format (BF16) provides g...
670     1. Power limiting provides a solution for eff...
770     1. The Tensor Memory Accelerator (TMA) helps ...
870     1. The RTX 4070 Ti is best suited for 8-bit a...
Name: completion, Length: 156, dtype: object

In [43]:
dataset['completion'].iloc[0]

' 1. The Galaxy Book 3 line is powered by 13th-gen Intel Core i5 or i7 processors. \n2. The Galaxy Book 3 Ultra offers NVIDIA RTX 4050 and 4070 graphics cards, 16GB of DDR5 RAM in the i7 model and 32GB in the i9 configuration, a 400 nit 3K Dynamic AMOLED screen running at 120Hz, dual Thunderbolt 4 USB-C sockets, a USB-A slot, a 3.5mm audio jack, PC-smartphone connectivity tools like Recent Websites and Instant Hotspot via Microsoft’s Phone Link, Multi Control support for Galaxy smartphones. \n3. The Samsung Galaxy Book 3 Ultra costs $2,400.\n==++==\n'

In [96]:
# Keep a copy of the whole working table (content, questions, answers, prompt,
# completion) so the generated text can be inspected without re-querying the API.
dataset.to_csv('qa_ft_dataset_ver1.csv')

In [127]:
# Export only the prompt/completion columns as JSON Lines: one record per line.
dataset[['prompt', 'completion']].to_json('ft_dataset.jsonl', orient='records', lines=True)