In [1]:
!pip install numpy pandas python-dotenv openai tenacity tqdm --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
import dotenv
import openai
import asyncio
import os
import json

dotenv.load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# load csv file, ignore decoding errors
with open('News.csv', "r", encoding='utf-8', errors='ignore') as f:
    df = pd.read_csv(f)

In [4]:
df = df[['title', 'subtitle', 'text']]

In [5]:
len(df), df.head()

(3824,
                                                title subtitle  \
 0  Betsy DeVos Confirmed as Education Secretary, ...      NaN   
 1  Melania Trump Says White House Could Mean Mill...      NaN   
 2  As Trump Fears Fraud, GOP Eliminates Election ...      NaN   
 3  Appeals Court to Decide on Challenge to Trump'...      NaN   
 4  At Least 4 Tornadoes Reported in Southeast Lou...      NaN   
 
                                                 text  
 0  Michigan billionaire education activist Betsy ...  
 1  First lady Melania Trump has said little about...  
 2  A House committee voted on Tuesday to eliminat...  
 3  This afternoon, three federal judges from the ...  
 4  At least four tornadoes touched down in Louisi...  )

In [7]:
export_dataset_len = 100
max_wait_time = 300

## Export Human Dataset

In [None]:
human_dataset = []

for index, row in df.iterrows():
    title = row['title']
    subtitle = row['subtitle']
    text = row['text']

    if not pd.isnull(subtitle):
        article = f'{title}\n{subtitle}\n{text}'
    else:
        article = f'{title}\n{text}'

    human_dataset.append({"article": article})

In [None]:
with open('human_dataset.json', 'w') as f:
    json.dump(human_dataset, f)

## AI Generation Dataset

In [None]:
generation_prompts = []

generation_prompt_template_subtitle = """
Write a news article about the following topic:
Title: {title}
Subtitle: {subtitle}
Only output the written news article.
"""

generation_prompt_template_no_subtitle = """
Write a news article about the following topic:
Title: {title}
Only output the written news article.
"""

for _, row in df.iterrows():
    title = row['title']
    subtitle = row['subtitle']

    if subtitle != np.nan:
        generation_prompt = generation_prompt_template_subtitle.format(title=title, subtitle=subtitle)
    else:
        generation_prompt = generation_prompt_template_no_subtitle.format(title=title)

    generation_prompts.append(generation_prompt)

In [11]:
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

def after_retry_callback(retry_state):
    print(f"Retry attempt: {retry_state.attempt_number}. Waiting for next attempt...")

async def generate(prompt):
    try:
        # Setting a timeout of 1 minute
        return await asyncio.wait_for(
            openai.ChatCompletion.acreate(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            ),
            timeout=max_wait_time  # Set your desired timeout in seconds here
        )
    except asyncio.TimeoutError:
        # Handle the timeout, e.g., return an error message or log it
        return {'error': 'timeout', 'prompt': prompt}

def get_text(output):
    return output['choices'][0]['message']['content']

async def generate_prompts(prompts):
    tasks = [
        generate(prompt) for prompt in prompts
    ]

    results = []
    success_indices = []
    index = 0
    for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Tasks', unit='task', leave=False, ncols=100):
        try:
            result = await future
            if result is None:
                raise Exception('Timeout')
            results.append({
                'article': get_text(result)
            })
            success_indices.append(index)
        except Exception as e:
            print(f"A task failed with error: {e}")  # Handle error (if needed)

    return results, success_indices


In [None]:
%autoawait
from aiohttp import ClientSession
openai.aiosession.set(ClientSession())

In [None]:
generation_dataset, _ = await generate_prompts(generation_prompts[:export_dataset_len])

In [None]:
len(generation_dataset)

## AI Refinement Dataset

In [12]:
refinement_prompts = []

refinement_prompt_template_subtitle = """
For the following news article, refine the article to be more concise and professional. Be entertaining, informative and engaging.
Title: {title}
Subtitle: {subtitle}
Article: {text}
Only output the written news article.
"""

refinement_prompt_template_no_subtitle = """
For the following news article, refine the article to be more concise and professional. Be entertaining, informative and engaging.
Title: {title}
Article: {text}
Only output the written news article.
"""

for _, row in df.iterrows():
    title = row['title']
    subtitle = row['subtitle']
    text = row['text']

    if subtitle != np.nan:
        refinement_prompt = refinement_prompt_template_subtitle.format(title=title, subtitle=subtitle, text=text)
    else:
        refinement_prompt = refinement_prompt_template_no_subtitle.format(title=title, text=text)

    refinement_prompts.append(refinement_prompt)

In [13]:
max_wait_time = 60

In [14]:
from time import sleep

batch_size = 100

refinement_datasets = []
for i in range(0, 200, batch_size):
    print(f'Batch {i // batch_size + 1}')
    refinement_dataset, _ = await generate_prompts(refinement_prompts[i:i + batch_size])
    refinement_datasets.extend(refinement_dataset)

Batch 1


Tasks:   0%|                                                              | 0/100 [00:00<?, ?task/s]

Tasks:   1%|▌                                                     | 1/100 [00:01<01:52,  1.13s/task]

A task failed with error: This model's maximum context length is 4097 tokens. However, your messages resulted in 4206 tokens. Please reduce the length of the messages.


Tasks:  98%|███████████████████████████████████████████████████▉ | 98/100 [00:20<00:03,  1.55s/task]Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f279ecbb9a0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f279df99e40>
                                                                                                    

A task failed with error: 'choices'
A task failed with error: 'choices'
Batch 2


                                                                                                    

In [15]:
len(refinement_datasets)

197

In [16]:
with open('refinement_dataset.json', 'w') as f:
    json.dump(refinement_dataset, f)

## AI Completion Dataset

In [None]:
completion_prompts = []
completion_masks = []

completion_prompt_template_subtitle = """
Finish the following uncomplete news article. Be entertaining, informative and engaging, and make sure the tone is consistent with the previous article.
Title: {title}
Subtitle: {subtitle}
Article: {text}
Only output the written news article.
"""

completion_prompt_template_no_subtitle = """
Finish the following uncomplete news article. Be entertaining, informative and engaging, and make sure the tone is consistent with the previous article.
Title: {title}
Article: {text}
Only output the written news article.
"""

for _, row in df.iterrows():
    title = row['title']
    subtitle = row['subtitle']
    text = row['text']
    if not isinstance(text, str):
        continue

    text = text[:(len(text) // 2)]

    completion_masks.append(len(text))

    if subtitle != np.nan:
        completion_prompt = completion_prompt_template_subtitle.format(title=title, subtitle=subtitle, text=text)
    else:
        completion_prompt = completion_prompt_template_no_subtitle.format(title=title, text=text)

    completion_prompts.append(completion_prompt)

In [51]:
from time import sleep

batch_size = 100
max_wait_time = 60

completion_datasets = []
success_masks = []
for i in range(0, len(refinement_prompts), batch_size):
    print(f'Batch {i // batch_size + 1}')
    completion_dataset, success_indicies = await generate_prompts(completion_prompts[i:i + batch_size])
    refinement_datasets.extend(refinement_dataset)
    success_masks.extend([completion_masks[i] for i in success_indicies])

                                                                                                    

Batch 8


                                                                                                    

Batch 9


                                                                                                    

A task failed with error: 'choices'
Batch 10


                                                                                                    

A task failed with error: 'choices'
Batch 11


                                                                                                    

Batch 12


                                                                                                    

Batch 13


                                                                                                    

CancelledError: 

In [52]:
completion_dataset = [
    {
        'article': data['article'],
        'mask': mask
    }
    for data, mask in zip(completion_dataset, completion_masks)
]

with open('completion_dataset.json', 'w') as f:
    json.dump(completion_dataset, f)

In [None]:
print("Generated dataset")

for d in generation_dataset[:10]:
    print(d['article'][:100])

print("=" * 100)

print("Refinement dataset")

for d in refinement_dataset[:10]:
    print(d['article'][:100])

print("=" * 100)

print("Completion dataset")

for d in completion_dataset[:10]:
    print(d['article'][:100])


In [None]:
print(len(generation_dataset), len(refinement_dataset), len(completion_dataset))