# OpenAI GPT-3 article generation
Remember to upload `articles1.csv` and `constants.py`; the latter should define the variable `OPENAI_API_KEY`, like so:
```python
OPENAI_API_KEY = "<your api key>"
```

## Step 1: Handle imports and add .csv file

In [None]:
!pip -qqq install openai wandb language-tool-python

In [None]:
import openai
import pandas as pd
import numpy as np
import datetime
import os
import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
import language_tool_python

from google.colab import drive

OPENAI_API_KEY = "YOUR KEY"
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive unless a "drive" entry already exists relative to the current
# working directory.
# NOTE(review): the existence check uses a relative path while the mount point is
# absolute — presumably the notebook runs from /content; confirm.
if not os.path.exists("drive"): 
    drive.mount("/content/drive") # The size of our dataset requires a drive mount

In [None]:
# Use the configured location when it exists; otherwise ask for the path
# interactively (copy and paste the path to articles.csv at the prompt).
# The fallback can be omitted if you don't use drive.
csv_path = "YOUR DRIVE" if os.path.exists("YOUR DRIVE") else input()

In [None]:
# Load the article dump; the file is parsed as tab-separated.
df = pd.read_csv(csv_path, sep="\t")

# Row count before any cleaning.
print(len(df))

In [None]:
# Number of distinct titles and distinct article bodies.
print(len(df["title"].value_counts()))
print(len(df["content"].value_counts()))

print()

# Articles per category. The original wrapped this in a second print(), which
# emitted a stray "None" after the table.
print(df["category"].value_counts())

## Step 2: Preprocessing the data

In [None]:
df.head()

In [None]:
# Columns to discard; they are removed only when every one of them is present.
drop_columns = ["filename"]
if set(drop_columns).issubset(df.columns):
    df = df.drop(drop_columns, axis=1)

# One row per title, and only rows that actually have body text.
df = df.drop_duplicates(subset="title").dropna(subset=["content"])

# df["year"] = df["year"].astype("int64")
# df["year"] = pd.to_numeric(df["year"], downcast='integer')

len(df)

In [None]:
df["category"].unique()

In [None]:
df["category"].value_counts()

In [None]:
# Stored as a set: tokenize() tests every token for membership, and a set makes
# that an O(1) lookup instead of a scan over the whole NLTK stopword list.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def regex_title(text):
    """Clean a headline and return it in title case.

    Two substitutions are applied before ``str.title()``:

    1. Every run of two or more whitespace characters is deleted outright
       (it is not collapsed to a single space, so the neighbouring words
       are joined).
    2. Every occurrence of `` - `` followed by one or more capitalised
       words (e.g. `` - The New York Times``, `` - Breitbart``) is removed.

    The patterns are raw strings so that ``\\s`` and ``\\-`` reach the regex
    engine without relying on Python's invalid-escape fallback, which emits
    a SyntaxWarning on Python 3.12+.
    """
    text = re.sub(r'(\s+\s+)', '', text)
    text = re.sub(r'\s\-\s(((([A-Z][a-z0-9]+)\s)+[A-Z][a-z0-9]+)|([A-Z][a-z0-9]+))', '', text)
    return text.title()

def clean_content(text):
    """Strip datelines and quoting artifacts from an article body.

    The substitutions run in a fixed order; later steps rely on earlier
    ones (e.g. whitespace is collapsed before the edges are trimmed).
    All patterns are raw strings so the regex escapes are not subject to
    Python's invalid-escape-sequence warning (SyntaxWarning on 3.12+).

    Returns the cleaned text.
    """
    # Datelines: upper-case word(s), optionally "CITY, Region", followed by
    # an em dash, e.g. "WASHINGTON — ", "NEW YORK — ".
    text = re.sub(r'([A-Z]+\s+\—+\s+)|([A-Z]+\s+[A-Z]+\s+\—+\s+)|([A-Z]+,\s[A-Z][a-z]+\s+\—\s+)|([A-Z]+,\s+[A-Z][a-z]+\s[A-Z][a-z]+\s+\—\s+)', '', text)
    # A detached possessive: whitespace followed by "’s".
    text = re.sub(r'(\s’s)', '', text)
    # A "’" followed by one or more whitespace-separated "’" groups.
    text = re.sub(r'’(\s+’+)+\s+', '', text)
    # Collapse runs of two or more whitespace characters to one space.
    text = re.sub(r'(\s+\s+)', ' ', text)
    # Doubled apostrophes become a single one.
    text = re.sub(r'’{2}', '’', text)
    # Every "’t" is removed (so "can’t" becomes "can").
    text = re.sub(r'((’t)+)', '', text)
    # Drop literal backslashes.
    text = re.sub(r"\\", "", text)
    # Trim a single leading whitespace character and trailing whitespace.
    text = re.sub(r'\A\s', '', text)
    text = re.sub(r'\s$', '', text)
    return text


"""
def grammar_score(text, tool):
    error_count = 0
    for sentence in text_to_sentences(text):
        check = tool.check(sentence)
        sentence_errors = len(check)
        error_count += sentence_errors
    
    word_count = re.split('\s+', text)
    word_count = len([tok for tok in word_count if tok not in stop_words])

    error_score = 1 - (float(error_count) / float(word_count))

    return error_score
"""


def regex_content(text):
    """Reduce text to lower-case ASCII letters separated by spaces.

    Each URL-like token (``http`` followed by non-whitespace) is replaced
    by a space, then every character that is not an ASCII letter is
    replaced by a space. Runs of spaces are left as they are.

    The URL pattern is a raw string so ``\\S`` is passed to the regex engine
    without triggering Python's invalid-escape-sequence warning.
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower()

"""
def clean_text(text):
    text = re.sub('(\s\(CNN\)\s)', '', text)
    text = re.sub('(\sREAD:.*)', '', text)
    text = re.sub('(\.\s\.\.)', '...', text)
    text = re.sub('(.\”\s’)', '”', text)
    text = re.sub('(\s’\s)', ' ', text)
    return text
"""

def tokenize(text):
    """Split text on whitespace and drop English stopwords.

    ``str.split()`` is used rather than ``re.split('\\s+', ...)`` because the
    regex form returns empty-string tokens whenever the text starts or ends
    with whitespace; those empties are not stopwords, so they survived the
    filter and were counted as words downstream.

    Returns the list of remaining tokens in their original order.
    """
    return [tok for tok in text.split() if tok not in stop_words]

def lemmatize(tokens):
    """Return the lemma of every token, in input order, as a list."""
    return list(map(lemmatizer.lemmatize, tokens))

def text_to_sentences(text):
    """Split text into fragments at every ``.``, ``?`` or ``!``.

    The terminators themselves are discarded and fragments keep their
    surrounding whitespace; text ending in a terminator yields a trailing
    empty string.

    ``isinstance`` replaces the exact-type comparison: ``np.str_`` is a
    subclass of ``str``, so one check covers both accepted types.
    """
    assert isinstance(text, str)
    return re.split(r"[.?!]", text)

In [None]:
# Run the headline and body cleaners over their respective columns.
df["title"] = df["title"].apply(regex_title)
# df["title"] = df["title"].apply(lambda x: re.sub('(’S)', '’s', x))
df["content"] = df["content"].apply(clean_content)
# df["content"] = df["content"].apply(lambda x: clean_text(x))

In [None]:
df["content"][200]

In [None]:
# Derived columns, each built from the previous one:
# letters-only text -> tokens -> lemmas.
df['regexed'] = df['content'].apply(regex_content)
df['tokens'] = df['regexed'].apply(tokenize)
df["lemmas"] = df["tokens"].apply(lemmatize)
# Size features: number of tokens and number of characters in the title.
df['word_count'] = df['tokens'].apply(len)
df['title_len'] = df['title'].apply(len)

In [None]:
# Keep rows whose word_count is greater than 200 and at most 500.
df = df[(df['word_count'] > 200) & (df['word_count'] <= 500)]

In [None]:
df = df.reset_index(drop=True)
df

In [None]:
df.to_csv("YOUR PATH")

## Step 3: Partition pre-processed data into GPT-3 vs classifier data

In [None]:
# Read the preprocessed data back in and discard the "Unnamed: 0" column.
p = "YOUR PATH"
df = pd.read_csv(p).drop(columns=["Unnamed: 0"])
len(df)

In [None]:
df["category"].value_counts()

In [None]:
"""
def clear_sample(text):
    for sentence in text_to_sentences(text):
        sentence = re.sub('”\s', '', sentence)
        sentence = re.sub('\A\s”', '', sentence)
        sentence = re.sub('\A””\s', '', sentence)
        sentence = re.sub('\A”’', '', sentence)
        sentence = re.sub('\A’\s', '', sentence)
        sentence = re.sub('\A\s', '', sentence)
        sentence = re.sub('\A’', '', sentence)
        sentence = re.sub('\A”', '', sentence)
        sentence = re.sub('\A\t', '', sentence)
        sentence += "."
        sentence = re.sub('\A\.', '', sentence)
    return text
"""


# Working pool: at most 72 randomly sampled articles per category, re-indexed
# 0..N-1 so that the index labels below uniquely identify rows.
df_total = df.groupby('category', group_keys=False).apply(lambda x: x.sample(min(len(x), 72)))
df_total = df_total.reset_index(drop=True)

#df_total["content"] = df_total["content"].apply(lambda x: clear_sample(x))

# Up to 30 articles per category for fine-tuning + training the classifier.
df_gpt3 = df_total.groupby('category', group_keys=False).apply(lambda x: x.sample(min(len(x), 30)))

# Everything not drawn for fine-tuning. The samples keep df_total's index labels,
# so the complement is taken by dropping those labels. The previous
# `df[~df.isin(sample)].dropna(how='all')` approach NaN-masked the sampled rows
# first, which upcast integer columns (e.g. word_count) to float.
df_class = df_total.drop(index=df_gpt3.index)

# Up to 36 per category: testing classifier + testing human + training classifier.
headlines = df_class.groupby('category', group_keys=False).apply(lambda x: x.sample(min(len(x), 36)))

# Holdout data: what remains of df_class after the headlines sample.
last_30 = df_class.drop(index=headlines.index)

df_gpt3 = df_gpt3.reset_index(drop=True)
df_class = df_class.reset_index(drop=True)

# Fine-tuning file: only the headline (prompt) and the article body (completion).
prompt_completion = df_gpt3

prompt_completion = prompt_completion.drop(columns=["category", "regexed", "tokens", "lemmas", "word_count", "title_len"])
prompt_completion = prompt_completion.rename(columns={"title": "prompt", "content": "completion"})

In [None]:
len(df_gpt3)

In [None]:
# Persist the splits. df_gpt3 and prompt_completion are written without the index;
# headlines is written with it, so reading that file back yields an extra
# "Unnamed: 0" column.
df_gpt3.to_csv("YOUR PATH", index=False)
headlines.to_csv("YOUR PATH")
prompt_completion.to_csv("YOUR PATH", index=False)

## Step 4: Prepare fine-tuned OpenAI model using prompt-completion keys

In [None]:
# Use the configured file when it exists; otherwise prompt for the path to the
# new cleaned .csv file.
path = "YOUR PATH" if os.path.exists("YOUR PATH") else input()

In [None]:
df_gpt3 = pd.read_csv(path)
# df_gpt3 = df_gpt3.drop(columns=["Unnamed: 0"])

len(df_gpt3)

In [None]:
openai.api_key = OPENAI_API_KEY

!openai tools fine_tunes.prepare_data -f "YOUR PATH"

## Step 5: Generate articles using GPT-3

In [None]:
openai.api_key = OPENAI_API_KEY
openai.organization = "YOUR ORGANIZATION"

openai.FineTune.list()

In [None]:
# Reload the headline split, renumber the rows, and strip the "Unnamed: 0"
# column together with the derived preprocessing columns.
headlines = (
    pd.read_csv("YOUR PATH")
    .drop(columns=["Unnamed: 0"])
    .reset_index(drop=True)
    .drop(columns=["regexed", "tokens", "lemmas", "word_count", "title_len"])
)
headlines

In [None]:
import time

def call_gpt3(headline, model='YOUR MODEL', *, max_tokens=300, temperature=0.7, frequency_penalty=0.2):
    """Generate one completion for a headline and return its text.

    The prompt sent is the headline followed by ``" ->"``, and generation
    stops at the first newline.
    NOTE(review): the ``" ->"`` suffix presumably mirrors the separator used
    in the fine-tuning data — confirm against the prepared training file.

    Args:
        headline: Headline text used as the prompt.
        model: Name of the model to query. Defaults to the placeholder the
            notebook shipped with, so existing calls behave as before.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        frequency_penalty: Penalty applied to repeated tokens.

    Returns:
        The text of the first returned choice, stripped of surrounding
        whitespace.
    """
    prompt = headline + " ->"
    response_full = openai.Completion.create(
        model=model,
        prompt=prompt,
        stop=["\n"],
        max_tokens=max_tokens,
        n=1,  # a single completion is requested, so only choices[0] is read
        temperature=temperature,
        frequency_penalty=frequency_penalty,
    )
    response = response_full.get('choices')[0].text.strip()
    return response

In [None]:
# Wrap the generated texts in a single-column frame ("gpt3") and save it.
# NOTE(review): `outputs` is not assigned in any cell visible here — presumably a
# list of call_gpt3() results built in a removed cell; confirm before running.
gpt3_outputs = pd.DataFrame(outputs, columns=["gpt3"])
gpt3_outputs.to_csv("YOUR DRIVE", index=False)

In [None]:
outputs

In [None]:
rest = pd.DataFrame(outputs, columns=["gpt3"])
rest

In [None]:
# Show every row when a frame is displayed.
pd.set_option('display.max_rows', None)
# Stack the two batches of generated articles and renumber the rows 0..N-1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to do this.
combined = pd.concat([gpt3_outputs, rest], ignore_index=True)

In [None]:
headlines["gpt3"] = combined["gpt3"]
headlines

In [None]:
headlines.to_csv("YOUR PATH", index=False)