In [1]:
import os
import json
import random
import time

# from openai import OpenAI
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
import tiktoken

In [10]:
encoder = tiktoken.get_encoding("cl100k_base")

In [4]:
files = os.listdir("AllSidesArticles/")

In [5]:
with open(f"AllSidesArticles/{files[0]}") as f:
        s = json.load(f)

In [6]:
articles = {}
for path in files:
    if path.endswith(".json"):
        with open(f"AllSidesArticles/{path}") as f:
            dic = json.load(f)
            title = dic.get("newsTitle")
            body = dic.get("fullArticle")
            if not body:
                continue
            else:
                body = "\n".join(body)
            articles[path[:-5]] = f"{title}\n{body}"

In [7]:
len(articles)

2140

In [115]:
# parse articles into proper format for api_request_parallel_processor
with open("articles.jsonl", "w+") as f:
    for key, text in articles.items():
        prompt = "Summarize the following:\n" + text
        if len(encoder.encode(prompt)) > 8192:
            print(key)
            continue
        d = dict(model="gpt-4", messages=[{"role": "user", "content": prompt}])
        json_record = json.dumps(d)
        f.write(json_record + '\n')

5188_left
2049_center
4198_left


=> put these manually into GPT-4 web interface

# Convert unindexed output summaries in jsonl to txt

In [10]:
summaries = []
with open("summaries.jsonl") as f:
    for line in f:
        summaries.append(json.loads(line))

In [21]:
def find_key_by_value(my_dict, search_value):
    for key, value in my_dict.items():
        if value == search_value:
            return key
    return None

In [166]:
for item in summaries:
    name = find_key_by_value(articles, item[0]["messages"][0]["content"][25:])
    with open(f"Summaries/{name}.txt", "w+") as f:
        f.write(item[1]["choices"][0]["message"]["content"])

# Make dataset

In [83]:
from collections import defaultdict

In [64]:
summs.keys() == articles.keys()

True

In [65]:
biases = {"left", "right", "center"}

In [78]:
data = []
for key, summ in summs.items():
    topic_id, summ_bias = key.split('_')
    for bias in biases:
        article = articles.get(f"{topic_id}_{bias}")
        if article:
            data.append({
                  "id": topic_id,
                  "article_bias": bias,
                  "summary_bias": summ_bias,
                  "article": article,
                  "summary": summ
              })

In [79]:
for d in data:
    if d['id'] == '93':
        display(d)
        print()

{'id': '93',
 'article_bias': 'center',
 'summary_bias': 'left',
 'article': 'Lower-Key Ceremonies For This Year\'s Sept. 11 Commemoration\nHide caption Airline pilots Capt. Anthony Chapman (right) and Capt. Paul Evans salute with others as the U.S. flag is lowered to half staff at the 9/11 Flight Crew Memorial in Grapevine, Texas. Flight crews gathered at the memorial near Dallas-Fort Worth airport to remember the flight crews lost in the Sept. 11 attacks. Previous Next LM Otero/AP\nHide caption Judy Parisio (right) and Linda Malbrba make a rubbing of their niece\'s name, Frances Ann Cilente, who worked at the World Trade Center, during ceremonies marking the anniversary of the Sept. 11 attacks. Previous Next Chang W. Lee/Pool/Reuters/Landov\nHide caption President Obama, first lady Michelle and members of the White House staff pause for a moment of silence to mark the anniversary on the South Lawn of the White House. Previous Next Carolyn Kaster/AP\nHide caption New York City police 




{'id': '93',
 'article_bias': 'left',
 'summary_bias': 'left',
 'article': 'Nation marks 9/11 anniversary with somber ceremonies\nAcross the country, Americans marked the 11th anniversary of the Sept. 11, 2001, terrorist attacks, tolling church bells, pausing in silence to reflect and mourning the loss of the nearly 3,000 people who died.\nOn the White House South Lawn, President Obama and first lady Michelle Obama bowed their heads at 8:46 a.m., the moment the first plane struck the North Tower of the World Trade Center. They later laid a wreath at the Pentagon, where the third plane struck. A flag was draped over the building to mark the day.\n“Eleven times we have marked another September 11th come and gone. Eleven times, we have paused in remembrance, in reflection, in unity and in purpose,” Obama said to families and military brass who gathered at the Pentagon, where 184 were killed. “This is never an easy day.”\nAt Arlington National Cemetery, the Obamas visited the graves in Sec




{'id': '93',
 'article_bias': 'right',
 'summary_bias': 'left',
 'article': '11th anniversary ceremony of 9/11 begins in N.Y.\nNEW YORK - Americans paused again Tuesday to mark the 11th anniversary of the Sept. 11, 2001, terror attacks with familiar ceremony, but also a sense that it\'s time to move forward after a decade of remembrance.\nAs in past years, thousands gathered at the World Trade Center site in New York, the Pentagon and Shanksville, Pa., to read the names of nearly 3,000 victims killed in the worst terror attack in U.S. history. President Barack Obama was to attend the Pentagon memorial, and Vice President Joe Biden was to speak in Pennsylvania.\nBut many felt that last year\'s 10th anniversary was an emotional turning point for public mourning of the attacks. For the first time, elected officials weren\'t speaking at the ceremony, which often allowed them a solemn turn in the spotlight, but raised questions about the public and private Sept. 11. Fewer families attended 




{'id': '93',
 'article_bias': 'center',
 'summary_bias': 'center',
 'article': 'Lower-Key Ceremonies For This Year\'s Sept. 11 Commemoration\nHide caption Airline pilots Capt. Anthony Chapman (right) and Capt. Paul Evans salute with others as the U.S. flag is lowered to half staff at the 9/11 Flight Crew Memorial in Grapevine, Texas. Flight crews gathered at the memorial near Dallas-Fort Worth airport to remember the flight crews lost in the Sept. 11 attacks. Previous Next LM Otero/AP\nHide caption Judy Parisio (right) and Linda Malbrba make a rubbing of their niece\'s name, Frances Ann Cilente, who worked at the World Trade Center, during ceremonies marking the anniversary of the Sept. 11 attacks. Previous Next Chang W. Lee/Pool/Reuters/Landov\nHide caption President Obama, first lady Michelle and members of the White House staff pause for a moment of silence to mark the anniversary on the South Lawn of the White House. Previous Next Carolyn Kaster/AP\nHide caption New York City polic




{'id': '93',
 'article_bias': 'left',
 'summary_bias': 'center',
 'article': 'Nation marks 9/11 anniversary with somber ceremonies\nAcross the country, Americans marked the 11th anniversary of the Sept. 11, 2001, terrorist attacks, tolling church bells, pausing in silence to reflect and mourning the loss of the nearly 3,000 people who died.\nOn the White House South Lawn, President Obama and first lady Michelle Obama bowed their heads at 8:46 a.m., the moment the first plane struck the North Tower of the World Trade Center. They later laid a wreath at the Pentagon, where the third plane struck. A flag was draped over the building to mark the day.\n“Eleven times we have marked another September 11th come and gone. Eleven times, we have paused in remembrance, in reflection, in unity and in purpose,” Obama said to families and military brass who gathered at the Pentagon, where 184 were killed. “This is never an easy day.”\nAt Arlington National Cemetery, the Obamas visited the graves in S




{'id': '93',
 'article_bias': 'right',
 'summary_bias': 'center',
 'article': '11th anniversary ceremony of 9/11 begins in N.Y.\nNEW YORK - Americans paused again Tuesday to mark the 11th anniversary of the Sept. 11, 2001, terror attacks with familiar ceremony, but also a sense that it\'s time to move forward after a decade of remembrance.\nAs in past years, thousands gathered at the World Trade Center site in New York, the Pentagon and Shanksville, Pa., to read the names of nearly 3,000 victims killed in the worst terror attack in U.S. history. President Barack Obama was to attend the Pentagon memorial, and Vice President Joe Biden was to speak in Pennsylvania.\nBut many felt that last year\'s 10th anniversary was an emotional turning point for public mourning of the attacks. For the first time, elected officials weren\'t speaking at the ceremony, which often allowed them a solemn turn in the spotlight, but raised questions about the public and private Sept. 11. Fewer families attende




{'id': '93',
 'article_bias': 'center',
 'summary_bias': 'right',
 'article': 'Lower-Key Ceremonies For This Year\'s Sept. 11 Commemoration\nHide caption Airline pilots Capt. Anthony Chapman (right) and Capt. Paul Evans salute with others as the U.S. flag is lowered to half staff at the 9/11 Flight Crew Memorial in Grapevine, Texas. Flight crews gathered at the memorial near Dallas-Fort Worth airport to remember the flight crews lost in the Sept. 11 attacks. Previous Next LM Otero/AP\nHide caption Judy Parisio (right) and Linda Malbrba make a rubbing of their niece\'s name, Frances Ann Cilente, who worked at the World Trade Center, during ceremonies marking the anniversary of the Sept. 11 attacks. Previous Next Chang W. Lee/Pool/Reuters/Landov\nHide caption President Obama, first lady Michelle and members of the White House staff pause for a moment of silence to mark the anniversary on the South Lawn of the White House. Previous Next Carolyn Kaster/AP\nHide caption New York City police




{'id': '93',
 'article_bias': 'left',
 'summary_bias': 'right',
 'article': 'Nation marks 9/11 anniversary with somber ceremonies\nAcross the country, Americans marked the 11th anniversary of the Sept. 11, 2001, terrorist attacks, tolling church bells, pausing in silence to reflect and mourning the loss of the nearly 3,000 people who died.\nOn the White House South Lawn, President Obama and first lady Michelle Obama bowed their heads at 8:46 a.m., the moment the first plane struck the North Tower of the World Trade Center. They later laid a wreath at the Pentagon, where the third plane struck. A flag was draped over the building to mark the day.\n“Eleven times we have marked another September 11th come and gone. Eleven times, we have paused in remembrance, in reflection, in unity and in purpose,” Obama said to families and military brass who gathered at the Pentagon, where 184 were killed. “This is never an easy day.”\nAt Arlington National Cemetery, the Obamas visited the graves in Se




{'id': '93',
 'article_bias': 'right',
 'summary_bias': 'right',
 'article': '11th anniversary ceremony of 9/11 begins in N.Y.\nNEW YORK - Americans paused again Tuesday to mark the 11th anniversary of the Sept. 11, 2001, terror attacks with familiar ceremony, but also a sense that it\'s time to move forward after a decade of remembrance.\nAs in past years, thousands gathered at the World Trade Center site in New York, the Pentagon and Shanksville, Pa., to read the names of nearly 3,000 victims killed in the worst terror attack in U.S. history. President Barack Obama was to attend the Pentagon memorial, and Vice President Joe Biden was to speak in Pennsylvania.\nBut many felt that last year\'s 10th anniversary was an emotional turning point for public mourning of the attacks. For the first time, elected officials weren\'t speaking at the ceremony, which often allowed them a solemn turn in the spotlight, but raised questions about the public and private Sept. 11. Fewer families attended




In [80]:
len(data)

5808

In [81]:
with open("data_final.json", "w+") as f:
    json.dump(data, f)

## train/val/test split based on topic id

In [82]:
import json

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
with open("data_final.json") as f:
    data = json.load(f)

In [86]:
# split based on topic ids
topic_ids = set()
for key, summ in summs.items():
    topic_id, summ_bias = key.split('_')
    topic_ids.add(topic_id)

In [87]:
len(topic_ids)

846

In [112]:
tids_train, tids_test = train_test_split(topic_ids, test_size=0.1, random_state=42)

In [113]:
tids_train, tids_val = train_test_split(tids_train, test_size=0.1, random_state=42)

In [114]:
len(tids_train), len(tids_val), len(tids_test)

(684, 77, 85)

In [116]:
train, val, test = [], [], []
for d in data:
    if d['id'] in tids_train:
        train.append(d)
    elif d['id'] in tids_val:
        val.append(d)
    else:
        test.append(d)

In [117]:
len(train), len(val), len(test)

(4664, 542, 602)

In [119]:
with open("train.json", "w+") as f:
    json.dump(train, f)

In [120]:
with open("val.json", "w+") as f:
    json.dump(val, f)

In [121]:
with open("test.json", "w+") as f:
    json.dump(test, f)

## make hf dataset

In [4]:
from datasets import load_dataset, DatasetDict, load_from_disk

In [123]:
dataset = load_dataset("json", data_files={
    "train": "train.json", "validation": "val.json", "test": "test.json"
})

Downloading data files: 100%|████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 17142.93it/s]
Extracting data files: 100%|███████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 706.71it/s]
Generating train split: 4664 examples [00:00, 15201.35 examples/s]
Generating validation split: 542 examples [00:00, 24784.81 examples/s]
Generating test split: 602 examples [00:00, 26205.97 examples/s]


In [124]:
dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 4664
    })
    validation: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 542
    })
    test: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 602
    })
})

In [125]:
dataset.save_to_disk("hf_dataset")

Saving the dataset (1/1 shards): 100%|██████████████████████████████████| 4664/4664 [00:00<00:00, 56738.31 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 542/542 [00:00<00:00, 45971.02 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 602/602 [00:00<00:00, 63271.38 examples/s]


# Make gpt-4/3.5 test prompts

In [5]:
dataset = load_from_disk("hf_dataset")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 4664
    })
    validation: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 542
    })
    test: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 602
    })
})

In [11]:
with open("gpt-3.5-test-prompts.jsonl", "w+") as f:
    for item in dataset["test"]:
        prompt = f"Summarize from the perspective of the political {item['summary_bias']}:\n{item['article']}"
        if len(encoder.encode(prompt)) > 16385: # no articles exceed this
            print(f"{item['id']}_{item['article_bias']}")
            continue
        d = dict(model="gpt-3.5-turbo-1106", messages=[{"role": "user", "content": prompt}], metadata=item)
        json_record = json.dumps(d)
        f.write(json_record + '\n')

In [12]:
with open("gpt-4-test-prompts.jsonl", "w+") as f: # 128k context, none are too big
    for item in dataset["test"]:
        prompt = f"Summarize from the perspective of the political {item['summary_bias']}:\n{item['article']}"
        # if len(encoder.encode(prompt)) > 16385:
        #     print(f"{item['id']}_{item['article_bias']}")
        #     continue
        d = dict(model="gpt-4-1106-preview", messages=[{"role": "user", "content": prompt}], metadata=item)
        json_record = json.dumps(d)
        f.write(json_record + '\n')