# NYT Predictions Exploration

In this notebook, we create a bar graph depicting the average predicted normative rating of each category. We also retrieve captions for our other plots here.

Below is a code chunk that allows you to pop out all graphs and visuals produced in this notebook. It will make viewing some graphs in this notebook significantly easier, as otherwise the text for these graphs may not fit nicely.

Below that is another code chunk that reverts this.

In [17]:
# Optional: run to make the plots pop out of the .ipynb file into separate
# windows (selects matplotlib's Qt backend), which gives wide plots with long
# tick labels room to display properly.
%matplotlib qt

In [None]:
# Optional: run to revert the cell above, so the plots stay embedded in the
# .ipynb output again (if using vscode).
%matplotlib inline 

## Imports and Setting-up

In [5]:
# Imports
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import math
import ruptures as rp
import scipy.stats as sp

In [6]:
# Load necessary data.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or otherwise trust.
# Forward slashes keep the relative paths portable and avoid the invalid
# escape sequences ("\J", "\i") that the backslash-based literals contained.
with open("./JointPredictions.pickle", 'rb') as handle:
    predictions = pickle.load(handle)
with open("./imagedetails.pickle", 'rb') as handle:
    # Columns as used in this notebook: 0 = YYYY-MM, 1 = article ID,
    # 2 = image ID, 3 = link.
    details = pickle.load(handle)
with open("./token_mask.pickle", 'rb') as handle:
    mask = pickle.load(handle)  # row selector for `details`
details = details[mask]
# NOTE(review): `predictions` is not filtered by `mask`, so it is presumably
# stored for the masked rows only -- confirm against the code that wrote it.

# Keep the first occurrence of every distinct (image ID, link) pair. The same
# row indices are applied to both arrays so they stay aligned row-for-row.
unique_mask = np.unique(details[:,2:], return_index=True, axis=0)[1]
details = details[unique_mask]
predictions = predictions[unique_mask]

In [7]:
# Path of the JSON file that holds the captions. Forward slashes avoid the
# invalid "\D" / "\c" escape sequences of a backslash-based literal and work
# on every platform.
JSONFOLDER = "./Data/captioning_dataset.json"
with open(JSONFOLDER) as f:
    caption_data = json.load(f)


def retrieve_caption(article_id, image_id):
    """Return the stored entry for one image of one article.

    Looks up ``caption_data[article_id]['images'][image_id]``. A missing
    article or image ID propagates the underlying lookup error unchanged.
    """
    return caption_data[article_id]['images'][image_id]

## Ebola Plot (and Changepoint Detection)

In [None]:
with open(".\JointPredictions.pickle", 'rb') as handle:
    predictions_j = pickle.load(handle)
with open(".\imagedetails.pickle", 'rb') as handle:
    details = pickle.load(handle)
with open(".\\token_mask.pickle", 'rb') as handle:
    token_mask = pickle.load(handle)
details = details[token_mask]
# Two links aren't associated with dates. Get rid of them.
# The links are:
# https://www.nytimes.com/interactive/2015/health/stillbirth-reader-stories.html
# https://www.nytimes.com/interactive/2015/world/nobel-peace-prize-timeline.html
mask = (details[:,0]!='2015-00')
details = details[mask]
predictions_j = predictions_j[mask]

months = []
all_months = np.unique(details[:,0])
for y in ["2013","2014", "2015"]:
    mask = np.flatnonzero(np.core.defchararray.find(all_months, y)!=-1)
    months.extend(all_months[mask])
months = np.array(months)

unique_mask = np.unique(details[:,2:], return_index=True, axis=0)[1]
details = details[unique_mask]
predictions_j = predictions_j[unique_mask]

In [None]:
def get_category_subset(predictions, details, category):
    #Returns the subset of predictions and details that match the category
    # Code below taken from:
    # https://stackoverflow.com/questions/38974168/finding-entries-containing-a-substring-in-a-numpy-array
    mask = np.flatnonzero(np.core.defchararray.find(details[:,3], category)!=-1)
    category_details = details[mask]
    category_predictions = predictions[mask]
    return category_predictions, category_details

def get_timecourse(predictions, details, months=months):
    means = []
    error = []
    for m in months:
        month_mask = (details[:,0]==m)
        means.append(np.mean(predictions[month_mask]))
        error.append(sp.sem(predictions[month_mask]))
    return np.array(means), np.array(error)

In [None]:
health_auth_p, health_auth_d= get_category_subset(predictions_j[:,3], details, "/health/")
health_auth_m, health_auth_e=get_timecourse(health_auth_p,health_auth_d)
sports_auth_p, sports_auth_d= get_category_subset(predictions_j[:,3], details, "/sports/")
sports_auth_m, sports_auth_e=get_timecourse(sports_auth_p,sports_auth_d)

In [None]:
# Run PELT changepoint detection (RBF cost) on the monthly health means.
detectors = rp.Pelt(model="rbf")
signal = np.array(health_auth_m)
detectors.fit(signal)

# Discard the boundaries of the series (index 0 and the series length), if
# reported, so only interior changepoints remain.
changepoints = detectors.predict(pen=2)
for boundary in (0, len(months)):
    if boundary in changepoints:
        changepoints.remove(boundary)

In [None]:
# Title, axis labels and one tick per month
plt.title('Impact of Ebola Outbreak on NYT News')
plt.ylabel("Mean Relevance to Authority")
plt.xlabel("Time (in months)")
n_months = len(months)
month_idx = range(n_months)
plt.xticks(month_idx, months[month_idx], rotation ='vertical', fontsize=8)

# Shade every other 12-month span so consecutive years are easy to tell apart
num_years = math.ceil(n_months/12)
for year in range(0, num_years, 2):
    span_start = 12*year - 0.5
    plt.axvspan(span_start, span_start + 12, color='xkcd:sky blue', alpha=0.2)
# Write the year (first four characters of the span's first month) in each span
for year in range(num_years):
    plt.text(12*year+5, 2.8, months[12*year][:4], color="tab:gray", size="large")
# Green reference line at month index 19
# NOTE(review): presumably marks the outbreak on the time axis -- confirm.
plt.axvline(x = 12+8-1, color = 'green', alpha=0.5)

# One line per category with a shaded band of +/- one standard error
for means, errors, colour, name in (
    (health_auth_m, health_auth_e, "tab:purple", "health"),
    (sports_auth_m, sports_auth_e, "tab:blue", "sports"),
):
    centre = np.array(means)
    spread = np.array(errors)
    plt.plot(month_idx, means, color=colour, label=name)
    plt.fill_between(month_idx, centre - spread, centre + spread,
                     color=colour, alpha=0.25)

# Mark the detected changepoints on the health curve
for point in changepoints:
    plt.plot(point, health_auth_m[point], 'ro')

plt.legend(loc='lower right')
plt.show()

## Bar Graph of each category

In [None]:
plt.rcParams.update({'font.size': 12})

# The category masks are built from `details` but index into `predictions`, so
# the two arrays must describe the same rows. Fail loudly instead of silently
# plotting misaligned rows if one of them has been rebound since loading.
assert len(details) == len(predictions), (
    "details and predictions are out of sync; re-run the data-loading cell "
    "before plotting"
)


def _category_rows(url_fragment):
    """Return the indices of the rows whose link (details[:,3]) contains url_fragment."""
    return np.flatnonzero(np.char.find(details[:,3], url_fragment)!=-1)


mask_1 = _category_rows("/sports/")
mask_2 = _category_rows("/business/")
mask_3 = _category_rows("/technology/")
mask_4 = _category_rows("/science/")
mask_5 = _category_rows("/health/")
masks=[mask_1,mask_2,mask_3,mask_4,mask_5]

# Prediction columns shown in the chart (Authority .. Purity)
cut = [3,4,5,6,7]

var_names = np.array(["Valence", "Arousal",
            "Moral", "Authority",
            "Fairness", "Care",
            "Ingroup", "Purity"])[cut]

X_axis = np.arange(len(var_names))

w=0.15  # bar width; the five bars of a group sit at offsets -2w .. +2w

legend_names = ["Sports", "Business", "Technology", "Science", "Health"]
for position, (legend_name, rows) in enumerate(zip(legend_names, masks)):
    vals = predictions[rows][:,cut]
    centres = X_axis + (position - 2)*w
    means = np.average(vals, axis=0)
    plt.bar(centres, means, w, label = '{} (n={})'.format(legend_name, len(vals)))
    # Error bars show the standard error of the mean
    plt.errorbar(centres, means, sp.sem(vals, axis=0), fmt="none", color="black")

plt.xticks(X_axis, var_names)
plt.xlabel("Moral Foundation")
plt.ylabel("Mean Inferred Relevance of Images to Moral Foundation")
plt.ylim(1, 5)
plt.title("Average Relevance of NYT Images to the different Moral Foundations")
plt.legend()
plt.show()

# Mean and standard error of the overall Moral rating (column 2) per category
cats=["spo","bus","tec","sci","hea"]
for i in range(len(masks)):
    print(cats[i])
    vals = predictions[masks[i]][:,2]
    print(np.mean(vals), sp.sem(vals, axis=0))


In [64]:
# Health--Moral: links and captions of the health images with the highest
# predicted Moral rating (column 2), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/health/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,2])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2017/03/27/health/medicaid-obamacare.html
Tracie Scott with her newborn daughter, Izabella, at a hospital in Ohio. Ms. Scott, who has multiple sclerosis, is covered by Medicaid, along with her four children.
https://www.nytimes.com/2012/09/20/health/transplant-experts-blame-allocation-system-for-discarding-kidneys.html

A moment of silence for a donor at Fairview Southdale Hospital in Edina, Minn.


https://www.nytimes.com/2015/02/26/health/researchers-call-for-more-study-of-anesthesia-risks-to-young-children.html
Dr. Randall Flick, pediatric anesthesiologist at the Mayo Clinic Children's Center, gives anesthesia to an infant for a minor surgical procedure.
https://www.nytimes.com/2018/01/15/health/baby-spina-bifida-surgery.html
After undergoing experimental surgery for spina bifida in September while still in the womb, a baby boy was born in Texas Children’s Hospital in Houston on Friday.
https://www.nytimes.com/2012/12/23/health/new-drugs-aim-to-make-cells-des

In [41]:
# Sports--Fairness: links and captions of the sports images with the highest
# predicted Fairness rating (column 4), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/sports/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,4])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2013/11/27/sports/hockey/nfl-concussion-case-offers-clues-for-hockey-lawsuit.html

Curt Bennett (19) of St. Louis in a brawl in Boston in 1972. Bennett is among the former N.H.L. players who have sued the league.


https://www.nytimes.com/2010/06/20/sports/soccer/20usteam.html

United States players argued with the referee Koman Coulibaly after he disallowed a goal Friday.


https://www.nytimes.com/2011/10/11/sports/john-carlos-of-68-olympics-protest-maintains-his-passion.html

John Carlos, right, and Tommie Smith protested treatment of blacks in America after the 200-meter final at the 1968 Mexico City Olympics.


https://www.nytimes.com/2015/12/01/sports/international/coin-toss-is-secure-in-sports-lore-if-not-in-cricket.html
A coin toss before an N.F.L. game between the Saints and the Giants in November.
https://www.nytimes.com/2015/05/01/sports/a-ringside-seat.html

Boxer Joe Frazier being directed to the ropes by referee Arthur Marcante after knocking down M

In [42]:
# Sports--Ingroup: links and captions of the sports images with the highest
# predicted Ingroup rating (column 6), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/sports/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,6])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2013/04/19/sports/near-boston-marathons-finish-line-no-talk-of-defeat.html

Tamera Clifton, who finished the race before the bombings, hugged her daughter Aislin near the Boylston Street memorial.


https://www.nytimes.com/2018/05/07/sports/football/eric-reid-nfl-national-anthem.html
Eric Reid, right, kneeling with his former 49ers teammate Colin Kaepernick during the national anthem in 2017.
https://www.nytimes.com/2015/09/27/sports/letters-to-the-editor.html
A young fan placed a baseball at the foot of a Yogi Berra statue at Berra’s museum in Little Falls, N.J., on Wednesday.
https://www.nytimes.com/2011/09/23/sports/golf/2011-solheim-cup-otoole-and-inkster-share-more-than-roles-on-us-team.html

The U.S. captain Rosie Jones, left, at the Solheim ceremony Thursday.


https://www.nytimes.com/2017/05/14/sports/baseball/derek-jeter-new-york-yankees-monument-park.html
On Sept. 11, 2016, pitcher Dellin Betances and Manager Joe Girardi carried a wreath to the Septemb

In [63]:
# Health--Purity: links and captions of the health images with the highest
# predicted Purity rating (column 7), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/health/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,7])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2013/10/22/health/in-syria-doctors-risk-life-and-juggle-ethics.html

A Syrian man mourned over a victim of a poison gas attack in Douma, a city on the outskirts of Damascus, in August.


https://www.nytimes.com/2017/06/19/health/guns-children-cdc-us-firearms.html
Parents of children killed by guns held a protest in Atlanta near the National Rifle Association’s annual convention in April.
https://www.nytimes.com/2018/06/04/health/nipah-virus-india-vaccine-epidemic.html
Burying a victim of the Nipah virus in Kozhikode, southern India. There is no vaccine and no cure for the disease.
https://www.nytimes.com/2012/05/08/health/research/lenins-death-remains-a-mystery-for-doctors.html

The Soviet leader Vladimir Ilyich Lenin on his death bed, in an undated photo.


https://www.nytimes.com/2013/07/22/health/pakistan-fights-for-ground-in-war-on-polio.html

A child received a polio vaccine in a volatile neighborhood on the outskirts of Karachi, Pakistan.


https://www.nyt

In [62]:
# Health -- Harm: links and captions of the health images with the highest
# predicted Harm/Care rating (column 5), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/health/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,5])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2017/05/01/health/zika-twins-transmission-theories.html

Ms. Ribeiro giving João Lucas a massage. The boy sometimes became so agitated, he would scratch himself in the face.


https://www.nytimes.com/2015/05/08/health/weeks-after-his-recovery-ebola-lurked-in-a-doctors-eye.html
Dr. Crozier with children at an Ebola treatment unit in Sierra Leone in September 2014.
https://www.nytimes.com/2016/11/28/health/obamacare-gunshots-medicaid.html
Mr. Berry with his son, who’s learning to box. Mr. Berry still needs a cane, but has been getting around better with more treatment.
https://www.nytimes.com/2015/09/29/health/children-with-hiv-more-likely-to-die-of-malaria.html
A mother and her child who has cerebral malaria and is in a coma at Queen Elizabeth Central Hospital in Malawi.
https://www.nytimes.com/2014/07/03/health/Vaccine-Costs-Soaring-Paying-Till-It-Hurts.html

PAINFUL MEDICINE Rachel Chavez, left, and Beth Barnhart administer vaccines to Caius Sims as his mother,

In [48]:
# Sports--Arousal: links and captions of the sports images with the highest
# predicted Arousal rating (column 1), best first.
category_mask = np.flatnonzero(np.char.find(details[:,3], "/sports/")!=-1)
# Slice the category once instead of re-indexing `details` on every access.
category_details = details[category_mask]
ordered = np.argsort(predictions[category_mask][:,1])
# Reverse the ascending order and keep at most 20 rows; slicing also copes
# with a category that has fewer than 20 images.
for row in category_details[ordered[::-1][:20]]:
    article_id, image_id, link = row[1], row[2], row[3]
    print(link)
    print(retrieve_caption(article_id, image_id))

https://www.nytimes.com/2012/02/28/sports/autoracing/daytona-500-delayed-by-fiery-collision.html

Workers trying to extinguish a fire from a racetrack jet dryer after it was hit by a car during the Daytona 500.


https://www.nytimes.com/2012/02/28/sports/daytona-500.html

Workers trying to extinguish a fire from a racetrack jet dryer after it was hit by a car during the Daytona 500.


https://www.nytimes.com/2017/05/22/sports/hockey/from-hockey-101-in-nashville-to-a-phd-with-predators.html

Predators fans taking turns smashing an Anaheim Ducks-themed car outside Bridgestone Arena in Nashville. The Predators will take on the Ducks in Game 6 of the Western Conference finals on Monday night.


https://www.nytimes.com/2015/06/12/sports/autoracing/on-auto-racings-deadliest-day.html
Spectators fleeing the flames after Pierre Levegh’s Mercedes crashed during the Le Mans race on June 11, 1955, killing 83 people and Levegh.
https://www.nytimes.com/2015/11/03/sports/soccer/russian-soccer-denies-

## Statistical Testing of Categories

In [None]:
# Seed the global NumPy generator so the bootstrap below is reproducible.
np.random.seed(15654)
col_names = ["valence_mean", "arousal_mean",
             "moral_mean", "authority_mean",
             "fairness_mean", "harm_mean",
             "ingroup_mean", "purity_mean"]

N_RESAMPLES = 10000          # bootstrap replicates per variable
RESAMPLE_SIZE = 100          # images drawn (with replacement) per replicate
BOOTSTRAP_COLUMNS = range(2,8)  # prediction columns 2..7: moral .. purity


def _bootstrap_means(category_rows):
    """Bootstrap the mean rating of one category for every moral variable.

    ``category_rows`` selects the category's rows of ``predictions``. For each
    column in ``BOOTSTRAP_COLUMNS`` the column is resampled ``N_RESAMPLES``
    times (``RESAMPLE_SIZE`` draws with replacement each) and the mean of every
    resample is recorded.

    Returns an array of shape ``(len(BOOTSTRAP_COLUMNS), N_RESAMPLES)``; row
    ``k`` holds the bootstrap means of column ``BOOTSTRAP_COLUMNS[k]``.
    """
    category_predictions = predictions[category_rows]
    per_column = []
    for column in BOOTSTRAP_COLUMNS:
        ratings = category_predictions[:,column]
        per_column.append(np.array([
            np.mean(np.random.choice(ratings, RESAMPLE_SIZE, replace = True))
            for _ in range(N_RESAMPLES)
        ]))
    return np.stack(per_column)


# The categories share the seeded global random stream, so they are processed
# in a fixed order (sports, business, technology, science, health).
spo_arr = _bootstrap_means(mask_1)
bus_arr = _bootstrap_means(mask_2)
tec_arr = _bootstrap_means(mask_3)
sci_arr = _bootstrap_means(mask_4)
hea_arr = _bootstrap_means(mask_5)

In [None]:
# Health highest Morality rating
variable_index = 0  # row 0 of the bootstrap arrays = prediction column 2 (moral)
highest_arr = hea_arr
other_arrs = [spo_arr, bus_arr, tec_arr, sci_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))

In [None]:
# Sports highest Authority rating
variable_index = 1  # row 1 of the bootstrap arrays = prediction column 3 (authority)
highest_arr = spo_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [bus_arr, tec_arr, sci_arr, hea_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))

In [None]:
# Sports highest Fairness rating
variable_index = 2  # row 2 of the bootstrap arrays = prediction column 4 (fairness)
highest_arr = spo_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [bus_arr, tec_arr, sci_arr, hea_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))

In [None]:
# Health highest Care rating
variable_index = 3  # row 3 of the bootstrap arrays = prediction column 5 (harm/care)
highest_arr = hea_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [spo_arr, bus_arr, tec_arr, sci_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))

In [None]:
# Sports highest Ingroup rating
variable_index = 4  # row 4 of the bootstrap arrays = prediction column 6 (ingroup)
highest_arr = spo_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [bus_arr, tec_arr, sci_arr, hea_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))

In [None]:
# Health highest Purity rating
variable_index = 5  # row 5 of the bootstrap arrays = prediction column 7 (purity)
highest_arr = hea_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [spo_arr, bus_arr, tec_arr, sci_arr]

# Per bootstrap replicate: is the candidate's mean strictly greater than the
# mean of every other category?
cum_arr = np.logical_and.reduce(
    [highest_arr[variable_index,:] > arr[variable_index,:] for arr in other_arrs]
)
# Share of replicates in which the candidate is NOT the strict maximum. The
# mean divides by the actual number of replicates instead of a literal 10000.
print(1-np.mean(cum_arr))