In [47]:
import requests
import pandas as pd
from collections import Counter
from datetime import datetime
import os
from sklearn.cluster import KMeans
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
import plotly.express as px

In [7]:
myAPIKey = "iDALMAL9VFMiwzWionTqK3Ve4tFDUDAQ"
year = 2024
month = 2
URL = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={myAPIKey}"

In [8]:
data = requests.get(URL)
type(data.status_code)

int

In [None]:
articles = data.json()
len(articles)

In [None]:
articles.keys()

In [None]:
len(articles['response']['docs'])

top five “section_name”-s for the articles of Feb 2024

In [None]:
section_names = []
for article in articles['response']['docs']:
    section_names.append(article['section_name'])

In [None]:
df = pd.DataFrame()
df["section_names"] = section_names
df

In [None]:
cnt = Counter()
def countAllHashtags(cell):
    """
    Takes a string or a NaN value. Splits the srings to find hashtags, updates a Counter object
    (a global variable) to keep track of all hashtags.
    """
    if type(cell) == str: # avoid NaN values, which are float
        htList = [el.strip() for el in cell.split(',')] # prepare individual hashtags
        cnt.update(htList) # method update of Counter takes a list and updates all keys
        
_ = df['section_names'].apply(countAllHashtags)
section_freq = cnt.most_common(5)
section_freq

1. Write a Python function that takes a date, for example, "2024-02-12", and returns the list of articles for that day (extracting it from the month’s archive).
2. Write some code that explores whether the fields "abstract" and "snippet" are always the same or often differ. Which one has more information?
3. Write a function that given one article (in its nested structure), creates a flat dictionary with keys that are relevant for analysis: either the abstract or snippet (see point 2); lead paragraph; headline; keywords concatenated via semicolon; pub_date; document_type; section_name; and type_of_material
4. Write another function that calls the function from point 3 on every article, to create a list of article dictionaries, and convert this list into a dataframe and then store it as a CSV file with the date-month in the title (this is important for point 5 below).
5. Once you have done all of these in the notebook, create a Python script that can be called with a date (from a TikTok video). First, the script looks whether a CSV with cleaned articles is in our folder. If not, calls first the API function to get the articles and then the function that converts them into a CSV. Then, it loads the CSV into a dataframe and it uses filtering to get the articles for the desired date. These articles will be used for the Semantic Similarity portion of the TikTok Project.


In [15]:
def get_articles(date):
    """
    function that takes a date, for example, "2024-02-12", and returns the list of articles for 
    that day (extracting it from the month’s archive)
    """
    result = []
    # extract year, month, day from the date
    datestring = date
    dt = datetime.strptime(datestring, '%Y-%m-%d')
    
    # constant 
    myAPIKey = "iDALMAL9VFMiwzWionTqK3Ve4tFDUDAQ"
    
    # access NYT API
    URL = f"https://api.nytimes.com/svc/archive/v1/{dt.year}/{dt.month}.json?api-key={myAPIKey}"
    data = requests.get(URL)
    articles = data.json()
    
    # add to the list if the article index if it is of the day
    n = len(articles['response']['docs'])
    for i in range(n):
        d = datetime.strptime(articles['response']['docs'][i]['pub_date'][:10], '%Y-%m-%d')
        if dt.day == d.day:
            result.append(articles['response']['docs'][i])
            
    return result
    


In [17]:
articles = get_articles('2024-02-12')

In [31]:
len(articles)

116

In [35]:
def get_article_info(article):
    """
    Write a function that given one article (in its nested structure), creates a flat dictionary 
    with keys that are relevant for analysis: either the abstract or snippet (see point 2); lead 
    paragraph; headline; keywords concatenated via semicolon; pub_date; document_type; section_name; 
    and type_of_material
    """
    result = {}
    # either the abstract or snippet (see point 2)
    if len(article['abstract']) >= len(article['snippet']):
        result['abstract/snippet']= article['abstract']
    else:
        result['abstract/snippet']= article['snippet']
    result['lead_paragraph'] = article['lead_paragraph']
    result['headline'] = article['headline']['main']
    
    # keywords concatenated via semicolon
    k = ""
    for keyword in article['keywords']:
        k+=" " + keyword['value']
    keyword = k.replace(" ", ";") 
    result['keywords'] = keyword[1:]
    
    # others
    result['pub_date'] = article['pub_date']
    result['document_type'] = article['document_type']
    result['section_name'] = article['section_name']
    result['type_of_material'] = article['type_of_material']
    
    return result

In [36]:
get_article_info(articles[0])

{'abstract/snippet': 'A Cetaphil commercial showed a father and daughter connecting over football and the music superstar. But a social media influencer said the idea was stolen from her.',
 'lead_paragraph': 'When an advertisement for Cetaphil lotion was released online days before the Super Bowl, it drew rave reviews for a narrative that evoked a familiar story for parents, football fans and followers of Taylor Swift.',
 'headline': 'Ad Nods to Taylor Swift and Football, Drawing Cheers and Criticism',
 'keywords': 'Advertising;and;Marketing;Super;Bowl;Cosmetics;and;Toiletries;Swift,;Taylor;Kelce,;Travis;Social;Media;TikTok;(ByteDance)',
 'pub_date': '2024-02-12T00:30:32+0000',
 'document_type': 'article',
 'section_name': 'Business Day',
 'type_of_material': 'News'}

In [37]:
def dict_to_pd(articles, date):
    """
    Write another function that calls the function from point 3 on every article, to create a list
    of article dictionaries, and convert this list into a dataframe and then store it as a CSV file 
    with the date-month in the title (this is important for point 5 below).
    """
    df = pd.DataFrame()
    for article in articles:
        article_dict = get_article_info(article)
        d = pd.DataFrame([article_dict])
        df = pd.concat([df,d])
    df.to_csv(f"{date}.csv")

In [38]:
dict_to_pd(articles,'2024-02-12')

Unnamed: 0,abstract/snippet,lead_paragraph,headline,keywords,pub_date,document_type,section_name,type_of_material
0,A Cetaphil commercial showed a father and daug...,When an advertisement for Cetaphil lotion was ...,"Ad Nods to Taylor Swift and Football, Drawing ...",Advertising;and;Marketing;Super;Bowl;Cosmetics...,2024-02-12T00:30:32+0000,article,Business Day,News
0,Taylor Swift and Travis Kelce have been the su...,Extending a weekslong right-wing meltdown over...,Trump Says It Would Be ‘Disloyal’ for Taylor S...,"Swift,;Taylor;Kelce,;Travis;Trump,;Donald;J;Bi...",2024-02-12T00:32:24+0000,article,U.S.,News
0,In a halftime set that touched on more than a ...,A few minutes into Usher’s dynamic and sly Sup...,Usher Brings Precise Details to Pop’s Biggest ...,"Rap;and;Hip-Hop;Super;Bowl;Usher;Keys,;Alicia;...",2024-02-12T02:14:00+0000,article,Arts,Review
0,The pop superstar used a Verizon ad to tell fa...,After days of speculation and online sleuthing...,Beyoncé Announces New Album in Super Bowl Comm...,"Pop;and;Rock;Music;Super;Bowl;Knowles,;Beyonce...",2024-02-12T02:41:56+0000,article,Arts,News
0,"William Albert Haynes Jr., 70, went by “Billy ...","William Albert “Billy Jack” Haynes Jr., who in...",Former W.W.F. Wrestler Arrested in Wife’s Murder,"Murders,;Attempted;Murders;and;Homicides;Wrest...",2024-02-12T03:10:32+0000,article,Arts,News
...,...,...,...,...,...,...,...,...
0,Pregnant women with diabetes or high blood pre...,Women who develop high blood pressure or diabe...,Children Born to Mothers With Pregnancy Compli...,your-feed-science;Pregnancy;and;Childbirth;Dia...,2024-02-12T23:13:06+0000,article,Health,News
0,The shooting took place at the Mount Eden Aven...,A 35-year-old man was killed and five other pe...,One Killed and 5 Wounded in Shooting at Bronx ...,"Subways;Murders,;Attempted;Murders;and;Homicid...",2024-02-12T23:13:38+0000,article,New York,News
0,Our guide to the themes dominating the race.,The special election in New York’s Third Congr...,How Special Is New York’s Special Election?,"Politics;and;Government;Elections;Suozzi,;Thom...",2024-02-12T23:20:47+0000,article,U.S.,News
0,The former president showed up at the federal ...,Former President Donald J. Trump and his lawye...,Trump Attends Court Hearing on Access to Class...,Federal;Criminal;Case;Against;Trump;(Documents...,2024-02-12T23:31:02+0000,article,U.S.,News


In [45]:
#paths = []
for root, dirs, files in os.walk(".", topdown=False):
    for name in files:
        print(name)

Algorithm for NYT API-checkpoint.ipynb
Algorithm for Cosine Similarity.ipynb
algorithmic-indifference-the-dearth-of-news-recommendations-on-tiktok.pdf
tiktok_history_sample.json
Algorithm for NYT API.ipynb
2024-02-12.csv


In [None]:
def cosineSimilarity(vec1, vec2):
    """
    Calculate the cosine similarity between the two vectors
    """
    v1 = np.array(vec1)
    v2 = np.array(vec2)
    cosine = np.dot(v1,v2)/(norm(v1)*norm(v2))
    return cosine

In [None]:
def generate_similarity(date, tiktok_descriptions):
    """
    Once you have done all of these in the notebook, create a Python script that can be called with 
    a date (from a TikTok video). First, the script looks whether a CSV with cleaned articles is in 
    our folder. If not, calls first the API function to get the articles and then the function that 
    converts them into a CSV. Then, it loads the CSV into a dataframe and it uses filtering to get 
    the articles for the desired date. These articles will be used for the Semantic Similarity portion 
    of the TikTok Project.
    """
    # check if a CSV w/ cleaned articles is in folder
    files = []
    for root, dirs, files in os.walk(".", topdown=False):
        for name in files:
            files.append(name)
        
    if f"{date}.csv" not in files:
        articles  = get_articles(date)
        dict_to_pd(articles,date)
        

In [None]:
pd.set_option('display.max_colwidth', None)