# Bachelors Thesis: News Sentiment and Inflation Expectation
Luis Nägelin

19-613-926

Gallusstrasse 41, 9000 St.Gallen

luis.naegelin@student.unisg.ch

Disclaimer and declaration of authorship:

The following code has been written by me (Luis Nägelin) without the direct help of any other person.

I have used tools like Stack Overflow and ChatGPT to write the code.

In [1]:
# import packages
import pandas as pd
import requests as req
import time
import os
import nltk
import newspaper #pip3 install newspaper3k
import numpy as np

### Parameters and Path

In [None]:
# Path where the downloaded files are saved.
my_base_path = '######'    # always change "\" to "/" !!            Path to store data

# Personal NYT API key.
# NOTE(review): a credential committed to source is readable by everyone who
# can see this file. Prefer setting the NYT_API_KEY environment variable; the
# literal below is kept only as a fallback so existing runs behave as before,
# and the key should be rotated if this notebook has been shared.
api_key = os.environ.get('NYT_API_KEY', 'ooFDYm9g0OiQjEx95tjeo67LOe1e8VNp')
# Base URL for the NYT Archive API (note the trailing slash).
base_url = 'https://api.nytimes.com/svc/archive/v1/'

### Functions:

In [None]:
def filter_articles(response, article_list):
    """Extract the fields of interest from an API response.

    Parameters
    ----------
    response : dict
        Decoded JSON payload; the documents are read from
        ``response['response']['docs']``. A payload without that structure
        raises ``KeyError`` so that a malformed response is not mistaken
        for an empty month.
    article_list : list
        List to which one dict per document is appended (mutated in place).

    Returns
    -------
    list
        The same ``article_list`` object, for convenience.

    Notes
    -----
    Individual documents may lack some fields or carry ``None`` for
    ``headline``/``keywords``. Such values are stored as ``None`` (or an
    empty keyword list) instead of aborting the whole batch.
    """
    # Extract data from response
    docs = response['response']['docs']

    for doc in docs:
        # 'headline' may be missing or None; fall back to an empty mapping
        # so that the two title lookups below simply yield None.
        headline = doc.get('headline') or {}
        # 'keywords' may be missing or None; iterating None would raise.
        keywords = doc.get('keywords') or []

        # Key order defines the column order of the resulting DataFrame.
        filtered_doc = {
            'title': headline.get('main'),  # Title of the article
            'print_title': headline.get('print_headline'),  # Print title
            'abstract': doc.get('abstract'),  # Abstract of the article
            'paragraph': doc.get('lead_paragraph'),  # First paragraph
            'text_snippet': doc.get('snippet'),  # Text snippet
            'keywords': [key.get('value') for key in keywords],  # Keyword values
            'date': doc.get('pub_date'),  # Publication date
            'news_desk': doc.get('news_desk'),  # News desk
            'section_name': doc.get('section_name'),  # Section name
            'type_material': doc.get('type_of_material'),  # Type of material
            'word_count': doc.get('word_count'),  # Word count
            'web_url': doc.get('web_url'),  # Web URL
            'uri': doc.get('uri'),  # URI
            'id': doc.get('_id'),  # ID
        }

        article_list.append(filtered_doc)

    return article_list


In [None]:
# saves the data on local device:
def save_articles(base_path, base_name, df, year, month):
    """Write one month of articles to ``<base_path>/<year>/<base_name><year>_<Month>.csv``.

    A folder per year is created on demand and each month is stored in its
    own CSV file inside it.

    Parameters
    ----------
    base_path : str
        Directory under which the per-year folders are created.
    base_name : str
        Prefix of the CSV file name.
    df : object with a ``to_csv(path)`` method
        The data to store (a pandas DataFrame in this notebook).
    year : str or int
        Year of the articles; converted with ``str()`` so both forms work.
    month : int
        Month number, 1 (January) to 12 (December).

    Raises
    ------
    IndexError
        If ``month`` is outside 1..12. Without this check 0 and negative
        values would silently wrap around to a wrong month name.
    """
    name_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

    if not 1 <= month <= 12:
        raise IndexError('month must be between 1 and 12, got ' + repr(month))

    year = str(year)
    path = os.path.join(base_path, year)
    # exist_ok avoids the check-then-create race and a separate exists() call.
    os.makedirs(path, exist_ok=True)

    # Name of the csv file
    file_name = base_name + year + '_' + name_months[month - 1] + '.csv'

    # Store data frame
    df.to_csv(os.path.join(path, file_name))

### Load Metadata over NYT API:

In [None]:
# Download the NYT archive metadata month by month and store each month as CSV.
MAX_RETRIES = 5        # additional attempts after the first request
RETRY_DELAY = 5        # seconds to wait before each retry
REQUEST_TIMEOUT = 60   # seconds; without a timeout a stalled connection hangs forever
REQUEST_PAUSE = 2      # seconds between months to avoid hitting the API request limit

years = list(range(1980, 2021))  # List of years
months = list(range(1, 13))  # List of months

# Iterate over years and months
for year in years:
    year = str(year)
    for month in months:
        articles = []  # List to store the filtered articles
        # base_url may already end with '/', so strip it to avoid 'v1//1980/...'.
        request_url = base_url.rstrip('/') + '/' + year + '/' + str(month) + '.json?api-key=' + api_key

        # Send a GET request for the specified year and month; retry on any
        # non-200 answer or network error, up to MAX_RETRIES additional times.
        my_response = None
        for attempt in range(MAX_RETRIES + 1):
            if attempt:
                time.sleep(RETRY_DELAY)  # Wait before retrying
            try:
                my_response = req.get(request_url, timeout=REQUEST_TIMEOUT)
            except req.exceptions.RequestException as err:
                # Only the exception type is printed: the full message may
                # contain the request URL and therefore the API key.
                print('Request failed for', str(month), year, '-', type(err).__name__)
                my_response = None
                continue
            if my_response.status_code == 200:
                break

        if my_response is not None and my_response.status_code == 200:
            # Use the filter_articles() function to filter out relevant information from the response
            articles = filter_articles(my_response.json(), articles)
            # Convert the filtered articles to a DataFrame
            df = pd.DataFrame(articles)
            # Use the save_articles() function to save the DataFrame as a CSV file
            save_articles(my_base_path, 'NYT_metadata', df, year, month)
        else:
            print('Something was wrong with', str(month), year)

        # Pause between months to avoid hitting the API request limit
        time.sleep(REQUEST_PAUSE)
