# Bachelor's Thesis: News Sentiment and Inflation Expectation
Luis Nägelin

19-613-926

Gallusstrasse 41, 9000 St.Gallen

luis.naegelin@student.unisg.ch

Disclaimer and declaration of authorship:

The following code has been written by me (Luis Nägelin) without the direct help of any other person.

I have used tools like Stack Overflow and ChatGPT to write the code.

In [6]:
# import packages
import pandas as pd
import numpy as np
import os
from IPython.display import clear_output
import nltk
import pysentiment2 as ps
lm = ps.LM() # import the Loughran McDonald dictionary (used below through lm.tokenize / lm.get_score)

In [1]:
# path to metadata:
# NOTE: the load/save functions below build file names by plain string
# concatenation (e.g. `path + filename`, `base_path + year`), so both
# paths should end with a path separator.
my_path = '###########' # path to the data -> folder: NYT_metadata_text
save_path = '########'  # path to save the outputs.

### Functions:

In [4]:
# Function to load the data

# path: path to the directory where the data is stored
# month: string of the month to load
# year: string of the year to load

def load_metadata(path, month, year):
    """Read one monthly metadata CSV and return it as a DataFrame.

    path:  prefix that is concatenated directly with the relative file name
    month: name of the month as a string (e.g. 'January')
    year:  year as a string (e.g. '1990')

    The file is expected at ``<path><year>/NYT_metadata<year>_<month>.csv``;
    its first column becomes the index.
    """
    relative_name = ''.join([year, '/NYT_metadata', year, '_', month, '.csv'])
    frame = pd.read_csv(path + relative_name, index_col=0)

    # Progress message; the pending clear lets the next output replace it.
    print('loaded: ', relative_name)
    clear_output(wait=True)
    return frame

### Because different parts of this script were performed at different stages of the analysis, I have two functions that load the data:

In [None]:
# Load Data Function
def load_data(path, file_name):
    """Load ``<path><file_name>.csv`` and return it as a DataFrame.

    path:      prefix that is concatenated directly with the file name
    file_name: file name (may include a sub-folder) without the '.csv' extension

    The first CSV column is used as the index.
    """
    csv_name = file_name + '.csv'

    # To read the 'keywords' column as lists, pass this to read_csv:
    # converters={'keywords': lambda x: x.strip("[]").replace('"', "'").replace("'","").split(', ')}
    frame = pd.read_csv(path + csv_name, index_col=0)

    # Replace the previous progress message with the current one.
    clear_output(wait=True)
    print('loaded:', csv_name)

    return frame


In [12]:
def filter_relevant_articles(data):
    """Return the US-related rows of ``data`` that have a relevant article type.

    A row is kept when its 'keywords' value contains 'United States' or
    'UNITED STATES' and its 'type_material' value contains one of
    'News', 'Letter', 'Op-Ed', 'Editorial' or 'Brief'. Missing values never match.
    """
    us_pattern = 'United States|UNITED STATES'
    type_pattern = '|'.join(('News', 'Letter', 'Op-Ed', 'Editorial', 'Brief'))

    # Step 1: restrict to articles tagged with the United States.
    mentions_us = data['keywords'].str.contains(us_pattern, na=False)
    us_rows = data[mentions_us]

    # Step 2: of those, keep only the relevant article types.
    has_relevant_type = us_rows['type_material'].str.contains(type_pattern, na=False)
    return us_rows[has_relevant_type]


def filter_for_inflation(data):
    """Return the rows of ``data`` whose 'text' mentions an inflation-related term.

    Matching is a case-sensitive substring search; rows with missing text are dropped.
    """
    inflation_terms = (
        'price index', 'price-index', 'price level', 'inflation', 'deflation',
        'rising cost', 'rising costs', 'falling cost', 'falling costs',
        'rising prices', 'price surge', 'falling prices', 'price-hike',
        'price hike',
    )
    pattern = '|'.join(inflation_terms)

    mentions_inflation = data['text'].str.contains(pattern, na=False)
    return data[mentions_inflation]

In [2]:
def save_articles(base_path, base_name, df, year, month):
    """Write ``df`` to ``<base_path><year>/<base_name><year>_<MonthName>.csv``.

    base_path: prefix that is concatenated directly with ``year`` to form the
               per-year folder (so it should end with a path separator)
    base_name: leading part of the file name (string)
    df:        DataFrame to store
    year:      year as a string
    month:     zero-based month index (0 = January ... 11 = December)

    Raises IndexError when ``month`` is outside 0..11.
    """
    name_months = ('January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December')

    # A negative index would silently wrap around to a different month name.
    if not 0 <= month < len(name_months):
        raise IndexError('month must be between 0 and 11, got {}'.format(month))

    # One folder per year; exist_ok avoids the check-then-create race.
    year_dir = base_path + year
    os.makedirs(year_dir, exist_ok=True)

    file_name = base_name + year + '_' + name_months[month] + '.csv'
    df.to_csv(os.path.join(year_dir, file_name))


### Count total number of entries in data set

In [None]:
# Total number of rows over every monthly metadata file (years 1980 to 2022).
name_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
count = 0
for year in map(str, range(1980, 2023)):
    for month in range(12):
        data = load_metadata(my_path, name_months[month], year)
        count = count + len(data)

### Combine Title, Abstract and First Paragraph to one Text
In order to get the most sentiment information out of the data, it makes sense to combine the text data from the title, the abstract and the first paragraph.

In [8]:
def _clean_sentence(sentence):
    """Remove 'nan' placeholders and free-standing '.' separators from one sentence."""
    kept = []
    for token in sentence.split():
        if token == 'nan':
            # Placeholder left behind when a missing field was converted to a string.
            continue
        if token == '.':
            # Separator inserted when the fields were glued together: keep the
            # sentence boundary by attaching it to the preceding word.
            if kept and not kept[-1].endswith('.'):
                kept[-1] += '.'
            continue
        kept.append(token)
    return ' '.join(kept)


def unique_text(text):
    """Combine title/abstract/paragraph text into one string without duplicates.

    text: the fields joined into a single string (separated by ' . ')

    The text is lower-cased, ';', ':' and the ' (s)' / ' (m)' markers are
    turned into sentence breaks, and the result is split into sentences with
    NLTK. Each sentence is cleaned (stand-alone 'nan' tokens and stand-alone
    '.' separators are removed), empty sentences are dropped, and sentences
    that are identical apart from spacing are kept only once (first occurrence
    wins). Returns the remaining sentences joined by single spaces.

    Note: only whole-word 'nan' tokens are removed, so words that merely
    contain 'nan' (e.g. 'finance') and decimals such as '4.8' stay intact.
    """
    text = text.lower()

    # Replace specific characters and patterns with '.'
    text = (text.replace(';', '.')
                .replace(' (s)', '.')
                .replace(' (m)', '.')
                .replace('  ', ' ')
                .replace(':', '.'))

    seen = set()
    unique_sentences = []
    for sentence in nltk.sent_tokenize(text):
        cleaned = _clean_sentence(sentence)
        if not cleaned:
            # Sentence consisted only of placeholders / separators.
            continue
        # Compare without spaces so spacing differences do not hide duplicates.
        key = cleaned.replace(' ', '')
        if key not in seen:
            seen.add(key)
            unique_sentences.append(cleaned)

    # Join the cleaned sentences (the original joined the uncleaned list by mistake).
    return ' '.join(unique_sentences)


### Combine and Store
This block loads all the data and adds the "text" column to the DataFrame by combining the title, the abstract and the first paragraph with the special "unique_text" function that removes duplicated sentences.

In [None]:
name_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Build the combined 'text' column for every month and write each month back out.
for year in map(str, range(1980, 2023)):
    for month in range(12):
        data = load_metadata(my_path, name_months[month], year)

        # Glue the three fields together as "title . abstract . paragraph . "
        text = data['title'].astype(str)
        text = text + ' . ' + data['abstract'].astype(str)
        text = text + ' . ' + data['paragraph'].astype(str)
        text = text + ' . '

        # Remove duplicated sentences and store the result as the new column.
        data['text'] = text.apply(unique_text)

        # One CSV per month, inside a folder per year.
        save_articles(save_path, 'NYT_metadata', data, year, month)

### Test the quality of the new "text":

In [10]:
# Spot check on a single month: rebuild the 'text' column, then keep only
# the relevant articles that mention inflation.
data = load_metadata(my_path, 'January', '1990')

text = (
    data['title'].astype(str) + ' . '
    + data['abstract'].astype(str) + ' . '
    + data['paragraph'].astype(str) + ' . '
)
data['text'] = text.apply(unique_text)

data = filter_for_inflation(data)
data = filter_relevant_articles(data)
data = data.reset_index(drop=True)

print(len(data))

4


In [11]:
# Inspect one article: the raw fields, the combined text, its tokens and its LM scores.
row = 1

print(data['title'][row], data['abstract'][row], data['paragraph'][row], sep='\n')
print('----------')
print(data['text'][row])
print('----------')
print(lm.tokenize(data['text'][row]))
print(lm.get_score(lm.tokenize(data['text'][row])))

Producer Prices Up 4.8% in '89, Most in 8 Years
LEAD: Paced by big increases for energy and tobacco, producer prices jumped seven-tenths of 1 percent in December to bring 1989 inflation by this measure to 4.8 percent, the highest rate since 1981.
Paced by big increases for energy and tobacco, producer prices jumped seven-tenths of 1 percent in December to bring 1989 inflation by this measure to 4.8 percent, the highest rate since 1981.
----------
producer prices up 4.8% in '89, most in 8 years . lead. paced by big increases for energy and tobacco, producer prices jumped seven-tenths of 1 percent in december to bring 1989 inflation by this measure to 4.8 percent, the highest rate since 1981. .
----------
['produc', 'lead', 'big', 'increas', 'energi', 'tobacco', 'produc', 'jump', 'percent', 'bring', 'inflat', 'measur', 'percent', 'highest', 'rate']
{'Positive': 2, 'Negative': 0, 'Polarity': 0.99999950000025, 'Subjectivity': 0.13333332444444504}


# Look at frequency of articles of different types (type_material)
In this section I analyse the different types of entries that make up the data set. The goal is to gain an understanding of the data and find relevant articles for the analysis.

In [None]:
# Loop over all monthly files and record how many entries fall into each category.

y_start = 1980
y_end = 2023  # exclusive upper bound of the year range

name_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Patterns are loop-invariant, so build them once.
# The type list mirrors the one used inside filter_relevant_articles.
us_pattern = 'United States|UNITED STATES'
relevant_type_pattern = '|'.join(['News', 'Letter', 'Op-Ed', 'Editorial', 'Brief'])

# Collect the monthly counts in plain lists (appending to a NumPy array copies
# the whole array every time) and convert them once after the loop.
frequ_total_entrys = []
frequ_relevant_world = []
frequ_US_articles = []
frequ_relevant_articles = []
frequ_news = []
frequ_letter = []
frequ_oped = []
frequ_editorial = []
frequ_brief = []
frequ_about_inflation = []

# Iterate over the years and months
for y in range(y_start, y_end):
    year = str(y)
    for i in range(0, 12):
        month = name_months[i]

        my_filename = year + '/NYT_metadata' + year + '_' + month
        data = load_data(my_path, my_filename)

        # Total number of entries in this month
        frequ_total_entrys.append(len(data))

        # Articles tagged with the United States (any article type)
        data_US = data[data['keywords'].str.contains(us_pattern, na=False)]
        frequ_US_articles.append(len(data_US))

        # Relevant article types world-wide, i.e. WITHOUT the US restriction.
        # (filter_relevant_articles also applies the US filter, so it must not be used here.)
        data_relevant_world = data[data['type_material'].str.contains(relevant_type_pattern, na=False)]
        frequ_relevant_world.append(len(data_relevant_world))

        # Relevant article types AND tagged with the United States.
        # (Previously this only re-applied the US filter, so the count was
        # identical to frequ_US_articles.)
        data_relevant_US = filter_relevant_articles(data)
        frequ_relevant_articles.append(len(data_relevant_US))

        # Number of relevant US articles whose type is exactly News / Letter / ...
        type_counts = data_relevant_US['type_material'].value_counts()
        frequ_news.append(type_counts.get('News', 0))
        frequ_letter.append(type_counts.get('Letter', 0))
        frequ_oped.append(type_counts.get('Op-Ed', 0))
        frequ_editorial.append(type_counts.get('Editorial', 0))
        frequ_brief.append(type_counts.get('Brief', 0))

        # Relevant US articles that mention inflation
        data_inflation = filter_for_inflation(data_relevant_US)
        frequ_about_inflation.append(len(data_inflation))

# Convert to float arrays (same dtype the np.append-based version produced).
frequ_total_entrys = np.array(frequ_total_entrys, dtype=float)
frequ_relevant_world = np.array(frequ_relevant_world, dtype=float)
frequ_US_articles = np.array(frequ_US_articles, dtype=float)
frequ_relevant_articles = np.array(frequ_relevant_articles, dtype=float)
frequ_news = np.array(frequ_news, dtype=float)
frequ_letter = np.array(frequ_letter, dtype=float)
frequ_oped = np.array(frequ_oped, dtype=float)
frequ_editorial = np.array(frequ_editorial, dtype=float)
frequ_brief = np.array(frequ_brief, dtype=float)
frequ_about_inflation = np.array(frequ_about_inflation, dtype=float)

In [None]:
# Assemble the monthly counts into one table and write it to disk.
# 'Date' is a fractional-year value with a step of 1/12, i.e. one entry per month.
# NOTE(review): np.arange with a float step can be off by one element in
# general; the frame construction fails if its length differs from the counts.
frequ_data = pd.DataFrame({'Date': np.arange(y_start, y_end, 1/12)}).assign(
    frequ_total_entrys=frequ_total_entrys,            # all entries
    frequ_relevant_world=frequ_relevant_world,        # relevant articles, world-wide
    frequ_US_articles=frequ_US_articles,              # articles related to the United States
    frequ_relevant_articles=frequ_relevant_articles,  # relevant articles related to the United States
    frequ_news=frequ_news,                            # type 'News'
    frequ_letter=frequ_letter,                        # type 'Letter'
    frequ_oped=frequ_oped,                            # type 'Op-Ed'
    frequ_editorial=frequ_editorial,                  # type 'Editorial'
    frequ_brief=frequ_brief,                          # type 'Brief'
    frequ_about_inflation=frequ_about_inflation,      # articles about inflation
)

# Store the table without the row index.
frequ_data.to_csv(save_path + '/frequ_article_type.csv', index=False)


### Business Desk Analysis
The idea was to filter for all economic articles in order to have something to compare my filter for inflation to. (Was not used in final analysis.)
I stored all news-desk keywords in a table and manually looked for "business-related" keywords. Then I copied them into the file: business_desk

In [None]:
# for looking at business desk data. (Was not used in final analysis.)
# NOTE: this value is concatenated directly with "business_desk.txt" when the
# file is opened, so it should end with a path separator.
path_to_business_desk = '#####' # path to the file: business_desk.txt

In [None]:
# Read the business-desk keywords, one per line.
# Only the line terminator is stripped: cutting off the last character
# unconditionally would truncate the final keyword whenever the file does
# not end with a newline.
with open(path_to_business_desk+ "business_desk.txt", 'r') as file:
    business_desk_keywords = [line.rstrip('\n') for line in file]

In [None]:
def filter_news_desk(data, filterkeywords):
    """Return US-related rows of ``data`` that belong to one of the given news desks.

    data:           DataFrame with 'news_desk' and 'keywords' columns
    filterkeywords: collection of news-desk names to keep (exact match)

    A row is kept when its 'news_desk' value is in ``filterkeywords`` and its
    'keywords' value contains 'United States' or 'UNITED STATES'.
    """
    on_selected_desk = data['news_desk'].isin(filterkeywords)
    desk_rows = data[on_selected_desk]

    mentions_us = desk_rows['keywords'].str.contains('United States|UNITED STATES', na=False)
    return desk_rows[mentions_us]

### Load all the data and filter with the business_desk keywords, then calculate the sentiment.

In [None]:
text_sorce = 'text'  # column that holds the text to score
y_start = 1980
y_end = 2023  # exclusive upper bound of the year range

name_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Each article is classified by the sign of its LM polarity score:
# positive article: score > 0
# neutral article:  score == 0
# negative article: score < 0

sentiment_of_articles = []  # one np.array([num_positive, num_neutral, num_negative]) per month
articles_per_month = []  # number of scored (non-empty) articles per month
empthy_counter = 0  # number of entries skipped because their text is not a string

# Iterate over years
for y in range(y_start, y_end):
    year = str(y)

    # Iterate over months
    for i in range(0, 12):
        month = name_months[i]

        # Load data and filter by news desk keywords
        my_filename = year + '/NYT_metadata' + year + '_' + month
        data = filter_news_desk(load_data(my_path, my_filename), business_desk_keywords)

        articles_counter = 0  # scored articles in this month
        positive_articles = 0
        neutral_articles = 0
        negative_articles = 0

        # Iterate over the column values directly: a label-based lookup per
        # index would return a Series (and skip the row) for duplicated labels.
        for text in data[text_sorce]:
            if not isinstance(text, str):
                # Missing text (e.g. NaN): nothing to score, but keep count.
                empthy_counter += 1
                continue

            articles_counter += 1

            tokenized_text = lm.tokenize(text)
            score = lm.get_score(tokenized_text)['Polarity']  # get sentiment score

            if score > 0:
                positive_articles += 1
            elif score < 0:
                negative_articles += 1
            else:
                neutral_articles += 1

        sentiment_of_articles.append(np.array([positive_articles, neutral_articles, negative_articles]))
        articles_per_month.append(articles_counter)

# Convert once after the loop (float dtype, as produced by the np.append-based version).
articles_per_month = np.array(articles_per_month, dtype=float)

# One row per month, columns: positive, neutral, negative
monthly_counts = np.array(sentiment_of_articles, dtype=float).reshape(-1, 3)
positive_articles_per_month = monthly_counts[:, 0].copy()
neutral_articles_per_month = monthly_counts[:, 1].copy()
negative_articles_per_month = monthly_counts[:, 2].copy()

# Mean sentiment = (positive - negative) / all scored articles;
# months without any scored article get a mean of 0.
monthly_totals = monthly_counts.sum(axis=1)
mean_sentiment_per_month = np.divide(
    positive_articles_per_month - negative_articles_per_month,
    monthly_totals,
    out=np.zeros_like(monthly_totals),
    where=monthly_totals > 0,
)
