# Headline processing

This notebook cleans the headlines and produces a CSV containing simplified tokens.

## Imports

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from pandas.core.common import flatten
import matplotlib.pyplot as plt
import seaborn as sns

## Setup

In [2]:
pd.set_option("max_colwidth", 0)

## Data sourcing

In [3]:
articles = pd.read_csv("./data/articles.csv")

articles.head()

Unnamed: 0,title,description,link,source
0,"GCSE results day 2023 LIVE: Pass grades fall for second year running in England, Wales and Northern Ireland - with 68.2% marked at 4/C","Follow MailOnline's liveblog today as hundreds of thousands of pupils in England, Wales and Northern Ireland pick up their GCSE results.",https://www.dailymail.co.uk/news/live/article-12439999/gcse-results-day-live-2023-exams-students-college-sixth-form.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
1,Wagner 'had full mobilisation plan if Prigozhin was killed' it is claimed as they vow revenge for 'assassination' - as MI6 chief says there are signs Putin 'took him out' in plane crash and if he's not dead 'he soon will be',"Russian outlet Readovka, closely linked with Prigozhin, reported Wagner had a 'long-established mechanism of action in the event of the death of Yevgeny Prigozhin or Dmitry Utkin'.",https://www.dailymail.co.uk/news/article-12440165/Wagner-mobilisation-plan-Prigozhin-killed-claimed-vow-revenge-assassination-MI6-chief-says-signs-Putin-took-plane-crash-hes-not-dead-soon-be.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
2,Prince Harry WILL be in the UK on the anniversary of the Queen's death but will 'snub his family' - as it is revealed he faces having to ask King if he can stay at Frogmore Cottage,"The Duke of Sussex, 38, is set to fly out from California to London to make an appearance at the WellChild Awards on September 8.",https://www.dailymail.co.uk/news/article-12440111/Prince-Harry-forced-ask-Palaces-permission-accommodation-royal-estate-face-staying-hotel-return-UK-without-Meghan-eve-anniversary-Queens-death-King-evicted-Sussexes-Frogmore-Cottage.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
3,Wagner plane crash LIVE: Yevgeny Prigozhin's death risks retaliation in Russia after mercenary group threaten Vladimir Putin,LIVE BLOG: Wagner mercenaries have threatened to march on Moscow after it was claimed warlord Yevgeny Prigozhin was among the dead in a plane crash near the Russian capital.,https://www.dailymail.co.uk/news/live/article-12440113/Wagner-plane-crash-LIVE-Yevgeny-Prigozhin-Putin.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
4,"Yevgeny Prigozhin was assassinated 'as a gift for Zelensky to celebrate Ukraine's victory day today', Putin's former spokesman claims","Sergei Markov, a stern supporter of the Russian president and formerly a close advisor, said it was 'absolutely clear that Prigozhin [was] killed by [the] Ukrainian intelligence service.'",https://www.dailymail.co.uk/news/article-12440163/Yevgeny-Prigozhin-assassinated-gift-Zelensky-celebrate-Ukraines-victory-day-today-Putins-former-spokesman-claims.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail


## Most frequent words

### Processing

In [4]:
# Subset the data

title_df = articles[["title", "source"]].copy()

In [5]:
title_df["keywords"] = title_df["title"].str.lower()

In [14]:
title_df["keywords"]

0      gcse result day 2023 live pas grade fall second year run england wale northern ireland 68.2 mark 4/c                                   
1      wagner had full mobilisation plan prigozhin kill claim vow revenge assassination mi6 chief say sign putin took plane crash dead he soon
2      prince harry anniversary queen death snub family reveal face ask king stay frogmore cottage                                            
3      wagner plane crash live yevgeny prigozhin death risk retaliation russia mercenary group threaten vladimir putin                        
4      yevgeny prigozhin assassinate as gift zelensky celebrate ukraine victory day today putin former spokesman claim                        
                                                            ...                                                                               
163    mortgage rate five way save money                                                                                                      

In [6]:
# Split into tokens

title_df["keywords"] = title_df["keywords"].apply(word_tokenize)

In [7]:

# Create an object that can be used to lemmatise

lemma = WordNetLemmatizer()

# Create a dictionary to map tags to ones that the lemmatiser will understand.

tag_map = defaultdict(lambda : "n")  # by default, assume nouns
tag_map['J'] = "a"  # adjectives
tag_map['V'] = "v"  # verbs
tag_map['R'] = "r"  # adverbs

# Create a function to get the pos tags for a set of tokens, and return the tokens in a way the
# lemmatizer can interpret
def get_wordnet_tags(tokens):
    """Returns WordNet pos_tags for a set of tokens"""
    
    # Tag tokens with pos_tagger
    tagged_tokens = pos_tag(tokens)
    
    # Convert each tag to a version wordnet can understand
    tagged_tokens = [(token[0], tag_map[token[1][0]]) for token in tagged_tokens]
    
    return tagged_tokens

In [8]:
# pos_tag the tokens

title_df["keywords"] = title_df["keywords"].apply(get_wordnet_tags)

# Lemmatise the tokens

title_df["keywords"] = title_df["keywords"].apply(lambda tokens: [lemma.lemmatize(word=token[0], pos=token[1]) for token in tokens])

In [15]:
# Filter out punctuation, stop words, and very short words

stops = stopwords.words("english")

# Add specific stopwords

stops.extend(["n't"])

important_short_words = ["pm", "us", "uk", "gb"]

def filter_tokens(tokens):

    return [t for t in tokens
            if t not in stops
            and (len(t) > 2
            or t in important_short_words)]

title_df["keywords"] = title_df["keywords"].apply(filter_tokens)

In [16]:
# Remove specifically apostrophes

title_df["keywords"] = title_df["keywords"].apply(lambda tokens: [x.replace("'", "") for x in tokens])

In [17]:
# Join token lists back into strings

title_df["keywords"] = title_df["keywords"].apply(lambda tokens: " ".join(tokens))

In [18]:
title_df.sample(3)

Unnamed: 0,title,source,keywords
80,Which oil should you crack open with dinner?,Daily Mail,
53,"Murder investigation launched after Stagecoach bus driver, 23, is killed in horror crash with van",Daily Mail,
153,What are T-levels and what are the grades worth?,BBC,


## Data export

In [13]:
title_df.to_csv("./data/processed_headlines.csv", index=False)