In [1]:
import pandas as pd
from glob import glob
from pathlib import Path

# Imports

In [2]:
DATA_DIR = Path("../../data")
input_dir = DATA_DIR / "raw" / "nyt-data"

In [3]:
file_paths = glob(str(input_dir / "*.json"))

In [4]:
file_paths

['../../data/raw/nyt-data/paragraphs.json',
 '../../data/raw/nyt-data/articles-search.json']

# Exploration

In [5]:
articles = pd.read_json(file_paths[0], orient='records')
paragraphs = pd.read_json(file_paths[1], orient='records')

In [6]:
articles.head()

Unnamed: 0,url,id,paragraphs
0,http://www.nytimes.com/1997/01/01/arts/after-a...,4fd20e5c8eb7c8105d77f747,[after the rush of holiday festivities and the...
1,http://www.nytimes.com/1997/01/01/world/serb-p...,4fd20cf08eb7c8105d77c720,"[belgrade, serbia, wednesday, jan. 1—, with th..."
2,http://www.nytimes.com/1997/01/01/sports/for-4...,4fd1fd738eb7c8105d75d42e,"[san francisco, dec. 31—, the change was made,..."
3,http://www.nytimes.com/1997/01/01/world/islami...,4fd1fd738eb7c8105d75d42b,"[isfahan, iran—, isfahan, wrote jean chardin, ..."
4,http://www.nytimes.com/1997/01/01/opinion/l-me...,4fd1fd738eb7c8105d75d432,"[to the editor:, your dec. 30 front-page repor..."


In [7]:
articles.dtypes

url           object
id            object
paragraphs    object
dtype: object

In [8]:
paragraphs.head()

Unnamed: 0,lead,author,headline,abstract,locations,word_count,snippet,source,subjects,url,desk,date,section,type,id
0,After the rush of holiday festivities and the ...,"{'person': [{'organization': '', 'role': 'repo...","After All the Champagne, Time to Smell the Coffee","Jon Pareles reviews new CD's by Greg Brown, Bi...",[],530.0,After the rush of holiday festivities and the ...,The New York Times,"[RECORDINGS (AUDIO), MUSIC]",http://www.nytimes.com/1997/01/01/arts/after-a...,Cultural Desk,1997-01-01,Arts,Review,4fd20e5c8eb7c8105d77f747
1,With the high-pitched chirps of thousands of w...,"{'person': [{'organization': '', 'role': 'repo...",Serb Protests Take On a Holiday Mood,Anti-Government protesters in Belgrade welcome...,"[BELGRADE (SERBIA), SERBIA, YUGOSLAVIA]",723.0,With the high-pitched chirps of thousands of w...,The New York Times,"[NEW YEAR, DEMONSTRATIONS AND RIOTS]",http://www.nytimes.com/1997/01/01/world/serb-p...,Foreign Desk,1997-01-01,World,News,4fd20cf08eb7c8105d77c720
2,"The change was made, appropriately, after the ...","{'person': [{'organization': '', 'role': 'repo...","For 49ers, Pressure Will Be On the Line",Preview of San Francisco 49ers' playoff game a...,[],799.0,"The change was made, appropriately, after the ...",The New York Times,"[PLAYOFF GAMES, FOOTBALL]",http://www.nytimes.com/1997/01/01/sports/for-4...,Sports Desk,1997-01-01,Sports,News,4fd1fd738eb7c8105d75d42e
3,"Isfahan, wrote Jean Chardin, the 17th-century ...","{'person': [{'organization': '', 'role': 'repo...",Islamic Militancy vs. Money-Making in Iran,Iran is locked in fierce and bitter struggle o...,[IRAN],1194.0,"Isfahan, wrote Jean Chardin, the 17th-century ...",The New York Times,"[WOMEN, ISLAM, TRAVEL AND VACATIONS]",http://www.nytimes.com/1997/01/01/world/islami...,Foreign Desk,1997-01-01,World,News,4fd1fd738eb7c8105d75d42b
4,To the Editor: Your Dec. 30 front-page report ...,,Memo to Helms: Why and How We Need U.N.,"Letter from Paul Kennedy, Yale history profess...",[],373.0,To the Editor: Your Dec. 30 front-page repo...,The New York Times,[],http://www.nytimes.com/1997/01/01/opinion/l-me...,Editorial Desk,1997-01-01,Opinion,Letter,4fd1fd738eb7c8105d75d432


In [9]:
paragraphs.dtypes

lead                  object
author                object
headline              object
abstract              object
locations             object
word_count           float64
snippet               object
source                object
subjects              object
url                   object
desk                  object
date          datetime64[ns]
section               object
type                  object
id                    object
dtype: object

# Conversions

Isolate relevant columns

paragraphs: id, paragraphs (contains the text of the articles)
articles: id, date

In [10]:
paragraphs = paragraphs[['id', 'date']]
articles = articles[['id', 'paragraphs']]

Now we can merge them into one dataframe

In [11]:
data = pd.merge(paragraphs, articles, on='id', how='inner')

In [12]:
data = data.rename(columns={'paragraphs': 'text'})

In [13]:
data.iloc[1]["text"]

['belgrade, serbia, wednesday, jan. 1—',
 'with the high-pitched chirps of thousands of whistles and a display of fireworks, anti-government protesters welcomed in the new year, expressing hope for the beginnings of democracy in serbia and for the end of one-man rule by slobodan milosevic.',
 "on a sound stage erected in front of the snow-clad statue of prince mihailo in the central republic square, rock groups and actors led the new year's eve festivities, interspersed with cameo appearances by leaders of the opposition.",
 "the forbidding police presence during the previous six days of demonstrations was not on display on new year's eve, and the heavy snow let up. that gave the revelers a chance to regain the warm feelings and good humor that characterized much of the past six weeks of protest, which took shape after the government annulled opposition victories in municipal elections in november.",
 "the boisterous celebration, and a similar street party held by university students a

Convert the values of the text attribute from a list of strings to one string

In [14]:
data['text'] = data['text'].apply(lambda x: ' '.join(x))

In [15]:
data.iloc[1]["text"]

"belgrade, serbia, wednesday, jan. 1— with the high-pitched chirps of thousands of whistles and a display of fireworks, anti-government protesters welcomed in the new year, expressing hope for the beginnings of democracy in serbia and for the end of one-man rule by slobodan milosevic. on a sound stage erected in front of the snow-clad statue of prince mihailo in the central republic square, rock groups and actors led the new year's eve festivities, interspersed with cameo appearances by leaders of the opposition. the forbidding police presence during the previous six days of demonstrations was not on display on new year's eve, and the heavy snow let up. that gave the revelers a chance to regain the warm feelings and good humor that characterized much of the past six weeks of protest, which took shape after the government annulled opposition victories in municipal elections in november. the boisterous celebration, and a similar street party held by university students a few blocks away,

In [16]:
data.isnull().sum().divide(len(data)).sort_values(ascending=False)

id      0.0
date    0.0
text    0.0
dtype: float64

There are no missing values

Sort by Date

In [17]:
data['date'] =  pd.to_datetime(data['date'])

In [18]:
data.sort_values('date', inplace=True);

In [19]:
# Get the range of dates
period = (data['date'].min(), data['date'].max())

# Format the output
formatted_range = tuple(date.strftime("%Y-%m-%d") for date in period)
print("Period of time:", formatted_range)

Period of time: ('1990-01-01', '2016-06-05')


# Preprocessing

As described by Yao et al. in their Paper "Dynamic Word Embeddings for Evolving Semantic Discovery" they follow these preprocessing steps:

- Convert to lowercase
- Remove Punctuation
- Remove stop-words
- Remove rare words (< 200x in the whole corpus)

## 1. Convert to lowercase


In [20]:
data["text"] = data["text"].str.lower()

## 2. Remove Punctuations

Eliminate the punctuation, URL, and @

In [21]:
import re

def clean_text(text):
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

    return text

In [22]:
data["text"] = data["text"].apply(clean_text)

### 3. Remove Stopwords

In [23]:
import nltk
# nltk.download('punkt')

def remove_stopwords(text):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return " ".join(filtered_text)

In [24]:
data["text"] = data["text"].apply(remove_stopwords)

## 4. Remove rare words

In [25]:
def remove_rare_words(df, n = 200):
    from collections import Counter

    # Step 1: Tokenize the text in each row into words
    df['tokens'] = df['text'].str.split()

    # Step 2: Count the occurrences of each word across the entire dataset
    word_counts = Counter(word for tokens in df['tokens'] for word in tokens)

    print(f"Total: {len(word_counts)}")

    # Step 3: Filter out words that occur less than 200 times
    words_to_keep = {word for word, count in word_counts.items() if count > n}

    print(f"Filtered: {len(words_to_keep)}")

    # Step 4: Reconstruct the text by joining the remaining words back together
    df['text_filtered'] = df['tokens'].apply(lambda tokens: ' '.join(word for word in tokens if word in words_to_keep))

    df['text'] = df['text_filtered']

    return df.drop(columns=['tokens', 'text_filtered'])

In [26]:
data = remove_rare_words(data)

Total: 545057
Filtered: 22062


Let us now check if all observations survived the text-cleaning

In [27]:
data["text"].isna().sum()

0

In [28]:
num_texts_with_zero_length = (data["text"].str.len() == 0).sum()

print(f"Number of texts with zero length: {num_texts_with_zero_length}")

Number of texts with zero length: 7054


These are texts which only consisted of stopwords/rare-words.
They did not survive the text-cleaning process and are subsequently removed

In [29]:
data = data[data["text"].str.len() > 0]

In [30]:
data.head()

Unnamed: 0,id,date,text
24510,4fd190908eb7c8105d697ccf,1990-01-01,third national basketball association season c...
24511,4fd191f48eb7c8105d69a979,1990-01-01,seoul south korea dec 31 former president test...
24512,4fd19b228eb7c8105d6abae7,1990-01-01,editor william call navy oped nov 29 seriously...
24513,4fd18d4e8eb7c8105d691a2a,1990-01-01,diary h l edited charles illustrated pages alf...
24514,4fd19b228eb7c8105d6abadd,1990-01-01,madrid dec 31 nobel prize literature jose rece...


In [31]:
len(data)

98360

Save whole corpus as one

In [32]:
output_dir = DATA_DIR / "processed" / "nyt-data"
data.to_csv(output_dir / "corpus_all.csv", index=False, mode='w')

# Split & Save

In [33]:
data["year"] = data["date"].dt.year

In [34]:
grouped = data.groupby('year')

In [35]:
output_dir = DATA_DIR / "processed" / "nyt-data"

output_dir.mkdir(parents=True, exist_ok=True)

for year, group in grouped:
    filename = f'{year}_data.csv'
    group.to_csv(output_dir / filename, index=False)
    print(f'Saved {filename}')

Saved 1990_data.csv
Saved 1991_data.csv
Saved 1992_data.csv
Saved 1993_data.csv
Saved 1994_data.csv
Saved 1995_data.csv
Saved 1996_data.csv
Saved 1997_data.csv
Saved 1998_data.csv
Saved 1999_data.csv
Saved 2000_data.csv
Saved 2001_data.csv
Saved 2002_data.csv
Saved 2003_data.csv
Saved 2004_data.csv
Saved 2005_data.csv
Saved 2006_data.csv
Saved 2007_data.csv
Saved 2008_data.csv
Saved 2009_data.csv
Saved 2010_data.csv
Saved 2011_data.csv
Saved 2012_data.csv
Saved 2013_data.csv
Saved 2014_data.csv
Saved 2015_data.csv
Saved 2016_data.csv
