In [1]:
import os
import pickle

import pandas as pd

In [2]:
def get_nyt_data(year: int, data_path: str) -> list[str]:
    """Read one year's pickled NYT records and return their article texts.

    The file read is ``<data_path>/<year>-lemmatized.pkl``. The unpickled
    object is iterated and the ``"article"`` entry of each item is collected.

    Note: ``pickle.load`` can execute arbitrary code while deserializing, so
    only point this at files from a trusted source.

    Args:
        year: The year whose file should be loaded
        data_path: Directory containing the per-year pickle files

    Returns:
        List with the ``"article"`` value of every record, in file order

    """
    pickle_file = os.path.join(data_path, f"{year}-lemmatized.pkl")
    with open(pickle_file, "rb") as handle:
        records = pickle.load(handle)
    texts = []
    for record in records:
        texts.append(record["article"])
    return texts

In [3]:
def get_mfd_count(article, words=None):
    """Count how many times MFD vocabulary words occur in a text.

    The text is lower-cased and, for every vocabulary word, the number of
    non-overlapping substring occurrences (``str.count``) is added up. Because
    matching is by substring, a word is also counted when it appears inside a
    longer word. Vocabulary entries are used as given (not lower-cased), so an
    entry containing upper-case characters can never match.

    Args:
        article: The text to scan (an article or a single sentence)
        words: Iterable of words to look for. When omitted, the module-level
            ``mfd_words`` list is used, which keeps existing one-argument
            calls working unchanged.

    Returns:
        Total number of substring occurrences summed over all words

    """
    if words is None:
        # Fall back to the notebook-level vocabulary; it must be defined
        # before the first call that relies on the default.
        words = mfd_words
    lowered = article.lower()
    return sum(lowered.count(word) for word in words)

In [4]:
def get_sentences(article):
    """Break a text into sentence-like pieces at every period.

    The text is split on the ``"."`` character only; each piece has its
    surrounding whitespace removed, and pieces that end up empty are dropped.

    Args:
        article: The article text

    Returns:
        List of non-empty, stripped pieces in their original order

    """
    sentences = []
    for chunk in article.split("."):
        trimmed = chunk.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

In [5]:
def get_articles_with_word(word: str, nyt_articles: list[str]) -> list[tuple[str, int]]:
    """Collect the sentences that contain a word, each with its MFD count.

    Despite the function name, the result is per sentence, not per article:
    every article is split with ``get_sentences`` and each sentence that
    contains ``word`` is paired with ``get_mfd_count(sentence)``.

    Matching uses ``word in sentence``, i.e. a case-sensitive substring test,
    so ``"drug"`` also matches ``"drugs"`` but not ``"Drug"``.

    Args:
        word: The text to search for
        nyt_articles: List of NYT article texts

    Returns:
        List of ``(sentence, mfd_count)`` tuples in article/sentence order

    """
    return [
        (sentence, get_mfd_count(sentence))
        for article in nyt_articles
        for sentence in get_sentences(article)
        if word in sentence
    ]

In [19]:
def print_samples(year, word, k=10):
    """Print the k matching sentences with the highest MFD word counts.

    Loads the articles for ``year`` from the module-level ``data_path``,
    gathers every sentence containing ``word`` together with its MFD count,
    orders them by count (highest first, ties keep their original order) and
    prints the first ``k``.

    Args:
        year: The year whose articles are searched
        word: The text a sentence must contain to be included
        k: Maximum number of sentences to print

    """
    scored_sentences = get_articles_with_word(word, get_nyt_data(year, data_path))
    # In-place stable sort; element 1 of each tuple is the MFD count.
    scored_sentences.sort(key=lambda pair: pair[1], reverse=True)
    for rank, (sentence, count) in enumerate(scored_sentences[:k], start=1):
        print(f"Sample {rank}:")
        print(f"Sentence: {sentence}")
        print(f"Count of MFD words': {count}")
        print("-" * 80)

In [10]:
# Load the MFD vocabulary. get_mfd_count reads the module-level `mfd_words`
# list, so this cell must run before any cell that calls print_samples.
mfd_df = pd.read_csv("./data/mfd2.csv")
mfd_words = mfd_df["word"].tolist()

In [11]:
# Years in descending order: 2007 down to 1987 inclusive.
years = list(range(2007, 1986, -1))

In [12]:
# Directory holding the per-year "<year>-lemmatized.pkl" files.
# print_samples reads this module-level name when it calls get_nyt_data.
data_path = "./data/NYT"

In [13]:
# Top 10 (default k) sentences from 1998 containing "Monica", ranked by MFD count.
year = 1998
word = "Monica"
print_samples(year, word)

Sample 1:
Sentence: , S, 47M 3:47:027052 Muscarella, G, 32M 3:47:027053 Brown, P, 32M 3:47:037054 Basuk, P, 44M 3:47:037055 Davidson, B, 38M 3:47:037056 Mahmond, N, 31F 3:47:037057 Zenger, J, 43M 3:47:037058 Delgado, F, 36M 3:47:037059 Tipping, S, 48F 3:47:037060 Artuso, A, 46M 3:47:047061 Sirakovsky, S, 36F 3:47:047062 Vessaz, B, 40F 3:47:047063 Marr, K, 30F 3:47:057064 D'Angelo, C, 39M 3:47:057065 Anquetil, A, 41M 3:47:057066 Bruneau, J, 55M 3:47:057067 Goodall, P, 46M 3:47:067068 May, S, 22F 3:47:067069 Niedermann, R, 48M 3:47:067070 Van Helden, P, 34M 3:47:077071 Soley, M, 34F 3:47:077072 Woeltjen, W, 45M 3:47:077073 Bartol, M, 27F 3:47:087074 Depree, R, 32M 3:47:087075 Ragan, C, 30M 3:47:087076 Debiard, J, 40M 3:47:087077 Kornreich, R, 27M 3:47:087078 Grandjot, B, 32F 3:47:087079 Peattie, W, 39M 3:47:097080 Cretu, C, 31M 3:47:097081 Andersson, B, 46M 3:47:097082 Ashbrook, J, 34M 3:47:097083 Rebick, S, 23F 3:47:107084 Bonjour, P, 49M 3:47:107085 Mizzon, A, 49M 3:47:107086 Kyle, C, 

In [14]:
# Top 10 (default k) sentences from 1996 containing "drug", ranked by MFD count.
year = 1996
word = "drug"
print_samples(year, word)

Sample 1:
Sentence: Because of drug addiction and alcoholism, it observed, mothers are reluctant  to let children play unsupervised in the parks, children walk to school along routes protected by the police, churches open only for worship services and many storekeepers unlock their doors only in response to a ringing bell
Count of MFD words': 21
--------------------------------------------------------------------------------
Sample 2:
Sentence: But to tip off the media? To get on  prime-time news and invite other possible victims to come forward? Where is justice in all of this?On the front page and page 15 the same day there was an article, "Mothers Whose Drug Habit Kills Their Babies," about five infants who died as victims of  mothers on drugs and a state system that proves inefficient in protecting them
Count of MFD words': 21
--------------------------------------------------------------------------------
Sample 3:
Sentence: He faithfully recounts the  facts about Paul Leonard New

In [15]:
# Same query as the 1996 "drug" cell, but for 2004.
year = 2004
word = "drug"
print_samples(year, word)

Sample 1:
Sentence: That is because those three institutions, after undertaking separate reviews of test data available on various painkillers, reached the same conclusion: For most patients, Vioxx, Celebrex and a related drug, Bextra, did not work any better than older pain relievers or provide any safety benefits beyond them
Count of MFD words': 21
--------------------------------------------------------------------------------
Sample 2:
Sentence: Efforts to increase that number have gone slowly because of high drug prices, fights over patents, a lack of money from donors, reluctance by African leaders to admit that their nations have epidemics and the inability of shattered health care systems to muster enough doctors, nurses and laboratories to safely deliver the drugs
Count of MFD words': 20
--------------------------------------------------------------------------------
Sample 3:
Sentence: To the Editor:Here's another reason felons who have served their sentences should be allowe

In [20]:
# Top 20 sentences from 2005 containing "beef", ranked by MFD count.
word = "beef"
year = 2005
print_samples(year, word, k=20)

Sample 1:
Sentence: Beef Stew with Sweet and Hot Paprika  Adapted from ''Italian Slow and Savory'' by Joyce Goldstein (Chronicle Books, 2004)  Time: About 3 hours plus 2 hours' marinating3 pounds beef stew meat or 4 pounds short ribs with bone, in 2-inch chunks1/2 cup extra virgin olive oil  3 tablespoons sweet paprika, Spanish or Hungarian  Salt and freshly ground black pepper  3 large onions, chopped  2 cloves garlic, minced  2 teaspoons ground cumin1/4 teaspoon hot paprika or cayenne, or to taste  1 cup dry red wine  1 1/2 cups chopped, seeded canned plum tomatoes, with some juice  Grated zest of 1 lemon  1 bay leaf, sprig fresh marjoram and sprig rosemary, tied together
Count of MFD words': 10
--------------------------------------------------------------------------------
Sample 2:
Sentence: Just days before the planned resumption of bus service across divided Kashmir, threats from militant groups operating on the Indian side of Kashmir have begun to sow fear among would-be passen

In [17]:
# Top 10 (default k) sentences from 2000 containing "gasoline", ranked by MFD count.
word = "gasoline"
year = 2000
print_samples(year, word)

Sample 1:
Sentence: Drawing on lower-than-usual inventories, Great Lakes refineries now must produce a reformulated gasoline that burns cleaner in adherence with the latest Environmental Protection Agency regulations
Count of MFD words': 13
--------------------------------------------------------------------------------
Sample 2:
Sentence: Swonk, a senior vice president and chief economist at the Bank One Corporation, said that rising interest rates and higher gasoline prices seemed to be having little effect so far on the auto industry, traditionally a sector particularly sensitive to rates and gas prices
Count of MFD words': 11
--------------------------------------------------------------------------------
Sample 3:
Sentence: A small group of protesters attacked the offices of the National Elections Commission, the highest government body that supervises the elections, and broke windows and damaged a door with rocks, pipes and at least one gasoline bomb
Count of MFD words': 11
-----

In [21]:
# Top 20 sentences from 2002 containing "bread", ranked by MFD count.
word = "bread"
year = 2002
print_samples(year, word, k=20)

Sample 1:
Sentence: 7101 Gianatti, M, 22 M 3:52:45  7102 Ambrosi, A, 32 F 3:52:45  7103 Barile, J, 40 F 3:52:46  7104 Furlong, P, 34 M 3:52:46  7105 Nassar, T, 28 F 3:52:46  7106 Klie, J, 47 M 3:52:47  7107 Frey, V, 40 M 3:52:47  7108 Montesdeoca, R, 50 M 3:52:47  7109 Schmieder, D, 37 M 3:52:47  7110 Murphy, P, 36 M 3:52:48  7111 Schmerzler, R, 38 M 3:52:48  7112 Visscher, T, 22 F 3:52:48  7113 Henson, S, 32 F 3:52:49  7114 Abzeu, C, 34 M 3:52:49  7115 Reinfurt, D, 31 M 3:52:50  7116 Nygard, J, 32 M 3:52:50  7117 Ribeiro, G, 46 M 3:52:51  7118 Bixler, J, 41 M 3:52:51  7119 Kantor, M, 52 M 3:52:51  7120 Parish, M, 27 M 3:52:52  7121 Winkfield, B, 32 F 3:52:52  7122 Smedsrud, J, 44 M 3:52:52  7123 Fasti, J, 35 M 3:52:52  7124 Byrne, B, 50 F 3:52:53  7125 Martin, B, 30 M 3:52:53  7126 North, J, 40 M 3:52:53  7127 Kalkut, P, 59 M 3:52:53  7128 Soerensen, P, 31 M 3:52:53  7129 Farber, D, 35 F 3:52:54  7130 Zoeller, W, 33 M 3:52:54  7131 Yzaguirre, J, 51 M 3:52:54  7132 Melville, G, 46 M 3: