# Drudge entities analysis

By Ben Welsh

A draft analysis of the top words in headlines. The notebook is titled for the Drudge Report; the extract loaded below covers links from U.S. right-wing news sites.

## Import

Python tools

In [1]:
import typing
import pandas as pd
from collections import Counter

Formatting

In [2]:
from rich import print
from rich.progress import track

Natural language processing

In [3]:
import spacy

In [4]:
# !pipenv run python -m spacy download en_core_web_lg

In [5]:
nlp = spacy.load('en_core_web_lg')

## Extract

Read in data

In [12]:
# Load the hyperlink extract, parsing the first-seen date column as datetimes
links_csv_path = "../extracts/csv/us-right-wing-hyperlinks-analysis.csv"
link_df = pd.read_csv(links_csv_path, parse_dates=["earliest_date"])

In [13]:
link_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21389 entries, 0 to 21388
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   handle         21389 non-null  object        
 1   text           20138 non-null  object        
 2   url            21389 non-null  object        
 3   earliest_date  21389 non-null  datetime64[ns]
 4   is_story       21389 non-null  bool          
 5   domain         21389 non-null  object        
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 856.5+ KB


In [14]:
link_df.earliest_date.min()

Timestamp('2022-11-02 00:00:00')

In [15]:
link_df.earliest_date.max()

Timestamp('2022-11-08 00:00:00')

In [16]:
link_df.head()

Unnamed: 0,handle,text,url,earliest_date,is_story,domain
0,DailyCaller,,/2022/11/08/cole-hauser-rip-wheeler-death-futu...,2022-11-08,False,.
1,DailyCaller,,/2022/11/08/donald-trump-jd-vance-ohio-midterm...,2022-11-08,False,.
2,DailyCaller,,/2022/11/08/florida-prepared-subtropical-storm...,2022-11-08,False,.
3,DailyCaller,,/2022/11/08/lab-leak-theory-democrats-investig...,2022-11-08,False,.
4,DailyCaller,,/2022/11/08/mistake-sylvester-stallone-almost-...,2022-11-08,False,.


## Transform

Filter down to links that are flagged as stories and have headline text

In [20]:
# Keep rows flagged as stories that also carry headline text.
# The copy lets later cells assign columns without touching link_df.
has_headline = link_df["text"].notna()
story_df = link_df.loc[link_df["is_story"] & has_headline].copy()

Cut ellipses (`...`, or any run of two or more periods) out of the headline text

In [21]:
# Drop any run of two or more periods from the headline text
ellipsis_pattern = r"\.{2,}"
story_df["text"] = story_df["text"].str.replace(ellipsis_pattern, "", regex=True)

In [45]:
story_df.earliest_date.value_counts()

2022-11-02    1645
2022-11-08    1533
2022-11-04    1050
2022-11-03     970
2022-11-07     730
2022-11-06     611
2022-11-05     594
Name: earliest_date, dtype: int64

In [85]:
# Trim leading and trailing whitespace from every URL
story_df["url"] = story_df["url"].str.strip()

In [86]:
# Isolate the stories whose earliest date is 2022-11-08
is_target_day = story_df["earliest_date"] == '2022-11-08'
today_df = story_df.loc[is_target_day]

In [87]:
today_df.to_csv("us-right-wing-election-headlines.csv")

Extract all unique headlines from the Nov. 8 stories

In [88]:
# Unique headlines from today_df, in sorted order
headline_list = sorted(today_df["text"].unique())

## Analyze

Pull out all of the meaningful words

In [41]:
def get_lemma(headline: str) -> typing.List[typing.Dict[str, str]]:
    """Parse all of the words we want to keep in the headline.

    Args:
        headline: The headline text to run through the spaCy pipeline.

    Returns:
        One dict per kept token, with the keys ``headline``, ``word``,
        ``lemma`` and ``part_of_speech``. Stop words, punctuation and
        digit-only tokens are left out. ``word`` and ``lemma`` are upper-cased.
    """
    # Read it into our NLP pipeline
    doc = nlp(headline)

    # Drop stop words, punctuation and digits in a single pass
    token_list = [
        t for t in doc
        if not (t.is_stop or t.is_punct or t.is_digit)
    ]

    # Trim each token down to only the stuff we want to keep and pass it back
    return [
        dict(
            headline=headline,
            word=t.text.upper(),
            lemma=t.lemma_.upper(),
            part_of_speech=t.pos_,
        )
        for t in token_list
    ]

In [42]:
# Run every headline through the parser and pool the per-token records
word_list = []
for headline_text in track(headline_list):
    word_list.extend(get_lemma(headline_text))

Output()

In [43]:
word_df = pd.DataFrame(word_list)

In [46]:
word_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7054 entries, 0 to 7053
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   headline        7054 non-null   object
 1   word            7054 non-null   object
 2   lemma           7054 non-null   object
 3   part_of_speech  7054 non-null   object
dtypes: object(4)
memory usage: 220.6+ KB


In [47]:
word_df.head()

Unnamed: 0,headline,word,lemma,part_of_speech
0,"""Car Vending Machine"" Company Carvana's Stock ...",CAR,CAR,NOUN
1,"""Car Vending Machine"" Company Carvana's Stock ...",VENDING,VEND,VERB
2,"""Car Vending Machine"" Company Carvana's Stock ...",MACHINE,MACHINE,NOUN
3,"""Car Vending Machine"" Company Carvana's Stock ...",COMPANY,COMPANY,PROPN
4,"""Car Vending Machine"" Company Carvana's Stock ...",CARVANA,CARVANA,PROPN


Remove our extra stop words, as well as all symbols and verbs

In [75]:
# Extra lemmas to filter out, on top of the stop words get_lemma already drops.
# Candidates that are currently switched off:
#   NEW, MAN, WOMAN, YEAR, DAY, MILLION, HIGH, BIG, RECORD, HOME, WORLD,
#   STATE, TIME, CASE, LIFE, AMERICAN, INSIDE, EX, MAR, HIT, LAGO, RISE,
#   AMID, WARNS, RATE, SHOW, ATTACK, DEAD, SET
stop_list = ["COMMENTS", "COMMENT"]

In [76]:
# Exclude symbols, verbs and any lemma on our extra stop list
is_excluded_pos = word_df["part_of_speech"].isin(["SYM", "VERB"])
is_stop_lemma = word_df["lemma"].isin(stop_list)
qualified_df = word_df[~(is_excluded_pos | is_stop_lemma)]

Calculate the 25 most common words

In [77]:
# Tally how often each lemma appears, then keep the 25 most frequent
lemma_counts = qualified_df.groupby("lemma").size().rename("n").reset_index()
top_words = lemma_counts.sort_values("n", ascending=False).head(25)

Get the top verb used with each word

In [78]:
def get_headlines(lemma: str) -> typing.List:
    """Return the sorted, de-duplicated headlines in which the lemma appears."""
    matching_headlines = qualified_df.loc[qualified_df["lemma"] == lemma, "headline"]
    return sorted(matching_headlines.unique())

In [79]:
def get_top_verb(lemma: str) -> typing.Optional[str]:
    """Get the top verb in the provided lemma's headline set.

    Args:
        lemma: The upper-cased lemma whose headlines should be scanned.

    Returns:
        The most frequent upper-cased verb lemma across the headlines returned
        by ``get_headlines(lemma)``, or ``None`` when no verb survives the
        stop-word filter (for example, when the lemma has no headlines).
    """
    # Set our stop words for the verbs. The list mixes inflected forms
    # ("SAYS") with base forms ("SAY"), so tokens are checked against both
    # their surface text and their lemma below.
    # NOTE(review): "LULA" and "ELON" look like names that get tagged as verbs — confirm.
    stop_verbs = {"SAYS", "HAS", "GETS", "GET", "LULA", "ELON", "SAY", "HAVE"}
    if lemma == "COVID":
        stop_verbs.add("TESTS")
    if lemma == "MUSK":
        stop_verbs.add("SOCIAL")

    # Loop through all of the headlines, counting verbs as we go
    verb_counter = Counter()
    for headline in get_headlines(lemma):
        # Parse the headline again with NLP
        doc = nlp(headline)

        for token in doc:
            if token.pos_ != "VERB":
                continue
            verb = token.lemma_.upper()
            # Cut the stop words, whether listed by surface form or by lemma
            if verb in stop_verbs or token.text.upper() in stop_verbs:
                continue
            verb_counter[verb] += 1

    # Nothing left to rank; avoid indexing into an empty result
    if not verb_counter:
        return None

    # Pull the most common one
    return verb_counter.most_common(1)[0][0]

In [80]:
top_words['top_verb'] = top_words.lemma.apply(get_top_verb)

In [81]:
top_words.head(25)

Unnamed: 0,lemma,n,top_verb
806,ELECTION,86,KNOW
278,BIDEN,57,SHUT
1549,MIDTERM,51,VOTE
650,DAY,42,KNOW
2428,TRUMP,36,MAKE
689,DEMOCRATS,31,LOSE
2519,VOTE,29,COUNT
1655,NEW,28,VOTE
229,BALLOT,27,COUNT
1053,GOP,27,VOTE


In [82]:
qualified_df[qualified_df.lemma == "COMMENT"]

Unnamed: 0,headline,word,lemma,part_of_speech


Get the timeseries for our top words

In [30]:
# Bounds of the observation window, taken from the story dates
story_dates = story_df["earliest_date"]
min_date = story_dates.min()
max_date = story_dates.max()

In [31]:
def get_timeseries(lemma: str) -> typing.List:
    """Pull the day to day timeseries for the provided word.

    Args:
        lemma: The upper-cased lemma to count.

    Returns:
        A list with one dict per calendar day between the module-level
        ``min_date`` and ``max_date`` (inclusive). Each dict holds ``date``
        as a ``YYYY-MM-DD`` string, ``n`` (the count for that day, 0 when the
        lemma did not appear) and ``7_day_rolling_average``. The rolling
        average uses a full 7-row window, so with pandas' default
        ``min_periods`` it is NaN for the first six days.
    """
    # Pair each headline with the date(s) it was first seen
    headline_dates = story_df[['earliest_date', 'text']].rename(columns={"text": "headline"})

    # Count the top words by day
    df = (
        qualified_df[qualified_df.lemma == lemma]
            .merge(headline_dates, on="headline")
            .groupby("earliest_date")
            .size()
            .rename("n")
            .reset_index()
            .rename(columns={"earliest_date": "date"})
            .set_index("date")
    )

    # Fill in days we're missing
    date_index = pd.date_range(
        min_date,
        max_date,
        freq="D",
    )
    backfilled_df = df.reindex(date_index)
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on the attribute: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    backfilled_df["n"] = backfilled_df["n"].fillna(0)

    # Calculate the 7-day rolling average
    backfilled_df['7_day_rolling_average'] = backfilled_df["n"].rolling(7).mean()

    # Convert it to a dict list
    dict_list = backfilled_df.reset_index().rename(columns={"index": "date"}).to_dict(orient="records")

    # Convert our dates to strings
    for d in dict_list:
        d['date'] = d['date'].strftime("%Y-%m-%d")

    # Pass it out
    return dict_list

In [32]:
top_words['timeseries'] = top_words.lemma.apply(get_timeseries)

In [33]:
top_words.head()

Unnamed: 0,lemma,n,top_verb,timeseries
7449,TRUMP,151,TAKE,"[{'date': '2022-08-09', 'n': 5.0, '7_day_rolli..."
750,BIDEN,104,WANT,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
2307,ELECTION,100,VOTE,"[{'date': '2022-08-09', 'n': 0.0, '7_day_rolli..."
7806,WAR,87,GROW,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
5547,PUTIN,74,BLOW,"[{'date': '2022-08-09', 'n': 1.0, '7_day_rolli..."


## Validation

Spot-check the headlines behind any words we're curious about

In [34]:
get_headlines("PUTIN")

["'MUSK TRANSMITTING MESSAGE FOR PUTIN'",
 "ANOTHER PUTIN CRONY DIES AFTER 'FALLING FROM BOAT'",
 'BIDEN WARNS PUTIN AGAINST USING NUCLEAR OR CHEMICAL WEAPONS',
 'CAR-BOMB KILLING SOWS UNEASE AMONG PUTIN CHEERLEADERS',
 "DAUGHTER OF 'PUTIN'S BRAIN' KILLED IN CAR BOMB",
 "DESPERATE PUTIN'S DOUBLE TROUBLE",
 "DID PUTIN'S FROGMEN BLOW UP EUROPE'S GAS SUPPLIES?",
 "HOW PUTIN PUSHING ARMY BOSSES THROUGH 'MEAT GRINDER' OF DEATH",
 'IN DC, PUTIN NUKE THREATS STIR GROWING ALARM',
 'LEAKED SPY DOCS CLAIM PUTIN TAKING SECRET COCKTAIL OF DRUGS',
 "LEAKED SPY DOCS SUGGEST PUTIN DOES HAVE PARKINSON'S, CANCER",
 "MEET PUTIN'S INNER CIRCLE OF EVIL",
 'MUSK APPEASEMENT OF PUTIN AND CHINA STOKES FEARS OF NEW TWITTER POLICIES',
 'MUSK DENIES HE TALKED TO PUTIN AHEAD OF CONTROVERSIAL TWEET',
 "ODESA DEFIANT. IT'S ALSO PUTIN'S ULTIMATE TARGET",
 'POLAND ASKS USA TO HOST NUKES AMID GROWING PUTIN FEARS',
 "PUTIN 'HAS GIVEN ORDER TO DEPLOY NUKES,' CLAIMS KREMLIN INSIDER",
 "PUTIN 'PLANS TO BLOW UP MAJOR DAM'