In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTagger
from nltk.tokenize import word_tokenize
from nltk.tokenize import treebank
import yaml
import pandas as pd
import re
from yaml import safe_load
from sqlalchemy.dialects import postgresql
from sqlalchemy import Integer, Numeric, String, DateTime, create_engine
import psycopg2
from nltk.tokenize import TreebankWordTokenizer
twt = TreebankWordTokenizer()
import random
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
import os
from dotenv import load_dotenv
load_dotenv()

In [505]:
pd.set_option('max_colwidth', 1000)

In [535]:
def make_unicode(inp):
    if type(inp) != unicode:
        inp =  inp.decode('utf-8')
    return inp

def kw_query(query_word):
    """
    Grab a random bit of text that references your keyword query
    """
    
    connection = psycopg2.connect(user=user,
                              password=password,
                              host=host,
                              port=port,
                              database=database)
    cursor = connection.cursor()
    
    sql = f'''SELECT title FROM articles WHERE title ~ '\y{query_word.replace("'","")}\y' limit 1'''

    cursor.execute(sql)
    # print("Selecting rows from mobile table using cursor.fetchall")
    result = cursor.fetchone()
    connection.close()
    # print(type(result))
    # print(result)
    # print(len(result))
    if result != None:
        return result[0]
    else:
        return None

def flatten_extend(matrix):
     flat_list = []
     for row in matrix:
         flat_list.extend(row)
     return flat_list

def annotate(input_sentence):
    """
    1. REGEX search the input text for country names, getting the appropriate span-match
    2. Word tokenize, and span tokenize
    3. Compare the span-tokenized spans to the regex spans
    4. Poop out annotated docs
    """
    ## Regex search for country names
    regex_hits = {}
    for match in regex_query.finditer(input_sentence):
        regex_hits[(match.start(), match.end())] = match.group()

    ## Tokenization
    tokens = [x for x in twt.tokenize(input_sentence)]
    token_spans = [x for x in twt.span_tokenize(input_sentence)]

    ## Stop early if there's no country mention
    if len(regex_hits)<1:
        return [(token, 'o') for token in tokens]
    
    ## Compare the REGEX and Tokenized spans    
    xyz = []
    for token_span in token_spans:
        for regex_span in regex_hits.keys():
            if (token_span[0] >= regex_span[0]) & (token_span[1] <= regex_span[1]):
                xyz.append('Country')
            else:
                xyz.append("o")
    return list(zip(tokens, xyz))

def get_random_titles(limit=25):
    connection = psycopg2.connect(user=user,
                              password=password,
                              host=host,
                              port=port,
                              database=database)
    cursor = connection.cursor()
    
    sql = f'''SELECT title FROM articles ORDER BY random() LIMIT {limit};'''
    
    cursor.execute(sql)
    # print("Selecting rows from mobile table using cursor.fetchall")
    result = cursor.fetchall()
    connection.close()

    return result

from nltk.tag.hmm import HiddenMarkovModelTagger

def train_hmm(train, dev):
    """
    Train an HMM tagger on the training set provided
    """
    
    hmm_tagger = HiddenMarkovModelTagger.train(train, dev)
    
    return hmm_tagger

## Connect to DB

In [5]:
database = os.getenv('DATABASE')
host = os.getenv('DBHOST')
port = os.getenv('DBPORT')
user = os.getenv('DBUSER')
password = os.getenv('DBPASSWORD')

In [8]:
conn_string = fr'postgresql+psycopg2://{user}:{password}@{host}/{database}'
db = create_engine(conn_string)
conn = db.connect()

## Load and munge data

In [11]:
with open("en.yml", "r") as f:
    try:
        countries = (yaml.safe_load(f))
    except yaml.YAMLError as exc:
        print(exc)

In [32]:
leader_df = pd.read_csv(r"world-leaders-social-media.csv")

In [22]:
no_aliases = []
for country in countries:
    if 'aliases' in country:
        pass
    else:
        no_aliases.append(country)

In [24]:
for x in no_aliases:
    countries[x]['aliases'] = []

In [34]:
df = df.T

In [107]:
df['example'] = df['iso_name'].progress_apply(kw_query)

100%|████████████████████████████████████████████████████████████████████████████████| 249/249 [03:40<00:00,  1.13it/s]


In [146]:
df = df[df['example'].str.len()>0]

In [None]:
demonym_df = pd.read_excel("demonyms.xlsx")

adj = flatten_extend([i.split(', ') for i in demonym_df['Adjectivals'].tolist()])
dem = flatten_extend([i.split(', ') for i in demonym_df['Demonyms'].tolist()])

In [707]:
country_lookup = df['iso_name'].to_list()
country_lookup.extend(df['short'].to_list())

country_lookup.extend(flatten_extend(df[df['aliases'].notna()]['aliases'].to_list()))
country_lookup.extend(adj)
country_lookup.extend(dem)

bonus_context = ['US', 'U.S.', 'USA', 'U.S.A.' 'England', 'Herzegovina', 'British', 'Kurd', 'Kurdish', 'Washington', 'Beijing', 'Moscow', 'London',
                'Gaza', 'Palestine', 'West Bank', 'Levant', 'NKorea', 'SKorea', "Uyghurs","Uighurs", "Uygurs", "Uigurs","Uyghur","Uighur", "Uygur", 
                 "Uigur", "Native Americans", "Native American", 'Hezbollah', 'ISIS', 'ISIL', 'Daesh', 'Obama', 'Biden', 'Mike Pence', 'Donald Trump',
                'Putin', 'Xi Jinping', 'Syria', 'Syrian', 'Zelensky']
country_lookup.extend(bonus_context)

country_lookup = [x.strip() for x in country_lookup]

print("Before dedup:",len(country_lookup))
country_lookup = list(set(country_lookup))
print("After dedup:",len(country_lookup))

Before dedup: 1151
After dedup: 877


In [708]:
## Todo: escape periods. For some reason this is really hard
country_lookup = [x.replace('.', r'.') for x in country_lookup]
country_lookup.sort()
country_query = [fr"\b{x}\b" for x in country_lookup]
regex_query = re.compile("|".join(country_query))

## Apply annotations/Make training data

In [709]:
df['annotated'] = df['example'].progress_apply(annotate)
annotated = df['annotated'].to_list()
train_data = get_random_titles(50000)
df3 = pd.DataFrame(train_data, columns=['text'])
df3 = df3.drop_duplicates('text')
print(len(df3))
df3['annotated'] = df3['text'].progress_apply(annotate)
df3 = df3[df3['annotated'].astype(str).str.contains('Country')]
annotated2 = df3['annotated'].to_list()
annotated.extend(annotated2)

100%|██████████████████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 4268.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['annotated'] = df['example'].progress_apply(annotate)


47745


100%|██████████████████████████████████████████████████████████████████████████| 47745/47745 [00:11<00:00, 4242.88it/s]


In [710]:
len(annotated)

11440

In [711]:
train, dev = train_test_split(annotated, test_size=0.2, random_state=42)

dataset = {"train": train, "dev":dev}

## Train Hidden Markov Model

In [712]:
hmm_tagger = train_hmm(dataset["train"], dataset["dev"])

accuracy over 33398 tokens: 95.57


## Validate

In [713]:
testoo = get_random_titles(500)
df4 = pd.DataFrame(testoo, columns=['text'])

In [714]:
df4 = df4.drop_duplicates('text')
len(df4)

493

In [715]:
df4['annotate'] = df4['text'].progress_apply(annotate)
df4['hmm'] = df4['text'].progress_apply(lambda x: hmm_tagger.tag(twt.tokenize(x)))

100%|██████████████████████████████████████████████████████████████████████████████| 493/493 [00:00<00:00, 4178.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 493/493 [00:01<00:00, 362.91it/s]


In [716]:
len(df4[df4['annotate']!=df4['hmm']])

141

In [717]:
df4[df4['annotate']!=df4['hmm']].sample(5)

Unnamed: 0,text,annotate,hmm
351,Potato Chip-Chocolate Chip Cookies Recipe,"[(Potato, o), (Chip-Chocolate, o), (Chip, o), (Cookies, o), (Recipe, o)]","[(Potato, Country), (Chip-Chocolate, o), (Chip, o), (Cookies, o), (Recipe, o)]"
175,"Alicia Navarro, 18, has now fled with the 36-year-old man she was living with at a Montana apartment - days after autistic teen reappeared four years after vanishing","[(Alicia, o), (Navarro, o), (,, o), (18, o), (,, o), (has, o), (now, o), (fled, o), (with, o), (the, o), (36-year-old, o), (man, o), (she, o), (was, o), (living, o), (with, o), (at, o), (a, o), (Montana, o), (apartment, o), (-, o), (days, o), (after, o), (autistic, o), (teen, o), (reappeared, o), (four, o), (years, o), (after, o), (vanishing, o)]","[(Alicia, Country), (Navarro, o), (,, o), (18, o), (,, o), (has, o), (now, o), (fled, o), (with, o), (the, o), (36-year-old, o), (man, o), (she, o), (was, o), (living, o), (with, o), (at, o), (a, o), (Montana, o), (apartment, o), (-, o), (days, o), (after, o), (autistic, o), (teen, o), (reappeared, o), (four, o), (years, o), (after, o), (vanishing, o)]"
168,Myles Goodwyn Dies At 75,"[(Myles, o), (Goodwyn, o), (Dies, o), (At, o), (75, o)]","[(Myles, Country), (Goodwyn, o), (Dies, o), (At, o), (75, o)]"
216,"Sleep Quality, Not Quantity, Could Be More Important: Study","[(Sleep, o), (Quality, o), (,, o), (Not, o), (Quantity, o), (,, o), (Could, o), (Be, o), (More, o), (Important, o), (:, o), (Study, o)]","[(Sleep, Country), (Quality, o), (,, o), (Not, o), (Quantity, o), (,, o), (Could, o), (Be, o), (More, o), (Important, o), (:, o), (Study, o)]"
369,Ellie Goulding chats for 'ages' with ex Greg James at the BRITs - 11 years after their split,"[(Ellie, o), (Goulding, o), (chats, o), (for, o), ('ages, o), (', o), (with, o), (ex, o), (Greg, o), (James, o), (at, o), (the, o), (BRITs, o), (-, o), (11, o), (years, o), (after, o), (their, o), (split, o)]","[(Ellie, Country), (Goulding, o), (chats, o), (for, o), ('ages, o), (', o), (with, o), (ex, o), (Greg, o), (James, o), (at, o), (the, o), (BRITs, o), (-, o), (11, o), (years, o), (after, o), (their, o), (split, o)]"


In [718]:
df4.sample(5)

Unnamed: 0,text,annotate,hmm
476,AOC's bid to stop Jim Jordan becoming speaker: Squad member says New York Republicans won't back a man who voted to overturn the election or supports an abortion ban because they'll be voted out next,"[(AOC, o), ('s, o), (bid, o), (to, o), (stop, o), (Jim, o), (Jordan, Country), (becoming, o), (speaker, o), (:, o), (Squad, o), (member, o), (says, o), (New, o), (York, o), (Republicans, o), (wo, o), (n't, o), (back, o), (a, o), (man, o), (who, o), (voted, o), (to, o), (overturn, o), (the, o), (election, o), (or, o), (supports, o), (an, o), (abortion, o), (ban, o), (because, o), (they, o), ('ll, o), (be, o), (voted, o), (out, o), (next, o)]","[(AOC, o), ('s, o), (bid, o), (to, o), (stop, o), (Jim, o), (Jordan, Country), (becoming, o), (speaker, o), (:, o), (Squad, o), (member, o), (says, o), (New, o), (York, o), (Republicans, o), (wo, o), (n't, o), (back, o), (a, o), (man, o), (who, o), (voted, o), (to, o), (overturn, o), (the, o), (election, o), (or, o), (supports, o), (an, o), (abortion, o), (ban, o), (because, o), (they, o), ('ll, o), (be, o), (voted, o), (out, o), (next, o)]"
49,US mulls sending Ukraine cluster munitions in latest weapon opinion reversal,"[(US, Country), (mulls, o), (sending, o), (Ukraine, o), (cluster, o), (munitions, o), (in, o), (latest, Country), (weapon, o), (opinion, o), (reversal, o)]","[(US, Country), (mulls, o), (sending, o), (Ukraine, o), (cluster, o), (munitions, o), (in, o), (latest, o), (weapon, o), (opinion, o), (reversal, o)]"
442,"IDF prepares for possible escalation: 'Hamas chose to go to war with us, we will win'","[(IDF, o), (prepares, o), (for, o), (possible, o), (escalation, o), (:, o), ('Hamas, o), (chose, o), (to, o), (go, o), (to, o), (war, o), (with, o), (us, o), (,, o), (we, o), (will, o), (win, o), (', o)]","[(IDF, o), (prepares, o), (for, o), (possible, o), (escalation, o), (:, o), ('Hamas, o), (chose, o), (to, o), (go, o), (to, o), (war, o), (with, o), (us, o), (,, o), (we, o), (will, o), (win, o), (', o)]"
117,Five key takeaways from the Berlin Film Festival,"[(Five, o), (key, o), (takeaways, o), (from, o), (the, o), (Berlin, o), (Film, o), (Festival, o)]","[(Five, o), (key, o), (takeaways, o), (from, o), (the, o), (Berlin, o), (Film, o), (Festival, o)]"
110,Global impact: 5 ways war in Ukraine has changed the world,"[(Global, o), (impact, o), (:, o), (5, o), (ways, o), (war, o), (in, o), (Ukraine, Country), (has, o), (changed, o), (the, o), (world, o)]","[(Global, o), (impact, o), (:, o), (5, o), (ways, o), (war, o), (in, o), (Ukraine, o), (has, o), (changed, o), (the, o), (world, o)]"


## Explore how it broke
- Periods in acronyms are a bear to deal with in regex

In [663]:
hmm_tagger.tag(twt.tokenize("U.S. officials found eating cheezits under the bleachers"))

[('U.S.', 'o'),
 ('officials', 'o'),
 ('found', 'o'),
 ('eating', 'o'),
 ('cheezits', 'o'),
 ('under', 'o'),
 ('the', 'o'),
 ('bleachers', 'o')]

In [664]:
annotate("U.S. officials found eating cheezits under the bleachers")

[('U.S.', 'o'),
 ('officials', 'o'),
 ('found', 'o'),
 ('eating', 'o'),
 ('cheezits', 'o'),
 ('under', 'o'),
 ('the', 'o'),
 ('bleachers', 'o')]

In [666]:
hmm_tagger.tag(twt.tokenize("Biden and Putin had lunch in Moscow, waiting for a call from Egypt"))

[('Biden', 'Country'),
 ('and', 'o'),
 ('Putin', 'o'),
 ('had', 'o'),
 ('lunch', 'o'),
 ('in', 'o'),
 ('Moscow', 'o'),
 (',', 'o'),
 ('waiting', 'o'),
 ('for', 'o'),
 ('a', 'o'),
 ('call', 'o'),
 ('from', 'o'),
 ('Egypt', 'o')]

In [719]:
annotate("Biden and Putin had lunch in Moscow, waiting for a call from Egypt")

[('Biden', 'Country'),
 ('and', 'o'),
 ('Putin', 'o'),
 ('had', 'o'),
 ('lunch', 'o'),
 ('in', 'o'),
 ('Moscow', 'o'),
 (',', 'o'),
 ('waiting', 'o'),
 ('for', 'Country'),
 ('a', 'o'),
 ('call', 'o'),
 ('from', 'o'),
 ('Egypt', 'o')]