In [19]:
import regex as re
import nltk
import pandas as pd

In [20]:
def tokenize(text):
    """Return the list of tokens found in ``text``.

    A token is a run of word characters and hyphens that contains at least
    one Unicode letter, so purely numeric runs such as ``2020`` are skipped
    and an apostrophe splits a word in two (``Let's`` -> ``Let``, ``s``).

    Relies on the third-party ``regex`` package (imported as ``re``) for the
    Unicode letter property class used in the pattern.
    """
    token_pattern = r'[\w-]*\p{L}[\w-]*'
    return [match.group() for match in re.finditer(token_pattern, text)]

In [21]:
# Sample sentence containing an apostrophe, a hyphenated name and a bare number.
text = "Let's defeat SaRs-CoV-2 together in 2020!"

print(text)
# Show the tokens on a single line, separated by '|'.
print(*tokenize(text), sep="|")

Let's defeat SaRs-CoV-2 together in 2020!
Let|s|defeat|SaRs-CoV-2|together|in


In [22]:
# Echo the imported module object so that its repr is shown as the cell output.
nltk

<module 'nltk' from 'C:\\Python3\\Lib\\site-packages\\nltk\\__init__.py'>

In [23]:
# Stop words (determiners, auxiliary verbs, pronouns, adverbs, ...) carry
# little information, and their high frequencies obscure more useful insights.

stopwords = set(nltk.corpus.stopwords.words('english'))
# Preview the first ten entries in alphabetical order.
sorted(stopwords)[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [24]:
def getstopwords(stopwords=None, include_stopwords=None, exclude_stopwords=None):
    """Build a stop-word set from a base list plus custom additions and removals.

    Parameters
    ----------
    stopwords : iterable of str, optional
        Base stop words. When omitted, NLTK's English stop-word list is
        loaded at call time (not at function-definition time).
    include_stopwords : iterable of str, optional
        Extra words to add to the base list.
    exclude_stopwords : iterable of str, optional
        Words to remove from the result, whether they come from the base
        list or from ``include_stopwords``.

    Returns
    -------
    set of str
        ``(stopwords | include_stopwords) - exclude_stopwords``.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    additions = set(include_stopwords) if include_stopwords is not None else set()
    removals = set(exclude_stopwords) if exclude_stopwords is not None else set()
    # Parentheses are required: `-` binds tighter than `|`, so without them
    # the exclusions would only be subtracted from the additions.
    return (set(stopwords) | additions) - removals

In [31]:
def remove_stop(tokens):
    """Return the tokens whose lower-cased form is not in the global ``stopwords``.

    The original order and casing of the surviving tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

In [32]:
# Ordered preprocessing steps; each one receives the previous step's output.
pipeline = [
    str.lower,    # normalise case on the raw string
    tokenize,     # string -> list of tokens
    remove_stop,  # drop tokens found in `stopwords`
]

In [33]:
# Rebind the module-level `stopwords` (the name remove_stop reads) to the
# set returned by getstopwords() with its default arguments.
stopwords = getstopwords()
# Peek at five entries; a set has no defined order, so which five appear is arbitrary.
list(stopwords)[:5]

["shouldn't", 'about', 'as', "i'd", "it's"]

In [34]:
def prepare(text, pipeline):
    """Feed ``text`` through every callable in ``pipeline`` and return the result.

    Each step is called with the value produced by the step before it; the
    first step receives ``text`` itself. With an empty pipeline the input is
    returned unchanged.
    """
    result = text
    for step in pipeline:
        result = step(result)
    return result

In [35]:
# Load the UN general debates CSV; the path is relative to the working directory.
df = pd.read_csv('data/un-general-debates-blueprint.csv')
# Show two randomly chosen rows; no random_state is given, so they differ between runs.
df.sample(2)

Unnamed: 0,session,year,country,country_name,speaker,position,text
7305,69,2014,VEN,"Venezuela, Bolivarian Republic of",Nicolás Maduro Moros,President,"At \nthe outset, as this is the first time tha..."
3677,50,1995,MLT,Malta,Mr. de Marco,Deputy Prime Minister,It is with great pleasure that\nI congratulate...


In [38]:
# Run each text through the preprocessing pipeline and store the resulting token list.
df['tokens'] = df['text'].apply(lambda speech: prepare(speech, pipeline=pipeline))

In [39]:
# Preview the first rows to check the new `tokens` column.
# NOTE(review): the recorded output also shows a `prepared_text` column that no
# visible cell creates, and execution counts jump from 35 to 38 - presumably a
# removed cell added it; confirm with Restart Kernel -> Run All.
df.head()

Unnamed: 0,session,year,country,country_name,speaker,position,text,prepared_text,tokens
0,25,1970,ALB,Albania,Mr. NAS,,33: May I first convey to our President the co...,"[may, first, convey, president, congratulation...","[may, first, convey, president, congratulation..."
1,25,1970,ARG,Argentina,Mr. DE PABLO PARDO,,177.\t : It is a fortunate coincidence that pr...,"[fortunate, coincidence, precisely, time, unit...","[fortunate, coincidence, precisely, time, unit..."
2,25,1970,AUS,Australia,Mr. McMAHON,,100.\t It is a pleasure for me to extend to y...,"[pleasure, extend, mr, president, warmest, con...","[pleasure, extend, mr, president, warmest, con..."
3,25,1970,AUT,Austria,Mr. KIRCHSCHLAEGER,,155.\t May I begin by expressing to Ambassado...,"[may, begin, expressing, ambassador, hambro, b...","[may, begin, expressing, ambassador, hambro, b..."
4,25,1970,BEL,Belgium,Mr. HARMEL,,"176. No doubt each of us, before coming up to ...","[doubt, us, coming, rostrum, wonders, usefulne...","[doubt, us, coming, rostrum, wonders, usefulne..."


In [40]:
# Store the number of tokens in each row's token list.
df['num_tokens'] = df['tokens'].apply(len)

In [41]:
# Preview the first rows to check the new `num_tokens` column.
df.head()

Unnamed: 0,session,year,country,country_name,speaker,position,text,prepared_text,tokens,num_tokens
0,25,1970,ALB,Albania,Mr. NAS,,33: May I first convey to our President the co...,"[may, first, convey, president, congratulation...","[may, first, convey, president, congratulation...",4092
1,25,1970,ARG,Argentina,Mr. DE PABLO PARDO,,177.\t : It is a fortunate coincidence that pr...,"[fortunate, coincidence, precisely, time, unit...","[fortunate, coincidence, precisely, time, unit...",2341
2,25,1970,AUS,Australia,Mr. McMAHON,,100.\t It is a pleasure for me to extend to y...,"[pleasure, extend, mr, president, warmest, con...","[pleasure, extend, mr, president, warmest, con...",2575
3,25,1970,AUT,Austria,Mr. KIRCHSCHLAEGER,,155.\t May I begin by expressing to Ambassado...,"[may, begin, expressing, ambassador, hambro, b...","[may, begin, expressing, ambassador, hambro, b...",2166
4,25,1970,BEL,Belgium,Mr. HARMEL,,"176. No doubt each of us, before coming up to ...","[doubt, us, coming, rostrum, wonders, usefulne...","[doubt, us, coming, rostrum, wonders, usefulne...",2064
