#### Part-of-speech tagging & named entity recognition task

In [1]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spacy import displacy, tokenizer
from IPython.display import HTML, display
import re
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the BBC news dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='bbc_news.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [4]:
# Column dtypes and non-null counts, to confirm `title` has no missing values
# before the text-cleaning steps below.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


#### We want to extract the titles of the news articles and store them separately

In [5]:
# Keep only the headline text, as a one-column DataFrame, so that each
# cleaning stage can be added next to it as a new column.
titles = data['title'].to_frame()
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


#### Cleaning data - converting title to lowercase

In [6]:
# Lower-case every headline; the original `title` column is left untouched.
titles = titles.assign(lowercase=titles['title'].str.lower())
titles.head()

Unnamed: 0,title,lowercase
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


#### Removing stopwords

In [7]:
# NLTK's English stop-word list. It keeps its original name and type (list);
# a set is built alongside it so each membership test is O(1) rather than a
# scan of the whole list for every word.
en_stopwords = stopwords.words('english')
_en_stopword_set = set(en_stopwords)


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated words that are stop words.

    Words are compared exactly as they appear after splitting on whitespace,
    so a word with punctuation attached (e.g. "work?") is only removed if that
    exact string is in the stop-word list.
    """
    return ' '.join(word for word in text.split() if word not in _en_stopword_set)


titles['no_stopwords'] = titles['lowercase'].apply(_drop_stopwords)

In [8]:
# Inspect the new `no_stopwords` column next to the earlier stages.
titles.head()

Unnamed: 0,title,lowercase,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds


#### Removing punctuation

In [9]:
# Strip every character that is neither a word character nor whitespace.
# The vectorized string accessor works on the single column directly instead
# of building a row object for each record via DataFrame.apply(axis=1).
# Note that hyphens are removed too, so "off-grid" becomes "offgrid".
titles['no_stopwords_no_punct'] = titles['no_stopwords'].str.replace(
    r"[^\w\s]", '', regex=True
)
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds


#### Tokenize

In [10]:
# Tokenize both the untouched headlines (for POS tagging / NER, which rely on
# casing and punctuation) and the cleaned text. Applying word_tokenize to the
# column itself avoids the per-row overhead of DataFrame.apply(axis=1).
titles['token_raw'] = titles['title'].apply(word_tokenize)
titles['token_clean'] = titles['no_stopwords_no_punct'].apply(word_tokenize)
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,token_clean
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco..."


#### Lemmatize

In [11]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list with each token passed through the WordNet lemmatizer.

    The lemmatizer is called without a part-of-speech argument, so it uses its
    default; in the preview below "years" becomes "year" while "reacts" is
    left unchanged.
    """
    return list(map(lemmatizer.lemmatize, tokens))


titles["token_clean_lemmatized"] = titles["token_clean"].map(_lemmatize_tokens)
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,token_clean,token_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


#### Creating lists for tokens

In [12]:
# Flatten the per-title token lists into one list each.
# A nested comprehension walks every token exactly once; sum(lists, []) copies
# the accumulated list on every addition and is quadratic in the token count.
token_raw_list = [token for tokens in titles['token_raw'] for token in tokens]
token_clean_list = [
    token for tokens in titles['token_clean_lemmatized'] for token in tokens
]

### POS tagging

In [13]:
# Load the spaCy pipeline named 'en_core_web_sm'; the model package must
# already be installed in the kernel's environment for this to succeed.
nlp = spacy.load('en_core_web_sm')

##### Create a spaCy doc from the raw (uncleaned) title tokens

In [14]:
# Join the raw tokens of every title into one space-separated string and run
# the pipeline over it, so the whole corpus is parsed as a single document.
# NOTE(review): because titles are concatenated without a separator, tags and
# entities may straddle headline boundaries (the NER output contains spans
# such as "England Russians") - confirm whether per-title docs are wanted.
raw_text = ' '.join(token_raw_list)
spacy_doc = nlp(raw_text)

In [15]:
# Empty frame with the two columns the POS table will hold:
# the token text and its part-of-speech tag.
pos_df = pd.DataFrame(columns=['token', 'pos_tag'])

In [16]:
# Build the token / POS-tag table in one pass.
# Collecting the pairs first and constructing the DataFrame once avoids
# calling pd.concat for every token, which re-copies the growing frame each
# time (quadratic in the number of tokens). An empty document still yields an
# empty frame with the same two columns.
pos_df = pd.DataFrame(
    [(token.text, token.pos_) for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

#### Most common tokens and their associated POS tags

In [17]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
pair_sizes = pos_df.groupby(['token', 'pos_tag']).size()
pos_df_counts = (
    pair_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
...,...,...,...
2304,crumbling,VERB,1
2305,crunch,PROPN,1
827,Jarrod,PROPN,1
826,Japanese,ADJ,1


#### Most common nouns

In [18]:
# Rows tagged NOUN; they keep the descending-count order of pos_df_counts.
is_noun = pos_df_counts['pos_tag'] == "NOUN"
nouns = pos_df_counts[is_noun]
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,35
3552,record,NOUN,15
3416,police,NOUN,14
4356,year,NOUN,14
4316,win,NOUN,14
...,...,...,...
2294,criticism,NOUN,1
2296,crocodile,NOUN,1
2297,crop,NOUN,1
2300,crown,NOUN,1


#### Most common verbs

In [19]:
# Rows tagged VERB; they keep the descending-count order of pos_df_counts.
is_verb = pos_df_counts['pos_tag'] == "VERB"
verbs = pos_df_counts[is_verb]
verbs

Unnamed: 0,token,pos_tag,counts
3687,says,VERB,30
9,',VERB,14
2670,found,VERB,13
4317,win,VERB,12
4324,wins,VERB,10
...,...,...,...
2324,dating,VERB,1
2329,dazzling,VERB,1
2293,criticised,VERB,1
2301,crowned,VERB,1


### Named entity recognition 

In [20]:
# List every entity span the pipeline found, followed by its label.
for ent in spacy_doc.ents:
    print(f"{ent.text} {ent.label_}")

Liz Truss PERSON
UK GPE
Rationing PRODUCT
superyachts CARDINAL
Russian NORP
70 years DATE
70 seconds TIME
Red Bull ORG
Formula 1 's PRODUCT
World Triathlon Championship Series EVENT
Flora Duffy PERSON
Georgia Taylor-Brown PERSON
Terry Hall PERSON
Post Office ORG
Fujitsu ORG
Horizon ORG
UK GPE
Six CARDINAL
Scotland GPE
'99 DATE
Ireland GPE
Liz Truss PERSON
Queen PERSON
5 % PERCENT
England Russians NORP
Week DATE
11-17 June 2022 DATE
Hong Kong GPE
25 years DATE
Nick Pope PERSON
Newcastle GPE
China GPE
Barcelona GPE
Real Madrid ORG
Clasico GPE
Kuenssberg GPE
Budget ORG
Nicola Bulley ORG
Christmas DATE
100-year-old DATE
Edinburgh GPE
three days DATE
Freya PERSON
600kg QUANTITY
Norway GPE
UK GPE
Ukrainian GPE
Nicola Bulley ORG
Don Bolduc PERSON
Democrats NORP
Republican NORP
Queen PERSON
17-year-old DATE
Isaac Babadi PERSON
Panenka ORG
Amtrak ORG
Three CARDINAL
Missouri GPE
four CARDINAL
Scottish NORP
the day DATE
Chris PERSON
Jr v Liam Smith PERSON
Ukraine GPE
Russian NORP
Root PRODUCT
Eng

In [22]:
# Render the entity highlighting; with jupyter=False the result is assigned to
# `html` rather than being displayed by displacy itself.
html = displacy.render(spacy_doc, style='ent', jupyter=False)
# Left disabled - to show the markup inline, run: display(HTML(html))

#### Extract the tokens and entity tags into a dataframe

In [23]:
# Build the entity / label table in one pass.
# `spacy_doc.ents` yields entity spans, so each row is one entity's text and
# its label. Constructing the DataFrame once replaces a pd.concat call per
# entity, which re-copied the growing frame every iteration.
# The missing-label guard from the original loop is kept, written as a plain
# truth test instead of an `is False` identity comparison.
ner_df = pd.DataFrame(
    [
        (ent.text, ent.label_)
        for ent in spacy_doc.ents
        if not pd.isna(ent.label_)
    ],
    columns=['token', 'ner_tag'],
)

In [26]:
# Preview the first extracted entities and their labels.
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


#### Entity frequency count

In [29]:
# Count how often each (entity text, label) pair occurs, most frequent first.
entity_sizes = ner_df.groupby(['token', 'ner_tag']).size()
ner_df_counts = (
    entity_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
ner_df_counts.head()

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19


#### Most common people

In [30]:
# Entities labelled PERSON, in the descending-count order of ner_df_counts.
# The labels come straight from the model, so some rows are mislabelled
# (e.g. "Covid" appears here).
is_person = ner_df_counts['ner_tag'] == "PERSON"
people = ner_df_counts[is_person]
people

Unnamed: 0,token,ner_tag,counts
257,Covid,PERSON,9
760,Queen,PERSON,8
757,Putin,PERSON,8
169,Boris Johnson,PERSON,6
563,Liz Truss,PERSON,6
...,...,...,...
405,Georgia Taylor-Brown,PERSON,1
406,Geraint Thomas,PERSON,1
409,Ghislaine Maxwell,PERSON,1
410,Gianluigi Lentini,PERSON,1


#### Most common places

In [31]:
# Entities labelled GPE, in the descending-count order of ner_df_counts.
is_place = ner_df_counts['ner_tag'] == "GPE"
places = ner_df_counts[is_place]
places

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
957,US,GPE,19
378,France,GPE,12
...,...,...,...
419,Great Britain 's,GPE,1
420,Greece,GPE,1
444,Honiton,GPE,1
390,GB,GPE,1


#### Most common products

In [35]:
# Entities labelled PRODUCT, in the descending-count order of ner_df_counts.
is_product = ner_df_counts['ner_tag'] == "PRODUCT"
products = ner_df_counts[is_product]
products

Unnamed: 0,token,ner_tag,counts
714,Oliviers 2022,PRODUCT,1
776,Rationing,PRODUCT,1
754,Pride,PRODUCT,1
608,Marvel,PRODUCT,1
987,WW2,PRODUCT,1
971,Ulvade,PRODUCT,1
806,Root,PRODUCT,1
212,Centrica,PRODUCT,1
225,Children,PRODUCT,1
244,Coleen,PRODUCT,1
