In [28]:
# !python -m spacy download en_core_web_sm

In [25]:
import pandas as pd
import spacy
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import spacy
 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AY7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('bbc_news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [3]:
df['raw_tokens'] = df['description'].apply(lambda s: word_tokenize(s))

In [4]:
df['raw_tokens'][0]

['With',
 'much',
 'of',
 'the',
 'UK',
 'enduring',
 'another',
 'period',
 'of',
 'hot',
 'weather',
 ',',
 'some',
 'workers',
 'will',
 'face',
 'very',
 'high',
 'temperatures',
 '.']

In [5]:
df['description_lower'] = df['description'].str.lower()

## Remove stop words:

In [66]:
stop_words = set(stopwords.words('english'))

df['description_lower_no_stopwords'] = df['description_lower'].apply(lambda s: ' '.join([w for w in s.split() if w not in stop_words]))

In [67]:
df['description_lower_no_stopwords'][0]

'much uk enduring another period hot weather, workers face high temperatures.'

## Remove Punctuation:

In [68]:
df['description_lower_no_stopwords_no_punc'] = df['description_lower_no_stopwords'].apply(lambda s: re.sub(r'[^\w\s]', '', s))

In [69]:
df['description_lower_no_stopwords_no_punc'][0]

'much uk enduring another period hot weather workers face high temperatures'

## Tokenize preproccesed sentences: 

In [70]:
df['preproccesed_tokens'] = df['description_lower_no_stopwords_no_punc'].apply(lambda s: word_tokenize(s))

In [71]:
df['preproccesed_tokens'][0]

['much',
 'uk',
 'enduring',
 'another',
 'period',
 'hot',
 'weather',
 'workers',
 'face',
 'high',
 'temperatures']

## Stemming:

In [72]:
ps = PorterStemmer()
df['stemmed_preproccesed_tokens'] = df['preproccesed_tokens'].apply(lambda s: [ps.stem(w) for w in s])

In [73]:
df['stemmed_preproccesed_tokens'][0]

['much',
 'uk',
 'endur',
 'anoth',
 'period',
 'hot',
 'weather',
 'worker',
 'face',
 'high',
 'temperatur']

## Lemmatization:

In [74]:
lemmatizer = WordNetLemmatizer()
df['lemmatized_preproccesed_tokens'] = df['preproccesed_tokens'].apply(lambda s: [lemmatizer.lemmatize(w) for w in s])

In [75]:
df['lemmatized_preproccesed_tokens'][0]

['much',
 'uk',
 'enduring',
 'another',
 'period',
 'hot',
 'weather',
 'worker',
 'face',
 'high',
 'temperature']

## Extract POS(Parts-Of-Speech) Tagging:

In [76]:
nlp = spacy.load("en_core_web_sm") 

In [77]:
raw_tokens_list = sum(df['raw_tokens'], [])
processed_tokens_list = sum(df['lemmatized_preproccesed_tokens'], [])

In [78]:
raw_spacy_doc = nlp(' '.join([token for token in raw_tokens_list]))
processed_spacy_doc = nlp(' '.join([token for token in processed_tokens_list]))

In [79]:
raw_pos_df = pd.DataFrame(columns = ['token', 'pos_tag'])
raw_pos_df.head()

Unnamed: 0,token,pos_tag


In [98]:
raw_records = []
for token in raw_spacy_doc:
    raw_records.append({'token': token.text, 'pos_tag': token.pos_})
raw_records[:5]

[{'token': 'With', 'pos_tag': 'ADP'},
 {'token': 'much', 'pos_tag': 'ADJ'},
 {'token': 'of', 'pos_tag': 'ADP'},
 {'token': 'the', 'pos_tag': 'DET'},
 {'token': 'UK', 'pos_tag': 'PROPN'}]

In [81]:
raw_pos_df = pd.concat([raw_pos_df, pd.DataFrame(raw_records)])
raw_pos_df.head()

Unnamed: 0,token,pos_tag
0,With,ADP
1,much,ADJ
2,of,ADP
3,the,DET
4,UK,PROPN


In [82]:
raw_pos_df.shape

(20354, 2)

In [83]:
processed_pos_df = pd.DataFrame(columns = ['token', 'pos_tag'])
processed_pos_df.head()

Unnamed: 0,token,pos_tag


In [84]:
processed_records = []
for token in processed_spacy_doc:
    processed_records.append({'token': token.text, 'pos_tag': token.pos_})
processed_records[:5]

[{'token': 'much', 'pos_tag': 'ADV'},
 {'token': 'uk', 'pos_tag': 'PROPN'},
 {'token': 'enduring', 'pos_tag': 'VERB'},
 {'token': 'another', 'pos_tag': 'DET'},
 {'token': 'period', 'pos_tag': 'NOUN'}]

In [85]:
processed_pos_df = pd.concat([processed_pos_df, pd.DataFrame(processed_records)])
processed_pos_df.head()

Unnamed: 0,token,pos_tag
0,much,ADV
1,uk,PROPN
2,enduring,VERB
3,another,DET
4,period,NOUN


In [86]:
processed_pos_df.shape

(11147, 2)

In [87]:
processed_pos_df.loc[(processed_pos_df['pos_tag'] == 'VERB')]

Unnamed: 0,token,pos_tag
2,enduring,VERB
8,face,VERB
14,watched,VERB
18,held,VERB
25,make,VERB
...,...,...
11128,say,VERB
11129,want,VERB
11133,help,VERB
11140,defrauded,VERB


In [90]:
raw_pos_count_df = raw_pos_df.groupby(['token', 'pos_tag']).size().reset_index(name = 'counts').sort_values(by='counts', ascending = False)
raw_pos_count_df.head(10)

Unnamed: 0,token,pos_tag,counts
23,.,PUNCT,966
5072,the,DET,837
3972,of,ADP,421
16,",",PUNCT,397
1878,a,DET,380
3445,in,ADP,377
2008,and,CCONJ,334
5122,to,PART,315
12,'s,PART,268
20,-,PUNCT,246


In [91]:
processed_pos_count_df = processed_pos_df.groupby(['token', 'pos_tag']).size().reset_index(name = 'counts').sort_values(by='counts', ascending = False)
processed_pos_count_df.head(10)

Unnamed: 0,token,pos_tag,counts
4020,say,VERB,157
1590,england,PROPN,68
4791,uk,PROPN,49
3418,people,NOUN,47
3271,one,NUM,46
5121,year,NOUN,44
1193,cup,PROPN,43
560,bbc,PROPN,42
5098,world,PROPN,41
5082,woman,NOUN,32


In [92]:
raw_pos_count_df.loc[(raw_pos_count_df['pos_tag'] == 'VERB')].head(10)

Unnamed: 0,token,pos_tag,counts
4607,says,VERB,123
4605,say,VERB,42
2165,beat,VERB,19
5012,take,VERB,19
3339,help,VERB,18
1805,Watch,VERB,17
5414,win,VERB,16
3310,have,VERB,15
3754,make,VERB,15
4530,rising,VERB,14


In [93]:
processed_pos_count_df.loc[(processed_pos_count_df['pos_tag'] == 'VERB')].head(10)

Unnamed: 0,token,pos_tag,counts
4020,say,VERB,157
4534,take,VERB,24
2837,make,VERB,20
1805,find,VERB,18
1006,come,VERB,17
3897,rising,VERB,15
569,beat,VERB,14
4575,tell,VERB,13
4086,see,VERB,13
3987,said,VERB,13


In [94]:
raw_pos_count_df.loc[(raw_pos_count_df['pos_tag'] == 'NOUN')].head(10)

Unnamed: 0,token,pos_tag,counts
4108,people,NOUN,45
5458,year,NOUN,38
5115,time,NOUN,31
5460,years,NOUN,26
2641,day,NOUN,19
5309,victory,NOUN,18
3238,government,NOUN,18
5356,war,NOUN,18
3621,leader,NOUN,18
4170,police,NOUN,18


In [95]:
processed_pos_count_df.loc[(processed_pos_count_df['pos_tag'] == 'NOUN')].head(10)

Unnamed: 0,token,pos_tag,counts
3418,people,NOUN,47
5121,year,NOUN,44
5082,woman,NOUN,32
1240,day,NOUN,31
3489,police,NOUN,27
2648,leader,NOUN,23
4647,time,NOUN,22
2233,home,NOUN,21
5097,world,NOUN,21
5008,week,NOUN,21


In [101]:
ner_raw_df = pd.DataFrame(columns = ['token', 'label'])
raw_records = []
for token in raw_spacy_doc.ents:
    raw_records.append({'token': token.text, 'label': token.label_})
raw_records[:5]

[{'token': 'UK', 'label': 'GPE'},
 {'token': 'UK', 'label': 'GPE'},
 {'token': 'Russians', 'label': 'NORP'},
 {'token': 'US', 'label': 'GPE'},
 {'token': 'EU', 'label': 'GPE'}]

In [103]:
ner_raw_df = pd.concat([ner_raw_df, pd.DataFrame(raw_records)])
ner_raw_df.head()

Unnamed: 0,token,label
0,UK,GPE
1,UK,GPE
2,Russians,NORP
3,US,GPE
4,EU,GPE


In [105]:
ner_processed_df = pd.DataFrame(columns = ['token', 'label'])
processed_tokens = []
for token in processed_spacy_doc.ents:
    processed_tokens.append({'token': token.text, 'label': token.label_})
processed_tokens[:5]

[{'token': 'scoraig', 'label': 'GPE'},
 {'token': 'russian', 'label': 'NORP'},
 {'token': 'eu', 'label': 'ORG'},
 {'token': '70 year', 'label': 'DATE'},
 {'token': 'jubilee speech red bull', 'label': 'ORG'}]

In [107]:
ner_processed_df = pd.concat([ner_processed_df, pd.DataFrame(processed_tokens)])
ner_processed_df.head()

Unnamed: 0,token,label
0,scoraig,GPE
1,russian,NORP
2,eu,ORG
3,70 year,DATE
4,jubilee speech red bull,ORG


In [108]:
ner_raw_df['label'].value_counts()

GPE            548
PERSON         492
ORG            309
DATE           268
CARDINAL       197
NORP           113
ORDINAL         72
EVENT           59
LOC             23
TIME            20
MONEY           20
FAC             20
WORK_OF_ART     18
PRODUCT         12
PERCENT         11
QUANTITY         7
LAW              3
LANGUAGE         1
Name: label, dtype: int64

In [109]:
ner_processed_df['label'].value_counts()

CARDINAL       221
PERSON         216
DATE           212
GPE            206
ORG            114
NORP           100
ORDINAL         68
EVENT           31
TIME            13
LOC             11
QUANTITY         9
FAC              2
PRODUCT          1
LAW              1
WORK_OF_ART      1
Name: label, dtype: int64

In [110]:
ner_raw_df.loc[(ner_raw_df['label'] == 'PERSON'), 'token'].value_counts()

Queen               4
Rishi Sunak         4
Boris Johnson       4
Rory McIlroy        4
Jurgen Klopp        4
                   ..
Jason Manford 's    1
Emi Martinez 's     1
Petr Pavel          1
Andrej Babis        1
Aanoch Mor          1
Name: token, Length: 440, dtype: int64

In [114]:
ner_processed_df.loc[(ner_processed_df['label'] == 'PERSON'), 'token'].value_counts()

boris johnson                                        6
kate                                                 3
erik ten                                             2
harry kane                                           2
vladimir putin                                       2
                                                    ..
jadon sancho                                         1
joe clarke                                           1
daniel lewis                                         1
ed davey                                             1
alison van uytvanck first appearance centre court    1
Name: token, Length: 195, dtype: int64