<a href="https://colab.research.google.com/github/CasperCoder/intro_to_NLP/blob/main/Practical_ad2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Location of the BBC headlines CSV (presumably the Colab upload directory —
# adjust when running elsewhere).
BBC_NEWS_CSV = '/content/bbc_news.csv'
bbc_data = pd.read_csv(BBC_NEWS_CSV)

In [9]:
# Preview the first rows to check which columns the CSV provides.
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [10]:
# Print the column dtypes and non-null counts of the loaded frame.
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [11]:
# Keep only the headline text, as a one-column frame; the cleaning steps
# store their results as additional columns on this frame.
titles = bbc_data['title'].to_frame()

In [12]:
# Preview the first rows of the headline-only frame.
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


In [13]:
# Lower-cased copy of every headline, stored next to the original text.
lowercased_titles = titles['title'].str.lower()
titles['lowercase'] = lowercased_titles

In [16]:
nltk.download('stopwords')
en_stopwords = stopwords.words('english')
# Membership tests against the list scan it for every word; a set gives the
# same answer with a constant-time lookup. `en_stopwords` itself is left as
# the list returned by NLTK.
en_stopword_set = set(en_stopwords)
# Drop English stop words from the lower-cased headline (split on whitespace,
# re-joined with single spaces).
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in en_stopword_set)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# Remove every character that is neither a word character nor whitespace.
# Work on the single source column with the vectorised string method rather
# than calling a Python function row by row across the whole frame.
titles['no_stopwords_no_punct'] = titles['no_stopwords'].str.replace(
    r"([^\w\s])", "", regex=True
)

In [24]:
nltk.download('punkt_tab')
# Tokenise both the untouched headline and the cleaned text. Each call needs
# only one column, so apply the tokenizer to that column directly instead of
# iterating over whole rows with DataFrame.apply(axis=1).
titles['tokens_raw'] = titles['title'].apply(word_tokenize)
titles['tokens_clean'] = titles['no_stopwords_no_punct'].apply(word_tokenize)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [27]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list holding the WordNet lemma of each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))


# One lemmatized token list per headline, derived from the cleaned tokens.
titles["tokens_clean_lemmatized"] = titles["tokens_clean"].apply(_lemmatize_tokens)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [28]:
# Flatten the per-headline token lists into one flat list each.
# A nested comprehension does this in a single pass; sum(lists, []) builds a
# new, ever-longer list for every headline and so scales quadratically.
tokens_raw_list = [token for tokens in titles['tokens_raw'] for token in tokens]
tokens_clean_list = [
    token for tokens in titles['tokens_clean_lemmatized'] for token in tokens
]

#### Part-of-Speech (POS) Tagging

In [29]:
# spaCy pipeline used for the POS-tagging and NER steps in this notebook.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [30]:
# NOTE(review): the NLTK tokens of every headline are glued into one
# space-separated string and parsed as a single document, so spaCy gets no
# boundary between headlines and tokenises the text again itself — confirm
# this is intended before relying on the tag counts.
all_titles_text = ' '.join(tokens_raw_list)
spacy_doc = nlp(all_titles_text)

In [31]:
# Empty frame with the two columns used for the token / POS-tag table.
POS_COLUMNS = ['token', 'pos_tag']
pos_df = pd.DataFrame(columns=POS_COLUMNS)

In [32]:
# One record per spaCy token: its text and its coarse POS tag.
# Collect all records first and build the frame once; calling pd.concat for
# every token re-copies the whole frame each time (quadratic in token count).
pos_records = [
    {'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc
]
pos_df = pd.DataFrame(pos_records, columns=['token', 'pos_tag'])

In [33]:
# Count how often each (token, tag) pair occurs, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [35]:
# Show the ten most frequent (token, tag) pairs.
pos_df_counts.head(10)


Unnamed: 0,token,pos_tag,counts
94,:,PUNCT,543
7,',PUNCT,315
2895,in,ADP,187
4072,to,PART,173
3262,of,ADP,172
21,-,PUNCT,165
4033,the,DET,163
1858,and,CCONJ,147
14,'s,PART,142
96,?,PUNCT,130


In [36]:
# First ten rows tagged NOUN; pos_df_counts is sorted by count, descending,
# so these are the most frequent nouns.
is_noun = pos_df_counts['pos_tag'] == 'NOUN'
nouns = pos_df_counts.loc[is_noun].head(10)

In [37]:
# Display the noun table.
nouns

Unnamed: 0,token,pos_tag,counts
4258,war,NOUN,35
3545,record,NOUN,15
4350,year,NOUN,14
3999,tax,NOUN,13
3411,police,NOUN,13
3362,people,NOUN,12
2323,day,NOUN,12
4307,win,NOUN,11
2029,boss,NOUN,11
2565,fans,NOUN,11


In [39]:
# First ten rows tagged VERB; pos_df_counts is sorted by count, descending,
# so these are the most frequent verbs.
is_verb = pos_df_counts['pos_tag'] == 'VERB'
verbs = pos_df_counts.loc[is_verb].head(10)
verbs

Unnamed: 0,token,pos_tag,counts
3678,says,VERB,30
4308,win,VERB,15
2666,found,VERB,13
1963,beats,VERB,9
4317,wins,VERB,9
2708,get,VERB,9
2383,dies,VERB,9
8,',VERB,9
3677,say,VERB,8
2792,have,VERB,8


In [40]:
# First ten rows tagged ADJ; pos_df_counts is sorted by count, descending,
# so these are the most frequent adjectives.
is_adjective = pos_df_counts['pos_tag'] == 'ADJ'
adj = pos_df_counts.loc[is_adjective].head(10)
adj

Unnamed: 0,token,pos_tag,counts
3239,new,ADJ,28
1399,Russian,ADJ,22
2602,final,ADJ,16
18,-,ADJ,14
2621,first,ADJ,13
3195,more,ADJ,10
2832,high,ADJ,10
2998,last,ADJ,9
1992,big,ADJ,9
3298,other,ADJ,8


#### Named Entity Recognition (NER)

In [42]:
# One record per entity span found by spaCy: the span text and its label.
# Entities whose label is missing are skipped; `not pd.isna(...)` is used
# because an identity test against the `False` singleton only works when
# pd.isna returns a plain Python bool.
ner_records = [
    {'token': ent.text, 'ner_tag': ent.label_}
    for ent in spacy_doc.ents
    if not pd.isna(ent.label_)
]

# Build the frame once from all records; calling pd.concat for every entity
# re-copies the whole frame each time (quadratic in entity count).
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

In [44]:
# Preview the first rows of the entity table.
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,superyachts,CARDINAL
3,Russian,NORP
4,Platinum Jubilee,PERSON


In [45]:
# Count how often each (entity text, label) pair occurs, most frequent first.
ner_df_counts = (
    ner_df
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [46]:
# Show the ten most frequent (entity, label) pairs.
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
984,Ukraine,GPE,44
978,UK,GPE,37
320,England,GPE,34
979,US,GPE,22
837,Russian,NORP,21
1056,World Cup 2022,EVENT,18
1088,first,ORDINAL,15
780,Queen,PERSON,11
1054,World Cup,EVENT,11
219,China,GPE,11


In [47]:
# First ten rows labelled PERSON; ner_df_counts is sorted by count,
# descending, so these are the most frequently detected people.
is_person = ner_df_counts['ner_tag'] == 'PERSON'
people = ner_df_counts.loc[is_person].head(10)

In [48]:
# Display the PERSON entity table.
people

Unnamed: 0,token,ner_tag,counts
780,Queen,PERSON,11
245,Covid,PERSON,9
776,Putin,PERSON,8
163,Boris Johnson,PERSON,6
564,Liz Truss,PERSON,6
317,Emma Raducanu,PERSON,4
808,Rishi Sunak,PERSON,4
511,Jurgen Klopp,PERSON,4
827,Rory McIlroy,PERSON,3
110,Andy Murray,PERSON,3
