In [4]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [7]:
# Read the news CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run on another machine without editing it.
csv_path = r"D:\Anaconda\bbc_news.csv"
data = pd.read_csv(csv_path)

In [8]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [12]:
# Work on a one-column frame holding only the headline text; the cleaning
# steps below add their results to it as extra columns.
titles = data['title'].to_frame()

In [13]:
# Preview the first rows of the title-only frame.
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


In [22]:
# Convert every title to lowercase and store the result in a new column,
# so the original 'title' column stays untouched.
titles['lowercase'] = titles['title'].str.lower()


In [34]:
# Removing stopwords from the data
en_stopwords = stopwords.words('english')
# Membership tests against the list above are a linear scan per word; a set
# makes each lookup constant-time. The list itself is kept under its original name.
en_stopwords_set = set(en_stopwords)
# NOTE(review): titles are split on whitespace only and punctuation is stripped
# in a later step, so a stop word with punctuation attached (e.g. "it?") is
# not matched here.
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda x: " ".join(word for word in x.split() if word not in en_stopwords_set)
)
titles['no_stopwords']

0                                           refuse work?
1      'liz truss brief?' world reacts uk political t...
2        rationing energy nothing new off-grid community
3          hunt superyachts sanctioned russian oligarchs
4            platinum jubilee: 70 years queen 70 seconds
                             ...                        
995    dominic raab: third senior civil servant gives...
996                  highlights: radacanu beats uytvanck
997         pictures: mountain bikers descend snowy peak
998    companies must help cut living costs, says new...
999       beware online car sale scams, consumers warned
Name: no_stopwords, Length: 1000, dtype: object

In [35]:
# Replace every character that is neither a word character nor whitespace
# with a single space.
# NOTE(review): because punctuation becomes a space rather than being dropped,
# a possessive such as "queen's" turns into "queen s", leaving a standalone
# "s" token for the later steps.
punctuation_pattern = re.compile(r"[^\w\s]")
titles['no_punc'] = titles['no_stopwords'].apply(
    lambda text: punctuation_pattern.sub(" ", text)
)
titles['no_punc']

0                                           refuse work 
1       liz truss brief   world reacts uk political t...
2        rationing energy nothing new off grid community
3          hunt superyachts sanctioned russian oligarchs
4            platinum jubilee  70 years queen 70 seconds
                             ...                        
995    dominic raab  third senior civil servant gives...
996                  highlights  radacanu beats uytvanck
997         pictures  mountain bikers descend snowy peak
998    companies must help cut living costs  says new...
999       beware online car sale scams  consumers warned
Name: no_punc, Length: 1000, dtype: object

In [41]:
# Split each punctuation-free title on whitespace and lemmatize every word.
# The lemmatizer is called without a part-of-speech argument.
lemmatize = WordNetLemmatizer()
titles['lemmatized'] = titles['no_punc'].apply(
    lambda text: list(map(lemmatize.lemmatize, text.split()))
)
# Spot-check the token list produced for one row.
titles['lemmatized'][998]

['company',
 'must',
 'help',
 'cut',
 'living',
 'cost',
 'say',
 'new',
 'cost',
 'living',
 'tsar']

In [47]:
# Flatten the per-title token lists into one flat list of tokens.
# A nested comprehension does this in a single linear pass, whereas
# sum(list_of_lists, []) builds a brand-new list at every addition.
clean_data = [token for tokens in titles['lemmatized'] for token in tokens]
print(clean_data, end = " ")



In [71]:
# POS Tagging
# Load the "en_core_web_sm" spaCy pipeline; it is used below for both the
# part-of-speech tags and the named entities.
# (spacy is imported in the import cell at the top of the notebook.)
nlp = spacy.load("en_core_web_sm")

In [72]:
# Join the token list back into one space-separated string and run the whole
# text through the spaCy pipeline as a single document.
joined_text = ' '.join(clean_data)
spacy_doc = nlp(joined_text)


In [76]:
# One record per spaCy token: its text and its coarse part-of-speech tag.
records = [
    {'token': tok.text, 'pos_tag': tok.pos_}
    for tok in spacy_doc
]
pos_df = pd.DataFrame(records)
pos_df.head()

Unnamed: 0,token,pos_tag
0,refuse,AUX
1,work,NOUN
2,liz,PROPN
3,truss,ADJ
4,brief,ADJ


In [77]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
pos_df_count = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name="counts")
    .sort_values(by='counts', ascending=False)
)
pos_df_count.head()

Unnamed: 0,token,pos_tag,counts
2978,s,PART,142
24,2022,NUM,47
1154,england,PROPN,46
862,cup,PROPN,39
3665,uk,PROPN,37


In [81]:
# One record per named entity found by spaCy: the entity text and its label.
# The 'token' key is kept for the column name even though each item is an
# entity from spacy_doc.ents.
record_ner = [
    {'token': ent.text, 'ner_tag': ent.label_}
    for ent in spacy_doc.ents
]
ner_df = pd.DataFrame(record_ner)
ner_df.head()

Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 year,DATE
2,70 second,TIME
3,bull,ORG
4,1,CARDINAL


In [82]:
# Count how often each (entity text, entity label) pair occurs, most frequent first.
ner_df_count = (
    ner_df
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name="counts")
    .sort_values(by='counts', ascending=False)
)
ner_df_count.head()

Unnamed: 0,token,ner_tag,counts
40,2022,CARDINAL,26
472,russian,NORP,25
242,first,ORDINAL,17
41,2022,DATE,15
252,france,GPE,10
