# Q1: Load the data and clean text

In [None]:
import pandas as pd
import re


In [None]:
 df = pd.read_csv("/bbc_news.csv")
print(df.head())

                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food'   
3  Manchester Arena bombing: Saffie Roussos's par...   
4  Ukraine conflict: Oil price soars to highest l...   

                         pubDate  \
0  Mon, 07 Mar 2022 08:01:56 GMT   
1  Sun, 06 Mar 2022 22:49:58 GMT   
2  Mon, 07 Mar 2022 00:14:42 GMT   
3  Mon, 07 Mar 2022 00:05:40 GMT   
4  Mon, 07 Mar 2022 08:15:53 GMT   

                                               guid  \
0  https://www.bbc.co.uk/news/world-europe-60638042   
1  https://www.bbc.co.uk/news/world-europe-60641873   
2      https://www.bbc.co.uk/news/business-60623941   
3            https://www.bbc.co.uk/news/uk-60579079   
4      https://www.bbc.co.uk/news/business-60642786   

                                                link  \
0  https://www.bbc.co.uk/news/world-europe-606380...   
1  

In [None]:
# Take the "description" column, discard missing entries, and cast whatever
# remains to str.
texts = (
    df["description"]
    .dropna()
    .astype(str)
)

In [None]:
def clean_text(text):
    """Normalise a raw string before tokenisation.

    The text is lower-cased, every character that is not an ASCII letter
    ``a``-``z`` or whitespace is deleted (not replaced by a space, so
    ``"well-known"`` becomes ``"wellknown"``), and finally each run of
    whitespace is collapsed to a single space with both ends trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; may be empty.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()
# Run the normaliser over every description, element by element.
cleaned_texts = texts.map(clean_text)

Use NLTK or spaCy to tokenize the cleaned text, remove stop words, and compare stemming with lemmatization:

In [None]:
import nltk
import spacy
from collections import Counter
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK resources "punkt", "punkt_tab" and "stopwords" (the recorded
# output shows them unpacked under tokenizers/ and corpora/ in nltk_data).
# NOTE(review): the cells visible here tokenise and filter stop words with
# spaCy and only use NLTK's PorterStemmer — confirm these downloads are still
# needed elsewhere in the notebook.
nltk.download("punkt")
nltk.download("punkt_tab")
# Left as the final bare expression so the cell displays its return value.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# Stream the cleaned descriptions through the pipeline (with the "ner" and
# "parser" components disabled) and flatten them into one list of token texts,
# keeping only alphabetic, non-stop-word tokens longer than two characters.
tokens = [
    tok.text
    for parsed in nlp.pipe(cleaned_texts, disable=["ner", "parser"])
    for tok in parsed
    if not tok.is_stop and tok.is_alpha and len(tok.text) > 2
]

In [None]:
# Tally how often each retained token occurs, then show the ten most common.
freq_dist = Counter()
freq_dist.update(tokens)
print(freq_dist.most_common(10))

[('says', 4561), ('world', 2030), ('bbc', 2011), ('people', 1989), ('england', 1922), ('new', 1894), ('cup', 1486), ('league', 1321), ('years', 1308), ('win', 1212)]


In [None]:
# spaCy English pipeline (supplies the lemmas) and NLTK Porter stemmer
# (supplies the stems).
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

# Parallel lists: entry i of each list comes from the same retained token.
lemmas, stems = [], []
for doc in nlp.pipe(cleaned_texts, disable=["ner", "parser"]):
    for token in doc:
        # Skip stop words, non-alphabetic tokens, and tokens of 1-2 characters.
        if token.is_stop or not token.is_alpha or len(token.text) <= 2:
            continue
        lemmas.append(token.lemma_)
        stems.append(stemmer.stem(token.text))

In [None]:
# Frequency table of the collected lemmas; report the ten most common.
lemma_freq = Counter(lemmas)
print("Top 10 frequent lemmas (spaCy):", lemma_freq.most_common(10), sep="\n")

Top 10 frequent lemmas (spaCy):
[('say', 5463), ('england', 2313), ('year', 2250), ('world', 2170), ('bbc', 2026), ('people', 2021), ('win', 1918), ('new', 1898), ('cup', 1501), ('day', 1500)]


In [None]:
# Frequency table of the collected stems; report the ten most common.
stem_freq = Counter(stems)
print("Top 10 frequent stems (NLTK stemming):", stem_freq.most_common(10), sep="\n")

Top 10 frequent stems (NLTK stemming):
[('say', 4721), ('bbc', 2408), ('england', 2338), ('year', 2251), ('world', 2170), ('peopl', 2021), ('new', 1894), ('win', 1747), ('cup', 1502), ('day', 1500)]
