In [3]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import spacy



# Read the BBC news CSV into a DataFrame; every later step works off `df`.
df = pd.read_csv("/content/bbc_news.csv")

# Quick sanity check of what was loaded: row/column counts and column labels.
print(f"Dataset shape: {df.shape}")
print(f"Columns available: {df.columns}")


def clean_text(text):
    """Normalize a raw description into lowercase, letters-only text.

    The value is lowercased, every character other than ``a``-``z`` or
    whitespace is removed (digits, punctuation and apostrophes are dropped,
    so "world's" becomes "worlds"), runs of whitespace collapse to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        The cleaned string; ``""`` when the input is missing or contains no
        ASCII letters.
    """
    # A missing cell must not be stringified: str(None) / str(nan) would
    # become the words "none" / "nan" and be counted as real tokens later.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Keep only lowercase ASCII letters and whitespace.
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse any whitespace run (including newlines/tabs) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Run clean_text over every description and keep the result in a new column.
df['clean_desc'] = df['description'].map(clean_text)
# Show the first few cleaned rows as a spot check.
print(f"\nSample cleaned text:\n {df['clean_desc'].head()}")


# Fetch the NLTK resources used below: the tokenizer models and the
# stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# A set gives constant-time membership tests inside the filter.
stop_words = set(stopwords.words('english'))

# Tokenize each cleaned description, dropping stop words and
# single-character tokens, and pool everything into one flat list.
all_tokens = []
for description in df['clean_desc']:
    all_tokens.extend(
        tok
        for tok in word_tokenize(description)
        if len(tok) > 1 and tok not in stop_words
    )

# Count occurrences across the whole corpus and report the ten most common.
token_freq = Counter(all_tokens)
top10_tokens = token_freq.most_common(10)
print(f"\nTop 10 frequent tokens (NLTK):\n {top10_tokens}")

# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Stream the cleaned descriptions through the pipeline with the parser and
# NER components switched off, collecting the lowercased lemma of every
# alphabetic, non-stop-word token whose lemma is longer than one character.
lemmas = []
for parsed in nlp.pipe(df['clean_desc'].tolist(), disable=["parser", "ner"]):
    lemmas.extend(
        tok.lemma_.lower()
        for tok in parsed
        if tok.is_alpha and not tok.is_stop and len(tok.lemma_) > 1
    )

# Count lemma occurrences and report the ten most common.
lemma_freq = Counter(lemmas)
top10_lemmas = lemma_freq.most_common(10)
print(f"\nTop 10 frequent lemmas (spaCy):\n {top10_lemmas}")


# Reduce every previously collected NLTK token to its Porter stem.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, all_tokens))

# Count stem occurrences and report the ten most common.
stem_freq = Counter(stems)
top10_stems = stem_freq.most_common(10)
print(f"\nTop 10 stems (PorterStemmer):\n {top10_stems}")


Dataset shape: (42115, 5)
Columns available: Index(['title', 'pubDate', 'guid', 'link', 'description'], dtype='object')

Sample cleaned text:
 0    the ukrainian president says the country will ...
1    jeremy bowen was on the frontline in irpin as ...
2    one of the worlds biggest fertiliser firms say...
3    the parents of the manchester arena bombings y...
4    consumers are feeling the impact of higher ene...
Name: clean_desc, dtype: object


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Top 10 frequent tokens (NLTK):
 [('says', 4561), ('world', 2030), ('bbc', 2011), ('people', 1989), ('england', 1922), ('first', 1905), ('new', 1894), ('say', 1676), ('cup', 1486), ('uk', 1462)]

Top 10 frequent lemmas (spaCy):
 [('say', 5463), ('england', 2313), ('year', 2250), ('world', 2170), ('bbc', 2026), ('people', 2021), ('win', 1918), ('new', 1898), ('cup', 1501), ('day', 1500)]

Top 10 stems (PorterStemmer):
 [('say', 6397), ('bbc', 2408), ('england', 2338), ('year', 2251), ('world', 2170), ('peopl', 2021), ('first', 1909), ('new', 1894), ('win', 1747), ('uk', 1682)]
