In [7]:
import pandas as pd
import re

# Load dataset (assuming it's downloaded and named 'bbc-news.csv')
df = pd.read_csv("/content/sample_data/bbc_news.csv")

# Keep only 'description' column
texts = df['description'].dropna().astype(str)

# Basic text cleaning function
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df['clean_description'] = texts.apply(clean_text)
print(df['clean_description'].head())


0    the ukrainian president says the country will ...
1    jeremy bowen was on the frontline in irpin as ...
2    one of the worlds biggest fertiliser firms say...
3    the parents of the manchester arena bombings y...
4    consumers are feeling the impact of higher ene...
Name: clean_description, dtype: object


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Tokenize and remove stopwords
def tokenize_nltk(text):
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

df['tokens_nltk'] = df['clean_description'].apply(tokenize_nltk)

# Flatten token list
all_tokens = [token for tokens in df['tokens_nltk'] for token in tokens]

# Top 10 frequent tokens
freq_dist = Counter(all_tokens)
print("Top 10 frequent tokens (NLTK):")
print(freq_dist.most_common(10))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 frequent tokens (NLTK):
[('says', 4561), ('world', 2030), ('bbc', 2011), ('people', 1989), ('england', 1922), ('first', 1905), ('new', 1894), ('say', 1676), ('cup', 1486), ('two', 1392)]


In [13]:
import spacy
from collections import Counter

# Load the small English language model
# Run 'python -m spacy download en_core_web_sm' in your terminal if you don't have it
nlp = spacy.load('en_core_web_sm')

# Sample text data (e.g., from a 'description' column of a dataset)
text_data = """
    Running is a great exercise. Many runners run daily.
    Studies have shown that studying improves knowledge.
    The best teams are not always the team with the best players.
    This new policy was implemented. The implementation was successful.
"""

# Process the text with the spaCy pipeline
doc = nlp(text_data)

# Extract lemmas, excluding stop words and punctuation
# Lemmatization reduces words like 'running' and 'ran' to their base form 'run'
lemmas = [
    token.lemma_.lower()
    for token in doc
    if not token.is_stop and not token.is_punct and not token.is_space
]

# Count the frequency of each lemma
lemma_counts = Counter(lemmas)

# Display the top 10 most frequent lemmas
print("Top 10 Frequent Lemmas:")
for lemma, count in lemma_counts.most_common(10):
    print(f"{lemma}: {count}")

Top 10 Frequent Lemmas:
run: 2
good: 2
team: 2
great: 1
exercise: 1
runner: 1
daily: 1
study: 1
show: 1
studying: 1
