<a href="https://colab.research.google.com/github/KarlX07/NLP-assignment/blob/main/NLPsssign%5B%5B01%5D%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Lab Assignment: Introduction to NLP and Text Data Collection**



In [35]:

pip install requests beautifulsoup4 pandas



In [36]:
import requests
from bs4 import BeautifulSoup
import time
import csv

BASE_URL = "https://www.bbc.com"
START_URL = "https://www.bbc.com/news"

headers = {
    "User-Agent": "Mozilla/5.0"
}

def get_article_links():
    """Scrape article URLs from the BBC homepage."""
    response = requests.get(START_URL, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    for a in soup.select("a[href]"):
        href = a["href"]
        if "/news/" in href and href.startswith("/news"):
            links.append(BASE_URL + href)


    return list(set(links))

def scrape_article(url):
    """Extract title, date, and article content."""
    time.sleep(1)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")


    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No Title"


    date_tag = soup.find("time")
    date = date_tag.get_text(strip=True) if date_tag else "No Date Found"


    paragraphs = soup.select("article p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs)


    content = content.replace("\t", " ").replace("\n\n", "\n")

    return {
        "url": url,
        "title": title,
        "date": date,
        "content": content
    }


links = get_article_links()
print(f"Found {len(links)} article links")

articles = []

for link in links[:100]:
    try:
        print("Scraping:", link)
        article_data = scrape_article(link)


        if len(article_data["content"]) > 200:
            articles.append(article_data)
    except Exception as e:
        print("Error:", e)


with open("bbc_news_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["url", "title", "date", "content"])
    writer.writeheader()
    writer.writerows(articles)

print("Scraping complete. Saved to bbc_news_data.csv")

Found 53 article links
Scraping: https://www.bbc.com/news/bbcindepth
Scraping: https://www.bbc.com/news/articles/crmxrln4vp0o
Scraping: https://www.bbc.com/news/videos/c8drzn7g393o
Scraping: https://www.bbc.com/news/articles/cj417gqq5n1o
Scraping: https://www.bbc.com/news/videos/cn8xzxpv0eno
Scraping: https://www.bbc.com/news/articles/c4gpkenx9dzo
Scraping: https://www.bbc.com/news/northern_ireland
Scraping: https://www.bbc.com/news/world/australia
Scraping: https://www.bbc.com/news/articles/cwypzdgwg1yo
Scraping: https://www.bbc.com/news/scotland/scotland_politics
Scraping: https://www.bbc.com/news/articles/cly2kgvnqyzo
Scraping: https://www.bbc.com/news/scotland
Scraping: https://www.bbc.com/news/world/middle_east
Scraping: https://www.bbc.com/news/in_pictures
Scraping: https://www.bbc.com/news/videos/ckg45vpyg3ro
Scraping: https://www.bbc.com/news/topics/c2vdnvdg6xxt
Scraping: https://www.bbc.com/news/videos/cx20zn57v3po
Scraping: https://www.bbc.com/news/articles/czr1dxzrx3zo
Scrap

-----------------------------------------EXTRACTION--------------------------

In [37]:
import pandas as pd

df = pd.read_csv("bbc_news_data.csv")
print(df.head())
print("Total Documents:", len(df))

                                              url  \
0             https://www.bbc.com/news/bbcindepth   
1  https://www.bbc.com/news/articles/crmxrln4vp0o   
2  https://www.bbc.com/news/articles/cj417gqq5n1o   
3  https://www.bbc.com/news/articles/c4gpkenx9dzo   
4       https://www.bbc.com/news/northern_ireland   

                                               title           date  \
0                                           NewsNews  No Date Found   
1  Soldier took her own life after fight, inquest...     4 days ago   
2  South African man seen at neo-Nazi rally has A...    4 hours ago   
3  How my on-air 'brain fog' moment sparked a big...   14 hours ago   
4                                           NewsNews  No Date Found   

                                             content  
0  Get alerts on the UK app about the latest stor...  
1  A soldier took her own life while fearing a fi...  
2  A South African man who was seen attending a n...  
3  When I rather nervously shared 

EXTRACTION

In [38]:

texts = df["content"].astype(str).tolist()
print("Example Text:\n", texts[0][:500])

Example Text:
 Get alerts on the UK app about the latest stories from InDepth - the home of the best analysis from BBC correspondents
Drones have been found across Western Europe near airports, military bases and power plants, as part of a suspected programme of 'hybrid warfare'
Misshaps of their own making overshadow important things the government has to do, writes Laura Kuenssberg.
The US president is notably absent from these UN climate talks, as are other world leaders, all of which prompts questions abou


--------Data Cleaning----------

In [39]:

import re

def clean_text(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["content"].apply(clean_text)

------------------TOKENIZATION--------------------

In [40]:
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')

df["tokens"] = df["clean_text"].apply(lambda x: nltk.word_tokenize(x.lower()))
df[["clean_text", "tokens"]].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,clean_text,tokens
0,Get alerts on the UK app about the latest stor...,"[get, alerts, on, the, uk, app, about, the, la..."
1,A soldier took her own life while fearing a fi...,"[a, soldier, took, her, own, life, while, fear..."
2,A South African man who was seen attending a n...,"[a, south, african, man, who, was, seen, atten..."
3,When I rather nervously shared a personal post...,"[when, i, rather, nervously, shared, a, person..."
4,The principal of the school where one of the y...,"[the, principal, of, the, school, where, one, ..."


----------------Stopword Removal -------------------------

In [41]:
nltk.download("stopwords")
stopwords = set(nltk.corpus.stopwords.words("english"))

df["tokens_nostop"] = df["tokens"].apply(
    lambda words: [w for w in words if w not in stopwords]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


------------Lemmatization--------------
Normalize words → running → run, cars → car

In [42]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")

lemm = WordNetLemmatizer()

df["lemmas"] = df["tokens_nostop"].apply(
    lambda words: [lemm.lemmatize(w) for w in words]
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


-----------Feature Extraction-----------

In [43]:
def word_count(tokens):
    return len(tokens)

def vocab_size(tokens):
    return len(set(tokens))

df["word_count"] = df["lemmas"].apply(word_count)
df["vocab_size"] = df["lemmas"].apply(vocab_size)


--------------Read data ----------------

In [44]:
df.to_csv("bbc_preprocessed_data.csv", index=False)

------Language Analysis---------

In [45]:
import numpy as np

avg_vocab = df["vocab_size"].mean()
print("Average Vocabulary Size:", avg_vocab)



Average Vocabulary Size: 266.7291666666667


--------Word Count-------

In [46]:
print("Average Word Count:", df["word_count"].mean())
print("Max Word Count:", df["word_count"].max())
print("Min Word Count:", df["word_count"].min())


Average Word Count: 407.0833333333333
Max Word Count: 1224
Min Word Count: 118


----------Sentence Length-----

In [47]:
import nltk

def sentence_lengths(text):
    sentences = nltk.sent_tokenize(text)
    lengths = [len(nltk.word_tokenize(s)) for s in sentences]
    return lengths

df["sentence_lengths"] = df["clean_text"].apply(sentence_lengths)

all_lengths = [l for sub in df["sentence_lengths"] for l in sub]

print("Average Sentence Length:", np.mean(all_lengths))
print("Max Sentence Length:", np.max(all_lengths))


Average Sentence Length: 708.375
Max Sentence Length: 2193


--------Most Common Words--------------

In [48]:
from collections import Counter

all_words = [w for tokens in df["lemmas"] for w in tokens]
freq = Counter(all_words).most_common(20)

print("Most Common Words:", freq)


Most Common Words: [('say', 153), ('said', 139), ('u', 110), ('bbc', 104), ('year', 98), ('epstein', 98), ('people', 87), ('trump', 85), ('one', 77), ('police', 72), ('government', 71), ('new', 68), ('drone', 61), ('first', 60), ('two', 56), ('would', 56), ('president', 55), ('uk', 53), ('day', 53), ('told', 50)]
