## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import re
from lxml import etree
import json
import torch
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances,manhattan_distances
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sitemaps_utils import *

## Data Collection

#### BBC

In [2]:
BBC_news_sitemaps = ["https://www.bbc.com/sitemaps/https-sitemap-com-news-1.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-2.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-3.xml"]

sky_news_sitemaps = ["https://news.sky.com/sitemap/sitemap-news.xml",
                    "https://www.skysports.com/sitemap/sitemap-news.xml"]

namespaces = {
    'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9',
    'news': 'http://www.google.com/schemas/sitemap-news/0.9'
}

urls = {}
# BBC News uses 'sitemap:lastmod' for the date tag
urls["bbc"] = Extract_todays_urls_from_sitemaps(BBC_news_sitemaps, namespaces, 'sitemap:lastmod')
print(f"Number of BBC URLs: {len(urls['bbc'])}")

# Sky News uses 'news:publication_date' for the date tag
urls["sky"] = Extract_todays_urls_from_sitemaps(sky_news_sitemaps, namespaces, 'news:publication_date')
print(f"Number of Sky URLs: {len(urls['sky'])}")


Number of BBC URLs: 677
Number of Sky URLs: 36


## Data Processing

In [3]:
bbc_topics_to_drop = {"pidgin", "hausa", "swahili", "naidheachdan","videos","cymrufyw"}
df_BBC = process_news_data(urls, "bbc", bbc_topics_to_drop)

sky_topics_to_drop = {"arabic", "urdu"}
df_Sky = process_news_data(urls, "sky", sky_topics_to_drop)


INFO:langid.langid:initializing identifier


------ bbc ------
Topic
news         207
sport        116
weather       21
newsround      1
Total        345

------ sky ------
Topic
story       17
football     4
cricket      4
f1           3
golf         2
tennis       2
racing       2
Total        34



In [None]:
titles to drop: ["One-minute World News",""]

title contains "weekly" : df_BBC[df_BBC['Title'].str.contains('weekly round-up', case=False)]


In [39]:
df_BBC[df_BBC['Title']=="Watch Newsround"]

Unnamed: 0,Url,Last Modified,Title,Topic
106,https://www.bbc.com/newsround/66176104,2024-05-25T08:25:50Z,Watch Newsround,newsround


In [None]:
df_BBC[~df_BBC['Title'].str.contains('weekly round-up', case=False)].drop_duplicates("Title").reset_index(inplace=True)

In [40]:
for i in df_BBC[~df_BBC['Title'].str.contains('weekly round-up', case=False)].drop_duplicates("Title").Title:
    print(i)

One-minute World News
South America weather forecast
General election 2024 poll tracker: How do the parties compare?
RideLondon: Gas leak near finish line in Maldon
Michael Gove steps down in mass exodus of MPs before election
Defence Secretary Lloyd Austin to transfers powers due to medical procedure
Childbirth: Wrexham woman left with stoma and PTSD
Neurodivergent and queer author on purging Llanelli memories
Football gossip: McKenna, Ten Hag, Pochettino, Maresca, Sancho, Salah, Kompany
Extinct ‘mountain jewel’ plant returned to wild - in secret location
Smartphone ban for kids is worth considering - MPs
Papua New Guinea landslide: Race to rescue villagers trapped
Royal Mail investigated by Ofcom for missing delivery targets
Bournemouth stabbing at beach leaves woman dead
South Africa elections: Some are losing faith in the ANC
Rudimental's Locksmith meets 10-year-old carer at festival
Gaza war: Israelis attack aid convoys sent for Palestinians
Britain's Got Talent: Comedian ‘embrace

In [30]:
for i in df_BBC[df_BBC['Title'].str.contains('weekly round-up', case=False)].Title:
    print(i)

Sussex weekly round-up: 18 May - 24 May 2024 
Surrey weekly round-up: 18 May - 24 May 2024 
Kent weekly round-up: 18 May - 24 May 2024


## Data cleaning

In [12]:
# news_sport = ["news","sport"]
# news_sport_urls = df_BBC[df_BBC.Topic.isin(news_sport)]
# news_sport_urls.reset_index(inplace=True)

In [4]:
def remove_elements(input_string: str) -> str:
    """
    This function removes all HTML tags and their content from the input string,
    and removes specific patterns such as "Published X hours ago" and "Image source".
    """
    # Parse the input string as HTML
    soup = BeautifulSoup(input_string, 'html.parser')
    
    # Remove <script>, <style>, and <picture> tags and their content
    for script in soup(["script", "style", "picture"]):
        script.decompose()
    
    # Extract text from the parsed HTML
    cleaned_string = soup.get_text(separator=' ', strip=True)

    # Define regular expressions to remove unwanted patterns
    patterns = [
        r'\bPublished.*?\bago\b',  # Matches "Published X hours ago"
        r'\bImage source\b',   # Matches "Image source, ..."
        r'\bImage caption\b',  # Matches "Image caption, ..."
        r'\bMedia caption\b',  # Matches "Media caption, ..."
        r'\bGetty Images\b', 
        r'\bBBC Wales News\b',         # Matches "BBC Wales News"
        r'\bPublished\s\d{1,2}\s\w+\b',
        #r'\bRelated Topics\b.*',
        r'\bRelated Internet\b.*', 
        r'\bBBC News Staff\b',
        r'\bFollow\s.*?\snews.*\b',
        r'\b\w+/\w+\b',
        r'Follow\sBBC.*'           
 
    ]

    # Remove the matched patterns from the text
    for pattern in patterns:
        cleaned_string = re.sub(pattern, '', cleaned_string, flags=re.DOTALL)
    
    # Additional cleanup: remove extra spaces and newlines
    cleaned_string = re.sub(r'["\'.,]+', '', cleaned_string)
    cleaned_string = re.sub(r'\s+', ' ', cleaned_string).strip()
    cleaned_string = cleaned_string.replace("This video can not be played To play this video you need to enable JavaScript in your browser.","")
    string2 = "Sign up for our morning newsletter and get BBC News in your inbox."
    cleaned_string = cleaned_string.replace(string2,"")

    
    article = []
    for line in (i for i in cleaned_string.split("\n") if len(i) >= 10):
        article.append(line)

    return " ".join(article)

In [30]:
async def request_sentences_from_urls_async(urls, timeout=20):
    articles_df = pd.DataFrame(columns=["url","title","content"])

    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls.Url, start=1):
            if (idx - 1) % 100 == 0:
                logging.info(f"\nProcessing URL {((idx - 1)//100)+1}/{(len(urls)//100)+1}")

            tasks.append(fetch_url(session, url, timeout))

        results = await asyncio.gather(*tasks)

        for idx, (url, result) in enumerate(zip(urls.Url, results), start=1):
            if result is None:
                continue

            try:
                tree = etree.HTML(result)
                article_element = tree.find(".//article")
                if article_element is not None:
                    outer_html = etree.tostring(article_element, encoding='unicode')
                    article_body = remove_elements(outer_html)
                    article = [line for line in article_body.split("\n") if len(line) >= 40]
                    articles_df.loc[idx - 1] = (urls["url"][idx - 1],urls["title"][idx - 1],article)
                else:
                    # If no <article> element is found, try using BeautifulSoup with the specific ID
                    soup = BeautifulSoup(result, 'html.parser')
                    article_id = 'main-content'  # Replace with the actual ID you are targeting
                    article_element = soup.find(id=article_id)
                    if article_element:
                        article_body = remove_elements(str(article_element))
                        article = [line for line in article_body.split("\n") if len(line) >= 40]
                        articles_df.loc[idx - 1] = (urls["url"][idx - 1],urls["title"][idx - 1],article)
                    else:
                        logging.warning(f"No article content found on the page with ID {article_id}.")
            except Exception as e:
                logging.error(f"Error extracting article content from {url}: error: {e}")


    return articles_df

In [31]:
timeout = 20  # Timeout value
async def main():
    articles = await request_sentences_from_urls_async(df_BBC, timeout)
    return articles
articles_df = await main()


INFO:root:
Processing URL 1/4
INFO:root:
Processing URL 2/4
INFO:root:
Processing URL 3/4
INFO:root:
Processing URL 4/4


In [43]:
articles_df

Unnamed: 0,url,content
0,One-minute World News,[One-minute World News This video can not be p...
1,South East England weather forecast,[South East England weather forecast This vide...
2,East Midlands weather forecast,[East Midlands weather forecast This video can...
3,South England weather forecast,[South England weather forecast This video can...
4,South England weather forecast,[South England weather forecast This video can...
...,...,...
340,Man Utd news: Should club keep Erik ten Hag?,[Should Man Utd keep Ten Hag? What information...
341,Man City 1-2 Man Utd: What should Sir Jim Ratc...,[What does Ratcliffe do now? PA Media Simon St...
342,Pep Guardiola: Manchester City boss says he ma...,[Guardiola admits tactical mistake in FA Cup f...
343,Headline here...,[Proud Kinghorn stars as Toulouse lift Champio...


In [35]:
print(len(articles_df),len(df_BBC))
for title, content in articles_df.:
    # Find indices where the title matches
    mask = df_BBC['Title'] == title
    df_BBC.loc[mask, 'content'] = content
df_BBC

345 345


Unnamed: 0,Url,Last Modified,Title,Topic,content
0,https://www.bbc.com/news/10462520,2024-05-25T15:30:45Z,One-minute World News,news,One-minute World News This video can not be pl...
1,https://www.bbc.com/weather/forecast-video/214...,2024-05-25T17:25:07Z,South East England weather forecast,weather,South East England weather forecast This video...
2,https://www.bbc.com/weather/forecast-video/214...,2024-05-25T17:31:57Z,East Midlands weather forecast,weather,East Midlands weather forecast This video can ...
3,https://www.bbc.com/weather/forecast-video/247...,2024-05-25T17:11:35Z,South England weather forecast,weather,
4,https://www.bbc.com/weather/forecast-video/247...,2024-05-25T17:11:35Z,South England weather forecast,weather,
...,...,...,...,...,...
340,https://www.bbc.com/sport/football/articles/ck...,2024-05-25T18:25:45Z,Man Utd news: Should club keep Erik ten Hag?,sport,
341,https://www.bbc.com/sport/football/articles/cg...,2024-05-25T18:26:19Z,Man City 1-2 Man Utd: What should Sir Jim Ratc...,sport,
342,https://www.bbc.com/sport/football/articles/cl...,2024-05-25T18:27:43Z,Pep Guardiola: Manchester City boss says he ma...,sport,
343,https://www.bbc.com/sport/rugby-union/articles...,2024-05-25T18:34:20Z,Headline here...,sport,


In [29]:
df_BBC.loc[3].content

'nan'

In [66]:
idx = 75

url = "https://www.bbc.com/news/world-middle-east-67053011"
response = requests.get(url, timeout=timeout)
tree = etree.HTML(response.content)
article_element = tree.find(".//article")
            
if article_element is not None:
    outer_html = etree.tostring(article_element, encoding='unicode')
    
    article_body = remove_elements(outer_html)
    
article_body



'Hamas hostages: Stories of the people taken from Israel Share close panel Share page Copy link About sharing Related Topics Israel-Gaza war Family handout Amiram Cooper Guy Gilboa-Dalal and Tsachi Idan are being held in Gaza A total of 121 hostages remain unaccounted for after being kidnapped by Hamas on 7 October last year - at least 37 of them are presumed dead According to Israel 252 Israelis and foreigners were taken during the attacks Israel gives an official figure of 125 hostages being held in Gaza because it includes four people taken hostage in 2014 and 2015 Two of these are believed to have died Who are the released Israeli hostages? I’m not ready to lose hope: The hostages still in Gaza These are the stories of those hostages who are still being held which have either been confirmed by the BBC or credibly reported This list is regularly updated and names may change as some people feared kidnapped are confirmed to have been killed or released Last updated on 24 May 2024 at 1

In [53]:
outer_html

'<article class="ssrcss-1ag7mww-ArticleWrapper e1nh2i2l3"><div class="ssrcss-1ocoo3l-Wrap e42f8511"><div width="wide" class="ssrcss-ahx90r-WrapWithWidth e42f8510"><header data-component="legacy-header-block" class="ssrcss-15i1622"><h1 id="main-heading" type="headline" tabindex="-1" class="ssrcss-1pcdslv-Heading e10rt3ze0">Hamas hostages: Stories of the people taken from Israel</h1><div><ul role="list" class="ssrcss-j4gvkq-MetadataStripContainer eh44mf03"><div class="ssrcss-13nu8ri-GroupChildrenForWrapping eh44mf02"><li role="listitem" class="ssrcss-30fcoe-MetadataStripItem eh44mf01"><div class="visually-hidden ssrcss-1f39n02-VisuallyHidden e16en2lz0">Published</div><div class="ssrcss-m5j4pi-MetadataContent eh44mf00"><span class="ssrcss-1pvwv4b-MetadataSnippet e4wm5bw3"><span class="ssrcss-1if1g9v-MetadataText e4wm5bw1"><time data-testid="timestamp" datetime="2024-05-24T09:10:58.000Z"><span data-testid="time-and-date:clock" class="ssrcss-1mh4yp1-IconContainer e4wm5bw0"><svg viewbox="0 0

In [160]:
article_body

['Driver 82 dies after hitting stone wall Google The incident happened on Chapel Street on Thursday An 82-year-old driver has died after his car hit a stone wall in Barnsley The mans Red Vauxhall Vectra collided with the wall at the junction of Chapel Street in Ardsley at about 11:26 BST on Thursday South Yorkshire Police said He was taken to hospital but later died from his injuries Police have appealed for anyone with information or dashcam footage of the crash to come forward']

## Embeding and similarity check

In [72]:
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import re

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

article_main_body = list(bbc_news.values())

# Load tokenizer and model
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
if torch.cuda.device_count() > 1:
    encoder = torch.nn.DataParallel(encoder)

print("Using device:", device)

# Initialize an empty list to store embeddings
embeddings = []

# Process and encode each article body
for i, data in enumerate(article_main_body):
    # Tokenize with padding
    inputs = tokenizer("".join(data), return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)  # Move inputs to GPU
    with torch.no_grad():  # No need to track gradients during inference
        embedding = encoder(**inputs).pooler_output
    embeddings.append(embedding)
    if i % 10 == 0:
        print(f"{i // 10}/{len(article_main_body) // 10}")

print("Encoding completed.")

Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.i

Using device: cuda
0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
20/20
Encoding completed.


In [73]:
# Convert embeddings tensor to numpy arrays
embeddings_np = [embedding.cpu().numpy() for embedding in embeddings]

# Convert embeddings to float32
embeddings_np = [embedding.astype('float32') for embedding in embeddings_np]
embeddings_np = np.array(embeddings_np).reshape(len(embeddings_np), 768)

In [74]:
import pickle


content_embedding = (list(bbc_news.values()),embeddings_np)

with open('content_embedding.pkl', 'wb') as file:
    pickle.dump(content_embedding, file)

print("Embeddings saved successfully.")

Embeddings saved successfully.


In [77]:
import faiss
import numpy as np



# Initialize the index
index = faiss.IndexFlatL2(embeddings_np.shape[1])  # Assuming embeddings are of the same dimension

# Add embeddings to the index
index.add(embeddings_np)

# Search for similar embeddings
D, I = index.search(np.array(embeddings_np), k=2)  # k=2 returns the closest two vectors (including self)

high_sim = []
titles = list(bbc_news.keys())

# To keep track of which titles have been printed
printed_titles = set()

for i in range(len(I)):
    title1 = titles[i]
    closest_index = I[i][1]  # I[i][0] will be the index of itself, so we take the second closest
    title2 = titles[closest_index]
    similarity_score = 1 - D[i][1]  # cosine similarity is 1 - L2 distance
    if title1 not in printed_titles and title2 not in printed_titles and similarity_score >-0.4:
        print(f"Title 1: '{title1}'\nTitle 2: '{title2}'\nSimilarity Score: {similarity_score}")
        high_sim.append((title1, title2, similarity_score))
        printed_titles.add(title1)
        printed_titles.add(title2)

Title 1: 'French Open 2024: Draw, schedule, order of play & will Raducanu, Murray & Nadal feature?'
Title 2: 'Newspaper headlines: 'Faltering election start' and 'General ejection''
Similarity Score: -0.365966796875
Title 1: ' Majorca building collapse: Four dead and 16 injured, rescuers say'
Title 2: 'Government 'must go further' for a Hillsborough Law'
Similarity Score: -0.3289794921875
Title 1: 'Target to cut court backlog cannot be met, says watchdog'
Title 2: 'Icons of Football: McAvennie, Robertson, Jordan, Miller, Beattie, Greig'
Similarity Score: -0.268798828125
Title 1: 'US reviews co-operation with Georgia over 'foreign agent' law'
Title 2: 'Nelly Korda's dominance 'great' for women's golf - Georgia Hall'
Similarity Score: -0.39892578125
Title 1: 'New arena ticket levy will 'make a huge difference', West country venues say'
Title 2: 'Shop sales slump in April hit by heavy rain'
Similarity Score: -0.1796875
Title 1: 'Dorset's Big Picture: 20 May - 26 May 2024'
Title 2: 'Oxford

Try other types of  clustering methods such as :

Density-based spatial clustering (DBSC) : a density-based clustering algorithm that works on the assumption that clusters are dense regions in space separated by regions of lower density. It groups 'densely grouped' data points into a single cluster.

SPECTURAL CLUSTERING : uses information from the eigenvalues (spectrum) of special matrices (i.e. Affinity Matrix, Degree Matrix and Laplacian Matrix) derived from the graph or the data set.
DBA - ALOCATION


# Next steps

### 1

1) **SKY :** some urls are the same article from sky news and sky sports. check if they have same text and remove if they do?

2) **SKY & BBC :** Make sure BBC and SKY only have news and sports sections that are important to us. 

3) choose more websites