## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import re
from lxml import etree
import json
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances,manhattan_distances
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sitemaps_utils import *


  from .autonotebook import tqdm as notebook_tqdm


## Data Collection

#### BBC

In [2]:
BBC_news_sitemaps = ["https://www.bbc.com/sitemaps/https-sitemap-com-news-1.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-2.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-3.xml"]

sky_news_sitemaps = ["https://news.sky.com/sitemap/sitemap-news.xml",
                    "https://www.skysports.com/sitemap/sitemap-news.xml"]

namespaces = {
    'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9',
    'news': 'http://www.google.com/schemas/sitemap-news/0.9'
}

urls = {}
# BBC News uses 'sitemap:lastmod' for the date tag
urls["bbc"] = Extract_todays_urls_from_sitemaps(BBC_news_sitemaps, namespaces, 'sitemap:lastmod')
print(f"Number of BBC URLs: {len(urls['bbc'])}")

# Sky News uses 'news:publication_date' for the date tag
urls["sky"] = Extract_todays_urls_from_sitemaps(sky_news_sitemaps, namespaces, 'news:publication_date')
print(f"Number of Sky URLs: {len(urls['sky'])}")


Number of BBC URLs: 780
Number of Sky URLs: 52


## Data Processing

In [3]:
bbc_topics_to_drop = {"pidgin", "hausa", "swahili", "naidheachdan"}
df_BBC = process_news_data(urls, "bbc", bbc_topics_to_drop)

sky_topics_to_drop = {"arabic", "urdu"}
df_Sky = process_news_data(urls, "sky", sky_topics_to_drop)


INFO:langid.langid:initializing identifier


------ bbc ------
Topic
news         257
sport        109
weather       26
videos        13
newsround     10
Total        415

------ sky ------
Topic
story          33
football        5
olympics        2
rugby-union     1
cricket         1
tennis          1
more-sports     1
golf            1
boxing          1
f1              1
Total        47



## Data cleaning

In [4]:
# news_sport = ["news","sport"]
# news_sport_urls = df_BBC[df_BBC.Topic.isin(news_sport)]
# news_sport_urls.reset_index(inplace=True)

In [7]:
timeout = 20  # Timeout value
async def main():
    articles = await request_sentences_from_urls_async(df_BBC, timeout)
    return articles

bbc_news = await main()


INFO:root:
Processing URL 0/415
INFO:root:
Processing URL 10/415
INFO:root:
Processing URL 20/415
INFO:root:
Processing URL 30/415
INFO:root:
Processing URL 40/415
INFO:root:
Processing URL 50/415
INFO:root:
Processing URL 60/415
INFO:root:
Processing URL 70/415
INFO:root:
Processing URL 80/415
INFO:root:
Processing URL 90/415
INFO:root:
Processing URL 100/415
INFO:root:
Processing URL 110/415
INFO:root:
Processing URL 120/415
INFO:root:
Processing URL 130/415
INFO:root:
Processing URL 140/415
INFO:root:
Processing URL 150/415
INFO:root:
Processing URL 160/415
INFO:root:
Processing URL 170/415
INFO:root:
Processing URL 180/415
INFO:root:
Processing URL 190/415
INFO:root:
Processing URL 200/415
INFO:root:
Processing URL 210/415
INFO:root:
Processing URL 220/415
INFO:root:
Processing URL 230/415
INFO:root:
Processing URL 240/415
INFO:root:
Processing URL 250/415
INFO:root:
Processing URL 260/415
INFO:root:
Processing URL 270/415
INFO:root:
Processing URL 280/415
INFO:root:
Processing URL

## Embeding and similarity check

In [27]:
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import re

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

article_main_body = list(bbc_news.values())

# Load tokenizer and model
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
if torch.cuda.device_count() > 1:
    encoder = torch.nn.DataParallel(encoder)

print("Using device:", device)

# Initialize an empty list to store embeddings
embeddings = []

# Process and encode each article body
for i, data in enumerate(article_main_body):
    # Tokenize with padding
    inputs = tokenizer("".join(data), return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)  # Move inputs to GPU
    with torch.no_grad():  # No need to track gradients during inference
        embedding = encoder(**inputs).pooler_output
    embeddings.append(embedding)
    if i % 10 == 0:
        print(f"{i // 10}/{len(article_main_body) // 10}")

print("Encoding completed.")


Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.i

Using device: cuda
0/40
1/40
2/40
3/40
4/40
5/40
6/40
7/40
8/40
9/40
10/40
11/40
12/40
13/40
14/40
15/40
16/40
17/40
18/40
19/40
20/40
21/40
22/40
23/40
24/40
25/40
26/40
27/40
28/40
29/40
30/40
31/40
32/40
33/40
34/40
35/40
36/40
37/40
38/40
39/40
40/40
Encoding completed.


In [28]:
# Convert embeddings tensor to numpy arrays
embeddings_np = [embedding.cpu().numpy() for embedding in embeddings]

# Convert embeddings to float32
embeddings_np = [embedding.astype('float32') for embedding in embeddings_np]
embeddings_np = np.array(embeddings_np).reshape(len(embeddings_np), 768)

In [20]:
import pickle


content_embedding = (list(bbc_news.values()),embeddings_np)

with open('content_embedding.pkl', 'wb') as file:
    pickle.dump(content_embedding, file)

print("Embeddings saved successfully.")

Embeddings saved successfully.


In [24]:
len(I)

408

In [30]:
import faiss
import numpy as np



# Initialize the index
index = faiss.IndexFlatL2(embeddings_np.shape[1])  # Assuming embeddings are of the same dimension

# Add embeddings to the index
index.add(embeddings_np)

# Search for similar embeddings
D, I = index.search(np.array(embeddings_np), k=2)  # k=2 returns the closest two vectors (including self)

high_sim = []
titles = list(bbc_news.keys())

# To keep track of which titles have been printed
printed_titles = set()

for i in range(len(I)):
    title1 = titles[i]
    closest_index = I[i][1]  # I[i][0] will be the index of itself, so we take the second closest
    title2 = titles[closest_index]
    similarity_score = 1 - D[i][1]  # cosine similarity is 1 - L2 distance
    if title1 not in printed_titles and title2 not in printed_titles:
        print(f"Title 1: '{title1}'\nTitle 2: '{title2}'\nSimilarity Score: {similarity_score}")
        high_sim.append((title1, title2, similarity_score))
        printed_titles.add(title1)
        printed_titles.add(title2)



Title 1: 'One-minute World News'
Title 2: 'Watch Newsround - signed and subtitled'
Similarity Score: -7.5859375
Title 1: 'What is the infected blood scandal and will victims get compensation?'
Title 2: 'Contaminated blood inquiry: Victim demands compensation clarity'
Similarity Score: -0.6265869140625
Title 1: 'South East England weather forecast'
Title 2: 'North East England weather forecast'
Similarity Score: -0.6766357421875
Title 1: 'East Midlands weather forecast'
Title 2: 'Middle East weather forecast'
Similarity Score: -1.239990234375
Title 1: 'South England weather forecast'
Title 2: 'South America weather forecast'
Similarity Score: -0.959228515625
Title 1: 'East Yorkshire & Lincolnshire weather forecast'
Title 2: 'Yorkshire weather forecast'
Similarity Score: -3.1702880859375
Title 1: 'South Africa in West Indies 2024 - fixtures, results & scorecards'
Title 2: 'England Euro 2024 squad: Marcus Rashford and Jordan Henderson left out of Gareth Southgate's provisional squad'
Simi

Try other types of  clustering methods such as :

Density-based spatial clustering (DBSC) : a density-based clustering algorithm that works on the assumption that clusters are dense regions in space separated by regions of lower density. It groups 'densely grouped' data points into a single cluster.

SPECTURAL CLUSTERING : uses information from the eigenvalues (spectrum) of special matrices (i.e. Affinity Matrix, Degree Matrix and Laplacian Matrix) derived from the graph or the data set.
DBA - ALOCATION


# Next steps

### 1

1) **SKY :** some urls are the same article from sky news and sky sports. check if they have same text and remove if they do?

2) **SKY & BBC :** Make sure BBC and SKY only have news and sports sections that are important to us. 

3) choose more websites