## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import re
from lxml import etree
import json
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances,manhattan_distances
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sitemaps_utils import *


  from .autonotebook import tqdm as notebook_tqdm


## Data Collection

#### BBC

In [2]:
BBC_news_sitemaps = ["https://www.bbc.com/sitemaps/https-sitemap-com-news-1.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-2.xml",
                     "https://www.bbc.com/sitemaps/https-sitemap-com-news-3.xml"]

sky_news_sitemaps = ["https://news.sky.com/sitemap/sitemap-news.xml",
                    "https://www.skysports.com/sitemap/sitemap-news.xml"]

namespaces = {
    'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9',
    'news': 'http://www.google.com/schemas/sitemap-news/0.9'
}

urls = {}
# BBC News uses 'sitemap:lastmod' for the date tag
urls["bbc"] = Extract_todays_urls_from_sitemaps(BBC_news_sitemaps, namespaces, 'sitemap:lastmod')
print(f"Number of BBC URLs: {len(urls['bbc'])}")

# Sky News uses 'news:publication_date' for the date tag
urls["sky"] = Extract_todays_urls_from_sitemaps(sky_news_sitemaps, namespaces, 'news:publication_date')
print(f"Number of Sky URLs: {len(urls['sky'])}")


Number of BBC URLs: 473
Number of Sky URLs: 21


## Data Processing

In [35]:
df_BBC[(df_BBC.Topic == "news") or (df_BBC.Topic == "sport")]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [3]:
bbc_topics_to_drop = {"pidgin", "hausa", "swahili", "naidheachdan"}
df_BBC = process_news_data(urls, "bbc", bbc_topics_to_drop)

sky_topics_to_drop = {"arabic", "urdu"}
df_Sky = process_news_data(urls, "sky", sky_topics_to_drop)


INFO:langid.langid:initializing identifier


------ bbc ------
Topic
news         161
sport         45
weather       15
newsround      7
videos         2
Total        230

------ sky ------
Topic
story       15
olympics     1
football     1
f1           1
golf         1
boxing       1
Total        20



## Data cleaning

In [20]:
# news_sport = ["news","sport"]
# news_sport_urls = df_BBC[df_BBC.Topic.isin(news_sport)]
# news_sport_urls.reset_index(inplace=True)

In [4]:
#bbc_news_df = df_BBC[df_BBC.Topic == "news"].reset_index(drop=True)
bbc_news = request_sentences_from_urls(df_BBC)

INFO:root:
Processing URL 0/230
INFO:root:
Processing URL 10/230
INFO:root:
Processing URL 20/230
INFO:root:
Processing URL 30/230
INFO:root:
Processing URL 40/230
INFO:root:
Processing URL 50/230
INFO:root:
Processing URL 60/230
INFO:root:
Processing URL 70/230
INFO:root:
Processing URL 80/230
ERROR:root:Failed to fetch the web page: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
INFO:root:
Processing URL 90/230
INFO:root:
Processing URL 100/230
INFO:root:
Processing URL 110/230
INFO:root:
Processing URL 120/230
INFO:root:
Processing URL 130/230
INFO:root:
Processing URL 140/230
INFO:root:
Processing URL 150/230
INFO:root:
Processing URL 160/230
ERROR:root:Failed to fetch the web page: HTTPSConnectionPool(host='www.bbc.com', port=443): Max retries exceeded with url: /newsround/articles/c1vv54yk9dvo (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))
INFO:root:
Proc

## Embeding and similarity check

In [6]:
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import re

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

article_main_body = list(bbc_news.values())

# Load tokenizer and model
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
print(encoder.device)

# Initialize an empty list to store embeddings
embeddings = []

# Process and encode each article body
for i, data in enumerate(article_main_body):
    # Tokenize with padding
    inputs = tokenizer("".join(data), return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)  # Move inputs to GPU
    with torch.no_grad():  # No need to track gradients during inference
        embedding = encoder(**inputs).pooler_output
    embeddings.append(embedding)
    if i % 10 == 0:
        print(f"{i // 10}/{len(article_main_body) // 10}")



Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.i

cuda:0
0/21
1/21
2/21
3/21
4/21
5/21
6/21
7/21
8/21
9/21
10/21
11/21
12/21
13/21
14/21
15/21
16/21
17/21
18/21
19/21
20/21
21/21


In [7]:
# Convert embeddings tensor to numpy arrays
embeddings_np = [embedding.cpu().numpy() for embedding in embeddings]

# Convert embeddings to float32
embeddings_np = [embedding.astype('float32') for embedding in embeddings_np]
embeddings_np = np.array(embeddings_np).reshape(len(embeddings_np), 768)

In [8]:
import pickle


content_embedding = (list(bbc_news.values()),embeddings_np)

with open('content_embedding.pkl', 'wb') as file:
    pickle.dump(content_embedding, file)

print("Embeddings saved successfully.")

Embeddings saved successfully.


In [15]:
1 - D[10][1]

-2.3717041015625

In [27]:
for i in range(1,3,1):
    print(i)

1
2


In [11]:
import faiss
import numpy as np



# Initialize the index
index = faiss.IndexFlatL2(embeddings_np.shape[1])  # Assuming embeddings are of the same dimension

# Add embeddings to the index
index.add(embeddings_np)

# Search for similar embeddings
D, I = index.search(np.array(embeddings_np), k=2)  # k=2 returns the closest two vectors (including self)

high_sim = []
titles = list(bbc_news.keys())

# To keep track of which titles have been printed
printed_titles = set()

for i in range(len(I)):
    title1 = titles[i]
    closest_index = I[i][1]  # I[i][0] will be the index of itself, so we take the second closest
    title2 = titles[closest_index]
    similarity_score = 1 - D[i][1]  # cosine similarity is 1 - L2 distance

    if title1 not in printed_titles and title2 not in printed_titles:
        print(f"Title 1: '{title1}'\nTitle 2: '{title2}'\nSimilarity Score: {similarity_score}")
        high_sim.append((title1, title2, similarity_score))
        printed_titles.add(title1)
        printed_titles.add(title2)



INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.


Title 1: 'One-minute World News'
Title 2: 'BBC Archive 1948: Motor racing at Goodwood'
Similarity Score: -8.1224365234375
Title 1: 'What is the infected blood scandal and will victims get compensation?'
Title 2: 'Infected blood scandal: Sunak promises 'comprehensive' blood compensation'
Similarity Score: -0.6851806640625
Title 1: 'East Midlands weather forecast'
Title 2: 'Channel Islands weather forecast'
Similarity Score: -2.74560546875
Title 1: 'South England weather forecast'
Title 2: 'South West England weather forecast'
Similarity Score: -1.4447021484375
Title 1: 'East Yorkshire & Lincolnshire weather forecast'
Title 2: 'Yorkshire weather forecast'
Similarity Score: -4.03564453125
Title 1: 'South Africa in West Indies 2024 - fixtures, results & scorecards'
Title 2: 'Hull: Images released of new cycle scheme'
Similarity Score: -3.998779296875
Title 1: 'Your pictures of Scotland: Photographs from around the country'
Title 2: 'Incredible new photos of the Milky Way from around the wo

Try other types of  clustering methods such as :

Density-based spatial clustering (DBSC) : a density-based clustering algorithm that works on the assumption that clusters are dense regions in space separated by regions of lower density. It groups 'densely grouped' data points into a single cluster.

SPECTURAL CLUSTERING : uses information from the eigenvalues (spectrum) of special matrices (i.e. Affinity Matrix, Degree Matrix and Laplacian Matrix) derived from the graph or the data set.
DBA - ALOCATION


# Next steps

### 1

1) **SKY :** some urls are the same article from sky news and sky sports. check if they have same text and remove if they do?

2) **SKY & BBC :** Make sure BBC and SKY only have news and sports sections that are important to us. 

3) choose more websites