# Building a Context-Aware Content Recommendation System Using BERT Embeddings | News Article Recommendations Using Sentence Transformers


# Step 1: Import Required Libraries


In [35]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Load Your Dataset


In [36]:
# Load the articles CSV, decoding it as ISO-8859-1 (Latin-1) instead of
# pandas' default UTF-8 -- presumably the file contains bytes that are not
# valid UTF-8 (TODO confirm).
# NOTE(review): '/content/Articles.csv' is an absolute path, so the file must
# exist at exactly that location for this cell to run.
df = pd.read_csv('/content/Articles.csv', encoding='ISO-8859-1')

# A bare expression on the last line makes the notebook render the DataFrame.
df

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business
...,...,...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",3/26/2017,Pak China relations not against any third coun...,business
2689,strong>WASHINGTON: Uber has grounded its fleet...,3/26/2017,Uber grounds self driving cars after accid,business
2690,strong>BEIJING: The New Development Bank plans...,3/27/2017,New Development Bank plans joint investments i...,business


In [37]:
# Count missing values per column; the resulting Series is the cell's output.
df.isnull().sum()

Unnamed: 0,0
Article,0
Date,0
Heading,0
NewsType,0


# Step 3: Preprocessing the Text Data


In [38]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK resources. The 'stopwords' corpus is read just below.
# NOTE(review): 'punkt' is a tokenizer model, but the preprocessing shown in
# this notebook tokenizes with str.split(), so this download may be
# unnecessary -- confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')


# English stop words held in a set (fast membership tests) and one Porter
# stemmer instance; both are module-level names that preprocess_text reads.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Preprocessing function to clean text (removes punctuation, converts to lowercase, removes stopwords, applies stemming)
def preprocess_text(text):
    """Normalize a piece of text before it is embedded.

    Steps: lowercase, delete every character that is neither a word
    character nor whitespace, split on whitespace, drop English stop words,
    Porter-stem the remaining tokens, and re-join them with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned, space-separated tokens; ``""`` when nothing is left.
    """
    # Missing values would otherwise crash on ``.lower()``. NaN is the only
    # float that is not equal to itself, so no extra import is needed here.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()

    # Punctuation is deleted rather than replaced by a space, so a token such
    # as "0.66" becomes "066" instead of being split in two.
    stripped = re.sub(r'[^\w\s]', '', lowered)

    # Whitespace tokenization; stop words are filtered out before stemming.
    kept = [stemmer.stem(token) for token in stripped.split() if token not in stop_words]

    return " ".join(kept)

# Clean every headline and store the result in a new 'cleaned_heading' column;
# the original 'Heading' column is left unchanged. Any other text column could
# be processed the same way.
df['cleaned_heading'] = df['Heading'].apply(preprocess_text)

# Display the frame, which now includes the new column.
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Article,Date,Heading,NewsType,cleaned_heading
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,sindh govt decid cut public transport fare 7pc...
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,asia stock new year trad
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,hong kong stock open 066 percent lower
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,asian stock sink euro near nine year
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,us oil price slip 50 barr
...,...,...,...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business,laptop ban hit dubai 11m weekend travel
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",3/26/2017,Pak China relations not against any third coun...,business,pak china relat third countri shaukat aziz
2689,strong>WASHINGTON: Uber has grounded its fleet...,3/26/2017,Uber grounds self driving cars after accid,business,uber ground self drive car accid
2690,strong>BEIJING: The New Development Bank plans...,3/27/2017,New Development Bank plans joint investments i...,business,new develop bank plan joint invest economi alo...


# Step 4: Initialize The Sentence Transformer Model

In [39]:
# Load the pre-trained 'all-MiniLM-L6-v2' Sentence Transformer model. The
# progress bars in this cell's output appear to be the model files being
# downloaded.
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can use other models if you prefer

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Step 5: Text Embeddings Using BERT


In [40]:
# Embed every cleaned headline in a single encode() call. The list is built in
# DataFrame row order, and the recommendation function later looks rows up
# with df.iloc, so row i of `embeddings` is expected to belong to df.iloc[i].
headings = df['cleaned_heading'].tolist()  # 'cleaned_heading' is created by the preprocessing step
# convert_to_tensor=True asks for a tensor result (see the tensor printed by
# the following cells).
embeddings = model.encode(headings, convert_to_tensor=True)

In [43]:
# Display the embeddings tensor as a quick visual check.
embeddings

tensor([[-0.0313,  0.0338, -0.0093,  ..., -0.0394,  0.0069,  0.0495],
        [-0.0520,  0.0016, -0.0221,  ..., -0.1266, -0.0875,  0.0417],
        [ 0.0940, -0.0362,  0.0582,  ..., -0.1183, -0.0391,  0.0702],
        ...,
        [ 0.0069, -0.0256,  0.0097,  ...,  0.0711,  0.0768, -0.0734],
        [ 0.0464, -0.0076, -0.0285,  ..., -0.1167,  0.0468, -0.0332],
        [-0.1320,  0.1207,  0.0169,  ..., -0.0530, -0.0112, -0.0319]])

In [44]:
# Display the embedding vector stored at index 0.
embeddings[0]

tensor([-3.1311e-02,  3.3842e-02, -9.3314e-03,  5.3722e-02, -4.4769e-02,
         8.9522e-03,  4.7104e-02, -7.4335e-03, -3.6179e-02,  6.6231e-02,
         2.2116e-02, -6.3882e-02,  1.2157e-02,  2.7805e-02, -1.8594e-02,
        -8.3610e-02,  1.6920e-02,  6.8192e-02, -5.9756e-03, -2.9542e-02,
         6.6594e-02,  3.6769e-02, -8.0550e-03,  4.9943e-02, -3.9254e-02,
         8.4563e-03,  5.9706e-02,  1.1833e-02,  2.0560e-02, -3.3311e-02,
        -8.6875e-03, -4.0942e-03, -1.0762e-02,  6.0253e-03, -1.1994e-05,
         4.8461e-02,  3.7607e-03, -7.1833e-02,  4.2102e-02, -2.7863e-02,
         4.5894e-02,  1.4585e-02, -6.1159e-02,  4.9844e-02,  9.6326e-02,
        -2.9634e-02, -2.6796e-02,  4.3725e-02,  2.0296e-02, -2.0286e-02,
         2.3837e-02,  1.1002e-01, -7.6997e-02,  5.1552e-02, -9.5125e-03,
        -1.0269e-01, -4.3023e-02,  1.2116e-02,  1.5209e-02,  4.2475e-02,
        -3.3582e-02, -4.3263e-03,  4.2477e-03, -6.0966e-02,  6.2552e-02,
         8.9488e-03, -5.8474e-02, -9.1306e-02,  5.0

# Save and Load Embeddings

In [45]:
import pickle

# Persist the embeddings so they can be reloaded without re-encoding the
# headlines. 'embeddings.pkl' is a relative path, so the file is written to
# the current working directory.
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [46]:
# Reload the embeddings from 'embeddings.pkl'. The context manager closes the
# file handle as soon as loading finishes instead of leaving it open until
# garbage collection.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you created yourself or otherwise trust.
with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

# Step 6: Recommendations


In [51]:
def _to_host_array(values):
    """Return a torch-style tensor as a CPU NumPy array; pass anything else through."""
    if hasattr(values, "detach") and hasattr(values, "cpu"):
        return values.detach().cpu().numpy()
    return values


# Function to recommend top N articles based on a search query
def recommend_articles_from_search(query, df, model, num_recommendations=5, article_embeddings=None):
    """Return the articles whose headline embeddings are closest to ``query``.

    Args:
        query: Free-text search string. It is passed through
            ``preprocess_text`` so it is cleaned the same way as the
            headlines that were embedded.
        df: DataFrame with 'Heading', 'NewsType', 'Article' and 'Date'
            columns. Rows are looked up by position, so row ``i`` must be the
            article that produced row ``i`` of the embeddings.
        model: Encoder exposing ``encode(text, convert_to_tensor=True)``.
        num_recommendations: Maximum number of articles to return (>= 0).
        article_embeddings: Optional 2-D embeddings, one row per article.
            When omitted, the notebook-level ``embeddings`` variable is used,
            which keeps existing four-argument calls working.

    Returns:
        tuple: ``(recommended_articles, scores)`` -- the selected rows of
        ``df`` (most similar first) and their cosine similarities in the same
        order.

    Raises:
        ValueError: If ``num_recommendations`` is negative.
    """
    if num_recommendations < 0:
        raise ValueError("num_recommendations must be non-negative")

    if article_embeddings is None:
        # Backward-compatible fallback to the notebook-level variable.
        article_embeddings = embeddings

    # Step 0: Clean the query exactly like the indexed headlines
    cleaned_query = preprocess_text(query)

    # Step 1: Get the embedding for the search query
    query_embedding = model.encode(cleaned_query, convert_to_tensor=True)

    # Step 2: Compute cosine similarity between the query and all article
    # embeddings. scikit-learn works on NumPy arrays, and a tensor living on a
    # GPU cannot be converted implicitly, so both inputs are moved to host
    # memory first.
    query_matrix = _to_host_array(query_embedding).reshape(1, -1)
    similarities = cosine_similarity(query_matrix, _to_host_array(article_embeddings)).flatten()

    # Step 3: Indices of the most similar articles, best first. Reversing
    # before truncating matters: the slice [-n:] returns *every* index when
    # n == 0 (because -0 == 0), whereas [:n] correctly returns none.
    top_indices = similarities.argsort()[::-1][:num_recommendations]

    # Step 4: Retrieve the corresponding articles and their similarity scores
    recommended_articles = df.iloc[top_indices][['Heading', 'NewsType', 'Article', 'Date']]
    return recommended_articles, similarities[top_indices]

In [52]:
# Example: search for headlines related to Asian markets. The function returns
# both the matching rows and their similarity scores; only the rows are
# displayed here.
query = "Asian markets upswing"
recommended_articles, similarity_scores = recommend_articles_from_search(query, df, model, num_recommendations=5)

recommended_articles

Unnamed: 0,Heading,NewsType,Article,Date
119,most asian markets up tokyo at 15 year hig,business,Hong Kong: Japanese shares hit a 15-year high ...,4/22/2015
226,asian markets mostly recover from hefty sell off,business,Hong Kong: Most Asia shares rose Wednesday as ...,7/29/2015
2610,Dollar down Trump takes over Asia markets u,business,strong>HONG KONG: The dollar retreated against...,1/23/2017
220,asia markets mostly down shanghai up a 7th str...,business,Hong Kong: Asian markets mostly fell Friday fo...,7/24/2015
546,Asia stocks edge up to four month high after W...,business,strong>TOKYO: Asian shares edged up to a four-...,3/31/2016


In [53]:
# Second example: a place-name query. This reassigns `query`,
# `recommended_articles` and `similarity_scores` from the previous example.
query = "Karachi News"
recommended_articles, similarity_scores = recommend_articles_from_search(query, df, model, num_recommendations=5)

recommended_articles

Unnamed: 0,Heading,NewsType,Article,Date
1230,Islamabad emerge victorious against Karachi,sports,SHARJAH: Islamabad United defeated Karachi Kin...,2/14/2016
1277,Karachi eliminated Islamabad to play Peshawar,sports,DUBAI: Islamabad United defeated Karachi Kings...,2/21/2016
1175,Karachi restrict Islamabad to 132 8,sports,DUBAI: Karachi Kings restricted to Islamabad U...,2/7/2016
1280,PSL Islamabad thump Peshawar to face Quetta in fi,sports,strong>DUBAI: Sharjeel Khans smashing 117 off...,2/22/2016
723,Pakistans stock market dominates Asia Bloomberg,business,strong>ISLAMABAD: Asias best-performing stock...,6/15/2016
