# Download and Import dependencies

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m 

In [None]:
import os
import re
import time
from collections import Counter
from datetime import datetime

import faiss
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing Data

In [None]:
# Load the headlines CSV (the path is relative to the notebook's working
# directory) and preview the first rows.
df=pd.read_csv("abcnews-date-text.csv")
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85041 entries, 0 to 85040
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   85041 non-null  int64 
 1   headline_text  85041 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [None]:
# Pull the headline column out as a plain Python list and peek at the first five.
data = df["headline_text"].tolist()
data[:5]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers']

In [None]:
# Preprocessing and Cleaning
# Fetch the NLTK stopword corpus, then build the two notebook-level helpers that
# preprocess_text reads: an English stopword set and a Porter stemmer.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preprocess and clean the text
def preprocess_text(text):
    """Normalise one piece of text for embedding.

    The text is lowercased, stripped of every character that is neither a word
    character nor whitespace, filtered against the notebook-level ``stop_words``
    set, and stemmed with the notebook-level ``stemmer``.

    Args:
        text (str): Raw text.

    Returns:
        str: The surviving stems joined by single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Stopword filtering happens on the unstemmed token, stemming afterwards.
    stems = [stemmer.stem(token) for token in cleaned.split() if token not in stop_words]
    return ' '.join(stems)

In [None]:
# Preprocess a list of text data
def preprocess_data(data):
    """Apply ``preprocess_text`` to every item of ``data``.

    Args:
        data (iterable of str): Raw texts.

    Returns:
        list: Preprocessed texts, in the same order as the input.
    """
    return list(map(preprocess_text, data))


In [None]:
# Preprocess a query text
def preprocess_query(query):
    """Preprocess a single query string exactly like the indexed documents.

    Args:
        query (str): Raw query text.

    Returns:
        str: The query after ``preprocess_text`` has been applied.
    """
    return preprocess_text(query)


In [None]:
# Preprocess the data
# Every headline goes through preprocess_text; the resulting list keeps the
# same order (and therefore the same positions) as `data`.
preprocessed_data = preprocess_data(data)

In [None]:
# View the first 5 preprocessed headlines
preprocessed_data[:5]

['aba decid commun broadcast licenc',
 'act fire wit must awar defam',
 'g call infrastructur protect summit',
 'air nz staff aust strike pay rise',
 'air nz strike affect australian travel']

# Sentence-BERT

In [None]:
# Initialize the SentenceTransformer model
# (the pretrained files are downloaded on first use, as the progress output shows).
# NOTE(review): this checkpoint is reportedly marked deprecated on its model
# card — confirm before reusing it for new work.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Encode the preprocessed data using the SentenceTransformer model
# The embeddings are computed from the stemmed / stopword-free text, not from
# the raw headlines, and are later saved with np.save and fed to FAISS.
encoded_data = model.encode(preprocessed_data)

In [None]:
# Persist the embeddings to the working directory so they can be reloaded
# later without re-encoding.
np.save('encoded_data.npy', encoded_data)
# Load the encoded vectors from the saved file
# (an immediate round trip: encoded_data is rebound to the array read back from disk).
encoded_data = np.load('encoded_data.npy')

# FAISS + SBERT

In [None]:
# Step 1: Create the flat inner-product index that serves as the coarse
# quantizer. The dimensionality is read from the embeddings themselves, so
# swapping the encoder for one with a different output width needs no edit here.
embedding_dim = encoded_data.shape[1]
index_flat = faiss.IndexFlatIP(embedding_dim)

# Step 2: Wrap the quantizer in an inverted-file index that partitions the
# vectors into `num_cells` cells and scores candidates by inner product.
num_cells = 100
index_ivf = faiss.IndexIVFFlat(index_flat, embedding_dim, num_cells, faiss.METRIC_INNER_PRODUCT)

# Step 3: Train the index on the embeddings. Training only fits the cell
# centroids; no vector is stored in the index yet.
index_ivf.train(encoded_data)

# Step 4: Add the vectors, using each row position as its id so a returned id
# can be used directly as a position in `data` / `df`. The ids are built as
# int64 explicitly, which is the id type FAISS expects on every platform.
index_ivf.add_with_ids(encoded_data, np.arange(len(encoded_data), dtype=np.int64))

In [None]:
# Save the FAISS index to a file
# (written to the working directory under the bare name 'abc_news').
faiss.write_index(index_ivf, 'abc_news')

In [None]:
# Read the FAISS index from a file
# `index` (not `index_ivf`) is the handle that search() queries.
# NOTE(review): an IVF index visits only a limited number of cells per query
# (its nprobe setting); consider raising it if recall matters — confirm against the FAISS docs.
index = faiss.read_index('abc_news')

# Query Search

In [None]:
def format_result(results, similarity_scores, publication_dates, hot_word):
    """
    Render each search hit as a multi-line, human-readable string.

    The three sequences are walked in lockstep, so the output has as many
    entries as the shortest of them.

    Args:
        results (list): Headline text of each hit.
        similarity_scores (list): Numeric score of each hit (printed with 4 decimals).
        publication_dates (list): Publication date of each hit.
        hot_word (str): Word shared by the hits; repeated in every entry.

    Returns:
        list: One formatted string per hit.
    """
    template = "Result: {}\nSimilarity Score: {:.4f}\nPublication Date: {}\nHot Word: {}\n"
    return [
        template.format(headline, score, date, hot_word)
        for headline, score, date in zip(results, similarity_scores, publication_dates)
    ]


In [None]:
def search(query, k=5):
    """
    Performs a similarity search based on the given query.

    The query is preprocessed and embedded the same way as the indexed
    headlines, then looked up in the FAISS index. The index was built with
    METRIC_INNER_PRODUCT, so the values FAISS returns are inner products:
    they already are similarity scores (higher means more similar) and are
    reported as-is. They are not cosine similarities unless the vectors were
    L2-normalised before indexing.

    Args:
        query (str): The search query.
        k (int): Number of nearest neighbours to retrieve. Defaults to 5.

    Returns:
        list: Formatted search results (possibly fewer than ``k``, or empty).
    """
    t = time.time()
    # Preprocess the query
    preprocessed_query = preprocess_query(query)

    # Encode the preprocessed query using Sentence-BERT model
    query_vector = model.encode([preprocessed_query])

    # Perform similarity search using FAISS index
    inner_products, indices = index.search(query_vector, k)
    hit_ids = indices.flatten().astype(int)

    # FAISS pads the id array with -1 when it finds fewer than k neighbours;
    # without this filter a -1 would silently select the LAST headline.
    found = hit_ids >= 0
    hit_ids = hit_ids[found]
    similarity_scores = inner_products.flatten()[found]

    if hit_ids.size == 0:
        print('Total time: {}'.format(time.time() - t))
        return []

    # Get publication dates and headline texts for the search results
    publication_dates = df['publish_date'].iloc[hit_ids].tolist()
    selected_data = [data[i] for i in hit_ids]

    # Extract the most frequent word across the retrieved headlines
    try:
        count_vectorizer = CountVectorizer()
        count_matrix = count_vectorizer.fit_transform(selected_data)
        feature_names = count_vectorizer.get_feature_names_out()
        hot_word = feature_names[count_matrix.sum(axis=0).argmax()]
    except ValueError:
        # Raised when no token survives the vectorizer's tokenisation
        # (empty vocabulary); there is no hot word to report then.
        hot_word = ''

    # Format the search results
    formatted_results = format_result(selected_data, similarity_scores, publication_dates, hot_word)

    # Print the total execution time
    print('Total time: {}'.format(time.time() - t))

    return formatted_results


In [None]:
# Prompt for a query, run the search, and print each formatted hit.
query = input("Enter your query: ")
results = search(query)
print('Results:')
for result in results:
    print(result)


Enter your query: school
Total time: 0.031479835510253906
Results:
Result: sars threat closes beijing schools
Similarity Score: -221.7443
Publication Date: 20030423
Hot Word: school

Result: forums to consider school leaving age
Similarity Score: -219.3739
Publication Date: 20040209
Hot Word: school

Result: education minister raises school leaving age at
Similarity Score: -214.3020
Publication Date: 20040211
Hot Word: school

Result: pm encourages debate over schools
Similarity Score: -213.1128
Publication Date: 20040126
Hot Word: school

Result: union highlights school concerns
Similarity Score: -212.8883
Publication Date: 20030808
Hot Word: school



In [None]:
# Prompt for a query, run the search, and print each formatted hit.
query = input("Enter your query: ")
results = search(query)
print('Results:')
for result in results:
    print(result)

Enter your query: How high school football coach is focusing on mental health
Total time: 0.028017282485961914
Results:
Result: drivers urged to take care as school resumes
Similarity Score: -164.4830
Publication Date: 20031006
Hot Word: school

Result: detainee children will attend port augusta schools
Similarity Score: -162.4056
Publication Date: 20030226
Hot Word: school

Result: parents teachers await schools review
Similarity Score: -161.0366
Publication Date: 20040415
Hot Word: school

Result: child detainees to attend primary school in sa
Similarity Score: -159.8206
Publication Date: 20030306
Hot Word: school

Result: support for driver ed school to continue
Similarity Score: -158.2196
Publication Date: 20030605
Hot Word: school



In [None]:
# Prompt for a query, run the search, and print each formatted hit.
query = input("Enter your query: ")
results = search(query)
print('Results:')
for result in results:
    print(result)

Enter your query: How high school football coach is focusing on mental health
Total time: 0.030501127243041992
Results:
Result: drivers urged to take care as school resumes
Similarity Score: -164.4830
Publication Date: 20031006
Hot Word: school

Result: detainee children will attend port augusta schools
Similarity Score: -162.4056
Publication Date: 20030226
Hot Word: school

Result: parents teachers await schools review
Similarity Score: -161.0366
Publication Date: 20040415
Hot Word: school

Result: child detainees to attend primary school in sa
Similarity Score: -159.8206
Publication Date: 20030306
Hot Word: school

Result: support for driver ed school to continue
Similarity Score: -158.2196
Publication Date: 20030605
Hot Word: school



In [None]:
# Prompt for a query, run the search, and print each formatted hit.
query = input("Enter your query: ")
results = search(query)
print('Results:')
for result in results:
    print(result)

Enter your query: Kourtney Kardashian announces she is pregnant with Travis Barker at Blink-182 concert
Total time: 0.03139042854309082
Results:
Result: scream queen sharapova shrieks into semi finals
Similarity Score: -202.9151
Publication Date: 20030614
Hot Word: queen

Result: pop star anastacia recalls breast cancer shock
Similarity Score: -200.5634
Publication Date: 20040331
Hot Word: queen

Result: britney spears seeks annulment of vegas wedding
Similarity Score: -192.7312
Publication Date: 20040106
Hot Word: queen

Result: agassi advances serena smashes schett
Similarity Score: -189.6812
Publication Date: 20030531
Hot Word: queen

Result: queen leaves hospital after double surgery
Similarity Score: -189.3689
Publication Date: 20031213
Hot Word: queen



# Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, k=5):
    """
    Retrieve the nearest headlines with FAISS and score them by cosine similarity.

    The candidates and their order come from the FAISS index. Each candidate's
    score is the cosine similarity between the query embedding and that
    candidate's stored embedding in ``encoded_data`` (row ``i`` of
    ``encoded_data`` belongs to ``data[i]``, the id it was indexed under).
    Because the order is FAISS's, the cosine scores are not necessarily
    monotonically decreasing.

    Args:
        query (str): The search query.
        k (int): Number of nearest neighbours to retrieve. Defaults to 5.

    Returns:
        list: Formatted search results (possibly fewer than ``k``, or empty).
    """
    t = time.time()
    preprocessed_query = preprocess_query(query)
    query_vector = model.encode([preprocessed_query])
    _, indices = index.search(query_vector, k)

    # Drop the -1 ids FAISS uses as padding when fewer than k hits exist.
    hit_ids = indices.flatten().astype(int)
    hit_ids = hit_ids[hit_ids >= 0]

    if hit_ids.size:
        # Score only the retrieved documents, reusing the embeddings that were
        # indexed. Re-encoding the whole corpus per query is slow and produces
        # one score per corpus row rather than one per hit.
        similarity_scores = cosine_similarity(query_vector, encoded_data[hit_ids])[0]
    else:
        similarity_scores = np.empty(0)

    publication_dates = df['publish_date'].iloc[hit_ids].tolist()  # Get publication dates
    formatted_results = format_result([data[i] for i in hit_ids], similarity_scores, publication_dates)

    print('Total time: {}'.format(time.time() - t))
    return formatted_results


In [None]:
 #Result Formatting
def format_result(result, similarity_scores, publication_dates, hot_word=None):
    """Format search hits as score / date / headline text blocks.

    This definition replaces the earlier four-argument ``format_result`` once
    its cell has run. ``hot_word`` is therefore accepted as an optional fourth
    argument so that callers written against the earlier signature keep
    working instead of raising ``TypeError``.

    Args:
        result (iterable): Headline text of each hit.
        similarity_scores (iterable): Numeric score of each hit (printed with 4 decimals).
        publication_dates (iterable): Publication date of each hit.
        hot_word (str, optional): When given, a ``Hot Word`` line is appended
            to every entry. Omitted by default.

    Returns:
        list: One formatted string per hit; the inputs are walked in lockstep,
        so the length is that of the shortest input.
    """
    formatted_results = []
    for res, sim_score, pub_date in zip(result, similarity_scores, publication_dates):
        formatted_result = (
            f'Similarity Score: {sim_score:.4f}\n'
            f'Publication Date: {pub_date}\n'
            f'{res}\n'
        )
        if hot_word is not None:
            formatted_result += f'Hot Word: {hot_word}\n'
        formatted_results.append(formatted_result)
    return formatted_results

In [None]:
# Prompt for a query, run the search, and print each formatted hit.
query = input("Enter your query: ")
results = search(query)
print('Results:')
for result in results:
    print(result)

Enter your query: How high school football coach is focusing on mental health
Total time: 35.056047201156616
Results:
Similarity Score: 0.2944
Publication Date: 20031006
drivers urged to take care as school resumes

Similarity Score: 0.3653
Publication Date: 20030226
detainee children will attend port augusta schools

Similarity Score: 0.3878
Publication Date: 20040415
parents teachers await schools review

Similarity Score: 0.3174
Publication Date: 20030306
child detainees to attend primary school in sa

Similarity Score: 0.2911
Publication Date: 20030605
support for driver ed school to continue

