## THIS CODE WAS RUN ON KAGGLE

In [None]:
import pandas as pd
import os
import glob

base_dir = '/kaggle/input/wikipedia-index-and-plaintext-20230801/articles/'

file_paths = glob.glob(os.path.join(base_dir, '*/*.csv'))

dataframes = []

for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
        dataframes.append(df)
        print(f"Successfully loaded: {file_path}")
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Combined dataframe shape: {combined_df.shape}")
else:
    print("No dataframes were successfully loaded.")


Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/y.csv/y.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/o.csv/o.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/s.csv/s.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/1.csv/1.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/i.csv/i.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/3.csv/3.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/q.csv/q.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/h.csv/h.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/5.csv/5.csv
Successfully loaded: /kaggle/input/wikipedia-index-and-plaintext-20230801/articles/2.csv/2.csv
Combined dataframe shape: (3948913, 5)


In [3]:
df.iloc[0]

id                                                    3864
revid                                           1185941266
url               https://en.wikipedia.org/wiki?curid=3864
title                                    2001 World Series
text     The 2001 World Series was the championship ser...
Name: 0, dtype: object

In [4]:
null_counts = df["text"].isnull().sum()
print(null_counts)

249357


In [None]:
import re

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    
    text = str(text)
    
    text = re.sub(r'\s+', ' ', text)  
    text = re.sub(r'[^\w\s]', '', text) 
    return text.lower().strip()

print(f"Missing values before processing: {df['text'].isnull().sum()}")

df['text'] = df['text'].fillna('')

df['clean_text'] = df['text'].apply(preprocess_text)

print("\nSample cleaned text:")
print(df['clean_text'].iloc[0])


Missing values before processing: 249357

Sample cleaned text:
the 2001 world series was the championship series of major league baseballs mlb 2001 season the 97th edition of the world series it was a bestofseven playoff between the national league nl champion arizona diamondbacks and the threetime defending world series champions and american league al champion new york yankees the underdog diamondbacks defeated the heavily favored yankees four games to three to win the series considered one of the greatest world series of all time its memorable aspects included two extrainning games and three lateinning comebacks diamondbacks pitchers randy johnson and curt schilling were both named world series most valuable players the yankees advanced to the world series by defeating the oakland athletics three games to two in the al division series and then the seattle mariners in the al championship series four games to one it was the yankees fourth consecutive world series appearance after winn

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')  # 768-dim

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import torch
from tqdm.auto import tqdm

doc_embeddings = []
batch_size = 32

for i in tqdm(range(0, len(df), batch_size)):
    batch = df['clean_text'].iloc[i:i+batch_size].tolist()
    with torch.no_grad():
        embeddings = model.encode(batch, show_progress_bar=False)
    doc_embeddings.extend(embeddings)

df['embedding'] = doc_embeddings


  0%|          | 0/13688 [00:00<?, ?it/s]

In [12]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import faiss
import numpy as np

embeddings_array = np.array(df['embedding'].tolist()).astype('float32')

dimension = embeddings_array.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)


In [16]:
df.iloc[0]

id                                                         3864
revid                                                1185941266
url                    https://en.wikipedia.org/wiki?curid=3864
title                                         2001 World Series
text          The 2001 World Series was the championship ser...
clean_text    the 2001 world series was the championship ser...
embedding     [-0.052457873, -0.017308295, 0.0279646, 0.0233...
Name: 0, dtype: object

In [None]:
import faiss
import numpy as np

faiss.normalize_L2(embeddings_array)

dimension = embeddings_array.shape[1]  
index = faiss.IndexFlatIP(dimension)  

index.add(embeddings_array)

print(f"Number of vectors in the index: {index.ntotal}")


Number of vectors in the index: 437998


In [None]:
def semantic_search(query_embedding, index, top_k=5):
    """
    Perform semantic search using FAISS index.
    
    Parameters:
    - query_embedding: numpy array of shape (1, dimension)
    - index: FAISS index object
    - top_k: Number of top results to retrieve
    
    Returns:
    - distances: List of distances to top_k results
    - indices: List of indices of top_k results
    """
    faiss.normalize_L2(query_embedding)  # normaliztion
    distances, indices = index.search(query_embedding, top_k)
    return distances, indices

query_embedding = np.random.rand(1, dimension).astype('float32')

distances, indices = semantic_search(query_embedding, index, top_k=5)

print("Distances:", distances)
print("Indices:", indices)


Distances: [[0.06125414 0.05786442 0.05755398 0.05746093 0.05599687]]
Indices: [[137278 428664 290071 160028 168779]]


In [None]:
results = df.iloc[indices[0]]  # Fetch rows corresponding to the top-k indices

for i, row in results.iterrows():
    print(f"Title: {row['title']}")
    print(f"URL: {row['url']}")
    print(f"Text Snippet: {row['text'][:200]}...")  
    print("-" * 80)


Title: 2015 in public domain
URL: https://en.wikipedia.org/wiki?curid=42644836
Text Snippet: This is a list of authors whose works enter the public domain in part of the world in 2015.
Entering the public domain in Europe.
A work enters the public domain in most European countries (with the e...
--------------------------------------------------------------------------------
Title: 2-day CPET
URL: https://en.wikipedia.org/wiki?curid=74545453
Text Snippet: A 2-day CPET is a cardiopulmonary exercise test given on two successive days to measure the effect of post-exertional malaise (PEM) on a patient's ability to exercise. PEM is a cardinal symptom of mya...
--------------------------------------------------------------------------------
Title: 2019 Compostela Valley renaming plebiscite
URL: https://en.wikipedia.org/wiki?curid=62536277
Text Snippet: On December 7, 2019, a plebiscite was held to determine if residents of the Philippine province of Compostela Valley approve the renaming of 

In [None]:
query = "Baseball championship 2001"  

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')
query_embedding = model.encode([query]).astype('float32')

distances, indices = semantic_search(query_embedding.reshape(1, -1), index, top_k=5)

results = df.iloc[indices[0]]
for i, row in results.iterrows():
    print(f"Title: {row['title']}")
    print(f"URL: {row['url']}")
    print(f"Text Snippet: {row['text'][:200]}...")
    print("-" * 80)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Title: 2001 Senior League World Series
URL: https://en.wikipedia.org/wiki?curid=47526671
Text Snippet: The 2001 Senior League World Series took place from August 12–18 in Kissimmee, Florida, United States. Palm Harbor, Florida defeated Maracaibo, Venezuela in the championship game. 
This was the final ...
--------------------------------------------------------------------------------
Title: 2001 Conference USA baseball tournament
URL: https://en.wikipedia.org/wiki?curid=36219546
Text Snippet: The 2001 Conference USA baseball tournament was the 2001 postseason baseball championship of the NCAA Division I Conference USA, held at Zephyr Field in New Orleans, Louisiana, from May 16 through 20....
--------------------------------------------------------------------------------
Title: 2001 International League season
URL: https://en.wikipedia.org/wiki?curid=22122231
Text Snippet: The 2001 International League season took place from April to September 2001.
The Louisville Bats defeated the S