In [1]:
pip install --upgrade transformers datasets

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninsta

In [2]:
# Semantic Search on Review Dataset - Kaggle Ready Notebook

import pandas as pd
import re
import torch
from sentence_transformers import SentenceTransformer, util
import os
import shutil

2025-04-18 10:35:46.477765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744972546.666897      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744972546.723468      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from IPython.display import FileLink, display

In [4]:
# Load dataset
# Path of the reviews CSV; it is handed to load_and_clean_data() when the
# search engine is built further down in the notebook.
file_path = "/kaggle/input/fake-reviews-dataset/fake reviews dataset.csv"  # Replace with your dataset name on Kaggle

def load_and_clean_data(file_path):
    """Read the reviews CSV and attach a normalised ``text`` column.

    Column names are stripped and lower-cased before use, rows whose ``text_``
    value is missing are dropped, and ``preprocess_text`` is applied to every
    remaining ``text_`` value to build the new ``text`` column.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded columns plus ``text``. Row index labels
        are kept as read, so the index has gaps wherever rows were dropped.

    Raises:
        ValueError: If the file has no ``text_`` column (after normalising
            the column names).
    """
    df = pd.read_csv(file_path)
    df.columns = [col.strip().lower() for col in df.columns]

    # Check required columns
    if 'text_' not in df.columns:
        raise ValueError(
            "Column 'text_' is required in the dataset. Found: {}".format(df.columns.tolist())
        )

    # .copy() detaches the filtered rows from the frame they were selected
    # from, so adding a column below cannot trigger SettingWithCopyWarning.
    df = df.loc[df['text_'].notna()].copy()
    df['text'] = df['text_'].apply(preprocess_text)
    return df

In [5]:
def preprocess_text(text):
    """Normalise a review string for embedding.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is removed (not replaced by a space), runs of
    whitespace collapse to a single space, and the ends are trimmed.

    Args:
        text: The raw review; anything that is not a ``str`` is treated as empty.

    Returns:
        str: The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()

In [6]:
# Label conversion
def assign_labels(df):
    """Convert string labels to integers and return ``label``/``text`` only.

    Rows with a missing ``text_`` are dropped first. The remaining labels are
    mapped ``'OR' -> 0`` and ``'CG' -> 1``, and ``text`` is produced by running
    ``preprocess_text`` over ``text_``. The frame passed in is not modified.

    Args:
        df: DataFrame holding at least the ``text_`` and ``label`` columns.

    Returns:
        pandas.DataFrame: Two columns, ``label`` and ``text``, for the rows
        whose ``text_`` was present.

    Raises:
        ValueError: If a required column is absent, or if any remaining row
            carries a label other than ``'OR'`` / ``'CG'`` (the offending
            values are listed in the message).
    """
    if 'label' not in df.columns or 'text_' not in df.columns:
        raise ValueError("Dataset must contain 'text_' and 'label' columns. Found: {}".format(df.columns.tolist()))

    label_mapping = {'OR': 0, 'CG': 1}
    # .copy() detaches the filtered rows from the caller's frame so the column
    # assignments below cannot raise SettingWithCopyWarning.
    df = df.loc[df['text_'].notna()].copy()

    # Validate before overwriting anything so the error can name the culprits.
    mapped = df['label'].map(label_mapping)
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = df.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            "Label conversion failed — check for invalid labels in your data. "
            "Unrecognised values: {}".format(bad_values)
        )

    df['label'] = mapped
    df['text'] = df['text_'].apply(preprocess_text)
    return df[['label', 'text']]

In [7]:
# Semantic Search Class
import os
import torch
import shutil
import pickle
from sentence_transformers import SentenceTransformer, util
from IPython.display import FileLink, display

class SemanticSearchEngine:
    """Embed a corpus of texts once, then rank them against free-text queries.

    Construction loads a SentenceTransformer model, encodes every text, and
    writes the model, the embeddings and the texts to ``saved_model/`` (also
    zipped to ``saved_model.zip`` with a download link displayed).

    Attributes set by ``__init__``:
        model: The loaded ``SentenceTransformer``.
        texts: The corpus as a plain ``list``.
        embeddings: Result of ``model.encode(texts, convert_to_tensor=True)``.
    """

    def __init__(self, texts, model_name='all-MiniLM-L6-v2'):
        """Load the model, encode ``texts`` and persist everything to disk.

        Args:
            texts: Iterable of review strings to index.
            model_name: Name handed to ``SentenceTransformer``.
        """
        print("Loading SentenceTransformer model...")
        self.model = SentenceTransformer(model_name)
        print("Encoding all reviews...")
        # Keep a plain list: search() looks texts up by the integer positions
        # torch.topk returns, and a pandas Series would be indexed by label.
        self.texts = list(texts)
        self.embeddings = self.model.encode(self.texts, convert_to_tensor=True)
        print("✅ Embeddings ready!")

        self._save_artifacts()

    def _save_artifacts(self, save_dir="saved_model"):
        """Write model, embeddings and texts to ``save_dir``, zip it, show a link."""
        # Save model
        os.makedirs(save_dir, exist_ok=True)
        self.model.save(save_dir)

        # Save embeddings and texts next to the model files
        torch.save(self.embeddings, os.path.join(save_dir, "embeddings.pt"))
        with open(os.path.join(save_dir, "texts.pkl"), "wb") as f:
            pickle.dump(self.texts, f)

        # Zip the whole folder to "<save_dir>.zip"
        shutil.make_archive(save_dir, 'zip', save_dir)

        # Create download link (relative path, as make_archive may return an
        # absolute one depending on the Python version)
        display(FileLink(save_dir + ".zip"))

    def search(self, query, top_k=5):
        """Print the indexed texts that score highest against ``query``.

        Args:
            query: Free-text query to encode with the same model.
            top_k: Maximum number of results to print. If fewer texts are
                indexed, all of them are printed.

        Returns:
            None. Results are printed, best score first.
        """
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(query_embedding, self.embeddings)[0]

        # torch.topk raises if k exceeds the number of candidates, so cap it.
        k = min(top_k, cos_scores.shape[0])
        top_scores, top_indices = torch.topk(cos_scores, k=k)

        print(f"\nTop {k} results for query: \"{query}\"")
        # .tolist() yields plain floats/ints for formatting and list indexing.
        for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
            print(f"Score: {score:.4f} - Review: {self.texts[idx]}")


In [8]:
# Run Semantic Search
if __name__ == "__main__":
    # Build the index from the cleaned review texts (this also encodes and
    # saves everything, so it is the slow step of the notebook).
    df = load_and_clean_data(file_path)
    search_engine = SemanticSearchEngine(df['text'].tolist())

    # Example queries, run in order
    for example_query in (
        "good quality and reliable product",
        "scam or fake review packaging damaged",
    ):
        search_engine.search(example_query)

Loading SentenceTransformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding all reviews...


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

✅ Embeddings ready!


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top 5 results for query: "good quality and reliable product"
Score: 0.7466 - Review: great product it is highly reliable durable and affordable prices touched my heart i would recommend this product to a friend
Score: 0.7350 - Review: good quality and nice price especially for the price very good quality
Score: 0.7247 - Review: product is good and great quality the only problem is that it comes with a small piece of cardboard if you want to
Score: 0.7232 - Review: great quality and function very well made
Score: 0.7037 - Review: a great product and excellent customer service great product for the moneythis is a pretty good


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top 5 results for query: "scam or fake review packaging damaged"
Score: 0.5745 - Review: i wrote a review and here it is i received this product at a discounted rate in exchange for my honest and unbiased review the product was packaged in a
Score: 0.5620 - Review: package came beat up and retaped but actual product was fine
Score: 0.5474 - Review: i bought this directly from costco the shipping was quick and it arrived intact i was surprised to find that the packaging was plastic not the clear plastic plastic of the cardboard box the packaging was well packaged and in good condition the size was perfect
Score: 0.5318 - Review: ive had the bluesmart for a couple of years now and im still happy with the product the other two have had problems the first one is the plastic part the second one is the plastic part its a little thin on the bottom but i dont feel its going to get damaged i did get a free shipping label on the first one and the second one was great the first one was defective