# Neural Search

An AI-powered search engine built with Transformer embeddings, K-Means clustering, and cosine similarity.

In [1]:
!pip install -q transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import sagemaker
import pandas as pd

In [3]:
# Key prefix under which this notebook's objects are addressed in the bucket.
bucket_prefix = "neural-search"

# Session handle used to look up the default bucket below.
sess = sagemaker.Session()

# Execution role and default bucket, as reported by the SageMaker SDK.
role_name = sagemaker.get_execution_role()
bucket_name = sess.default_bucket()

## Load data

In [61]:
data_location = f"s3://{bucket_name}/{bucket_prefix}/data/movies_metadata.csv"

# Only these two columns are used by the notebook, so only they are parsed.
# `usecols` returns columns in file order, hence the explicit re-selection
# to keep the (title, overview) ordering.
movie_columns = ["title", "overview"]
df = pd.read_csv(data_location, usecols=movie_columns)[movie_columns]

# Drop rows without an overview (nothing to embed). The explicit copy makes
# `df` an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df["overview"].notna()].copy()

df.head(2)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...


## Load embedding model

In [29]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

In [44]:
model_name = "bert-base-uncased"

# Choose the device first so the model can be placed on it as soon as it is loaded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)

# The model has to live on the same device as the tensors it is fed; a
# CPU-resident model given CUDA inputs fails with a device-mismatch error.
# eval() makes inference mode explicit (dropout disabled).
text_model = BertModel.from_pretrained(model_name).to(device).eval()

# dim=0: compares two 1-D embedding vectors and yields a scalar similarity.
cosine_similarity_model = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the unmasked positions and L2-normalise.

    Args:
        model_output: object exposing ``last_hidden_state``, a tensor indexed
            as (batch, token, feature).
        attention_mask: tensor indexed as (batch, token); positions with 0 are
            excluded from the average.

    Returns:
        A (batch, feature) tensor whose rows have unit L2 norm.
    """
    hidden_states = model_output.last_hidden_state

    # Broadcast the mask across the feature axis so it can zero out masked tokens.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = (hidden_states * mask).sum(1)
    # Clamp guards against division by zero for a row that is fully masked.
    token_counts = mask.sum(1).clamp(min=1e-9)
    sentence_vectors = masked_sum / token_counts

    return torch.nn.functional.normalize(sentence_vectors, p=2, dim=1)

In [25]:
def create_embeddings(text, tokenizer, model):
    """Embed one string or a list of strings with ``model``.

    Args:
        text: a string or list of strings to embed.
        tokenizer: callable that tokenizes ``text`` and returns a mapping of
            PyTorch tensors (called with padding, truncation and
            ``return_tensors="pt"``).
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.

    Returns:
        The mean-pooled, L2-normalised embeddings produced by ``mean_pooling``,
        on the same device as ``model``.
    """
    # Send inputs to wherever the model's weights actually are, instead of
    # relying on a notebook-level global that may not match the model passed in.
    model_device = next(model.parameters()).device

    encoded_input = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    encoded_input = {
        name: tensor.to(model_device) for name, tensor in encoded_input.items()
    }

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        model_output = model(**encoded_input)

    return mean_pooling(model_output, encoded_input["attention_mask"])

### Example embedding similarities

In [45]:
from itertools import combinations

sample_text = [
    "Andy is going to the beach on Sunday with his friends",
    "Mandy is going to the movies Tuesday with her Mum",
    "The cargo ship sailed through the night",
]

# Keep the embeddings as a torch tensor: the cosine-similarity module below
# operates on tensors, not NumPy arrays.
sample_embeddings = create_embeddings(sample_text, tokenizer, text_model).cpu()

# Compare every unordered pair of sample sentences.
for i, j in combinations(range(len(sample_text)), 2):
    similarity = cosine_similarity_model(sample_embeddings[i], sample_embeddings[j]).item()
    print(f"'{sample_text[i]}', '{sample_text[j]}', similarity={similarity}")

'Andy is going to the beach on Sunday with his friends', 'Mandy is going to the movies Tuesday with her Mum', similarity=0.8401806950569153
'Andy is going to the beach on Sunday with his friends', 'The cargo ship sailed through the night', similarity=0.561195433139801
'Mandy is going to the movies Tuesday with her Mum', 'The cargo ship sailed through the night', similarity=0.5241331458091736


## Create model pipeline

### Preprocessing
- Take the text from pandas
- Run the model on all of the text
- Record the embedding vectors in the dataframe
- Perform K-Means clustering and record the cluster for each vector

### Perform search
- Create embeddings
- Get cluster of embedding vectors
- Find all text with the same cluster
- Perform cosine similarity search between all elements in the cluster
- Return the top K most similar elements

In [41]:
# Add placeholder columns for the pipeline outputs. `assign` builds a new frame
# instead of writing into a filtered one, which avoids SettingWithCopyWarning
# and keeps this cell safe to re-run.
df = df.assign(embeddings=np.nan, cluster=-1)

df.head(2)

Unnamed: 0,title,overview,embeddings,cluster
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,-1
1,Jumanji,When siblings Judy and Peter discover an encha...,,-1


In [None]:
batch_size = 128

input_text = df["overview"].values

embedding_batches = []

# Walk the whole array in strides of `batch_size`; the stop value must be the
# number of texts (not the number of batches), and the last batch may be short.
for start in range(0, len(input_text), batch_size):
    batch = list(input_text[start:start + batch_size])

    batch_embeddings = create_embeddings(batch, tokenizer, text_model)

    # Move each result to host memory right away so results do not pile up
    # on the accelerator.
    embedding_batches.append(batch_embeddings.cpu().numpy())

all_embeddings = np.concatenate(embedding_batches)
assert len(all_embeddings) == len(df), "expected exactly one embedding per overview"

# Store one vector per row; assigning a list is positional, so the gaps left
# in the index by earlier filtering do not matter.
df["embeddings"] = list(all_embeddings)