# Neural Search

An AI-powered search engine built with Transformers, K-Means, and cosine similarity.

In [4]:
!pip install -q transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.

In [13]:
import sagemaker
import pandas as pd

In [11]:
# SageMaker session; used here to look up the default S3 bucket.
sess = sagemaker.Session()

# Bucket that the data path is built from.
bucket_name = sess.default_bucket()
# NOTE(review): role_name is not read by any visible cell — presumably needed
# further down the notebook; confirm or remove.
role_name = sagemaker.get_execution_role()

# Prefix placed between the bucket and the data path when building S3 URIs.
bucket_prefix = "neural-search"

## Load data

In [23]:
# S3 URI of the movies metadata CSV, built from the bucket and prefix set above.
data_location = f"s3://{bucket_name}/{bucket_prefix}/data/movies_metadata.csv"

# Load the CSV into a DataFrame.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column — presumably
# an index that was written into the file; consider index_col=0 after confirming.
df = pd.read_csv(data_location)

# Show the first row as a quick look at the columns.
df.head(1)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [24]:
# Keep only the free-text "overview" column.
# NOTE(review): despite the _df suffix this is a pandas Series, not a DataFrame
# (the displayed output reads "Name: overview, dtype: object").
overview_df = df["overview"]

# Show the first entry.
overview_df.head(1)

0    Led by Woody, Andy's toys live happily in his ...
Name: overview, dtype: object

## Load embedding model

In [58]:
from transformers import BertModel, BertTokenizer
import torch
from itertools import combinations

In [25]:
# One checkpoint name is used for both the tokenizer and the model.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
# Plain BertModel: the load log below lists the checkpoint's cls.* weights as
# unused, which the log itself describes as expected in this situation.
model = BertModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of token embeddings, L2-normalized per sentence.

    Args:
        model_output: Either an object exposing ``last_hidden_state`` or a
            plain tuple/list whose first element is that tensor (the form the
            model returns when called with ``return_dict=False``). The tensor
            is indexed as (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) tensor; non-zero entries mark the
            tokens that contribute to the average, zeros mark padding.

    Returns:
        A (batch, hidden) tensor. Each row is the average of the unmasked
        token embeddings scaled to unit L2 norm; a row whose mask is all
        zeros comes back as all zeros.
    """
    # Accept both attribute-style outputs and tuple-style outputs.
    token_embeddings = getattr(model_output, "last_hidden_state", None)
    if token_embeddings is None:
        token_embeddings = model_output[0]

    # Broadcast the mask across the hidden dimension so padded positions
    # contribute zero to the sum.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )

    summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp the token count so a fully masked row does not divide by zero.
    counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    pooled = summed / counts

    return torch.nn.functional.normalize(pooled, p=2, dim=1)

In [67]:
# Small hand-written set of sentences for a quick check of the embedding pipeline.
sample_text = [
    "The moon looks beautiful today",
    "I am going for a walk today",
    "Do you like the moon?",
    "Those are some tasty sandwiches",
    "Why is that castle old?",
]

# Tokenize all sentences as one padded batch of PyTorch tensors.
encoded_input = tokenizer(
    sample_text,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking.
with torch.no_grad():
    out = model(**encoded_input)

# One pooled, normalized vector per sentence.
sentence_embeddings = mean_pooling(out, encoded_input["attention_mask"])

sentence_embeddings.shape

torch.Size([5, 768])

In [68]:
# dim=0 because each comparison is between two 1-D sentence vectors.
cosine_similarity_model = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

# Every unordered pair of sentence indices.
num_sentences = sentence_embeddings.shape[0]
tuples = combinations(range(num_sentences), 2)

with torch.no_grad():
    for i, j in tuples:
        left, right = sentence_embeddings[i], sentence_embeddings[j]
        similarity = cosine_similarity_model(left, right)
        print(f"'{sample_text[i]}', '{sample_text[j]}', similarity={similarity}")


'The moon looks beautiful today', 'I am going for a walk today', similarity=0.6303966045379639
'The moon looks beautiful today', 'Do you like the moon?', similarity=0.7295293807983398
'The moon looks beautiful today', 'Those are some tasty sandwiches', similarity=0.6001173257827759
'The moon looks beautiful today', 'Why is that castle old?', similarity=0.6742875576019287
'I am going for a walk today', 'Do you like the moon?', similarity=0.6220574378967285
'I am going for a walk today', 'Those are some tasty sandwiches', similarity=0.5492238402366638
'I am going for a walk today', 'Why is that castle old?', similarity=0.5797369480133057
'Do you like the moon?', 'Those are some tasty sandwiches', similarity=0.5650840401649475
'Do you like the moon?', 'Why is that castle old?', similarity=0.7014718055725098
'Those are some tasty sandwiches', 'Why is that castle old?', similarity=0.5975286364555359
