In [1]:
# PREREQUISITES - uncomment if necessary

# %run -i "kaggleAPI.py"  # Download the sample-data.csv product data from Kaggle
# !pip install -r requirements.txt  # install modules



In [4]:
# Notebook-wide imports.
import pandas as pd
import numpy as np
import transformers
import torch
import faiss
# The PyPI distribution is named "sentence-transformers", but a hyphen is not
# valid in a Python module name; the importable package uses an underscore.
import sentence_transformers

from transformers import AutoTokenizer, AutoModel
from datasets import Dataset

In [20]:
# Load the product descriptions. The CSV was written with its row index, which
# pandas would otherwise read back as an extra "Unnamed: 0" column, so only the
# two real columns are requested.
df = pd.read_csv('sample-data.csv', usecols=['id', 'description'])
df.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [11]:
# Wrap the DataFrame in a `datasets.Dataset`; the following cells call its
# `map` and `filter` methods.
comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['id', 'description'],
    num_rows: 500
})

In [12]:
def _description_word_count(example):
    """Return a one-key dict holding the number of whitespace-separated words in ``example["description"]``."""
    n_words = len(example["description"].split())
    return {"comment_length": n_words}


# Add a `comment_length` column computed from each row's description.
comments_dataset = comments_dataset.map(_description_word_count)

100%|██████████| 500/500 [00:00<00:00, 9209.83ex/s]


In [13]:
def _has_enough_words(example):
    """Return True when the row's ``comment_length`` is greater than 15."""
    return example["comment_length"] > 15


# Keep only the rows whose description has more than 15 words.
comments_dataset = comments_dataset.filter(_has_enough_words)
comments_dataset

100%|██████████| 1/1 [00:00<00:00, 142.41ba/s]


Dataset({
    features: ['id', 'description', 'comment_length'],
    num_rows: 500
})

In [14]:
# TEXT EMBEDDINGS
# Load a tokenizer and a model from the same checkpoint identifier.
# NOTE(review): the recorded run of this cell raised
# "OSError: Can't load config for ..." - confirm that the installed
# transformers version and network access allow the checkpoint to be fetched.

from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the model below.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

OSError: Can't load config for 'sentence-transformers/multi-qa-mpnet-base-dot-v1'. Make sure that:

- 'sentence-transformers/multi-qa-mpnet-base-dot-v1' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'sentence-transformers/multi-qa-mpnet-base-dot-v1' is the correct path to a directory containing a config.json file



In [15]:
from transformers import AutoTokenizer, AutoModel
import torch

#CLS pooling - keep only the output at the first token position
def cls_pooling(model_output):
    """Return index 0 along the second axis of ``model_output.last_hidden_state``.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor
            (presumably shaped (batch, tokens, hidden) - confirm against the model).

    Returns:
        The tensor ``last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_token_state = hidden_states[:, 0]
    return first_token_state

#Encode text
def encode(texts):
    """Embed ``texts`` using the notebook-level ``tokenizer`` and ``model``.

    Args:
        texts: input handed to the tokenizer; the cells in this notebook pass
            either a single string or a list of strings.

    Returns:
        The result of ``cls_pooling`` applied to the model output.
    """
    # Tokenize with padding and truncation, returning PyTorch tensors
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass with gradient tracking disabled
    with torch.no_grad():
        outputs = model(**batch, return_dict=True)

    # Pool the model output into the returned embeddings
    return cls_pooling(outputs)


# Example query and the candidate passages to rank against it
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

# Fetch the tokenizer and model from the HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")

# Embed the query and the passages
query_emb = encode(query)
doc_emb = encode(docs)

# Dot-product score of the query against every passage (row 0 is the single query)
scores = torch.mm(query_emb, doc_emb.t())[0].cpu().tolist()

# Pair each passage with its score, highest score first
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

# Print the score followed by the passage, one pair per line
for doc, score in doc_score_pairs:
    print(score, doc)

OSError: Can't load config for 'sentence-transformers/multi-qa-mpnet-base-dot-v1'. Make sure that:

- 'sentence-transformers/multi-qa-mpnet-base-dot-v1' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'sentence-transformers/multi-qa-mpnet-base-dot-v1' is the correct path to a directory containing a config.json file



In [18]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
     |████████████████████████████████| 78 kB 1.6 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
     |████████████████████████████████| 3.1 MB 2.9 MB/s            
[?25hCollecting tokenizers>=0.10.3
  Using cached tokenizers-0.10.3-cp38-cp38-macosx_10_11_x86_64.whl (2.2 MB)
Collecting torch>=1.6.0
  Using cached torch-1.10.0-cp38-none-macosx_10_9_x86_64.whl (147.1 MB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=120999 sha256=1f9af32cac151eedf56d317a110cdeb168cdd5afdca45e8b9188c337db27b497
  Stored in directory: /Users/fred/Library/Caches/pip/wheels/52/19/88/6625593382e23a926740e6

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean pooling - average the token embeddings, weighting each position by the attention mask
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: indexable whose element 0 holds the token embeddings.
        attention_mask: tensor that is given a trailing axis and expanded to
            the token embeddings' size before use.

    Returns:
        The masked sum over axis 1 divided by the mask sum over axis 1, with
        the divisor clamped to at least 1e-9 to avoid division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings (no gradients are tracked)
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)

OSError: Can't load config for 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'. Make sure that:

- 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' is the correct path to a directory containing a config.json file

