In [1]:
!pip install pymongo pandas transformers sentence-transformers transformers numpy boto3 torch scikit-learn matplotlib nltk sentence-transformers pandas langchain lark tiktoken langchain_community huggingface_hub replicate

Collecting transformers
  Using cached transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting boto3
  Using cached boto3-1.34.105-py3-none-any.whl.metadata (6.6 kB)
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain
  Using cached langchain-0.1.20-py3-none-any.whl.metadata (13 kB)
Collecting lark
  Using cached lark-1.1.9-py3-none-any.whl.metadata (1.9 kB)
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain_community
  Using cached langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting replicate
  Using cached replicate-0.26.0-py3-none-any.whl.metadata (24 kB)
Collecting regex!=2019.12.17 (from transformers)
  U

In [2]:
from pymongo import MongoClient
from bson.json_util import dumps
import pandas as pd
import gzip

In [4]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
import os
import json


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Connection settings: a MongoDB server on localhost and the database to use.
MONGO_URI = 'mongodb://localhost:27017/'
IMDB_DB_NAME = 'imdb_database'

client = MongoClient(MONGO_URI)
db = client[IMDB_DB_NAME]

In [5]:
# Download the NLTK 'stopwords' corpus; preprocess() reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
def _load_hf_entry(model_name, trust_remote_code=False):
    """Build a registry entry from a raw Hugging Face tokenizer/model pair.

    Args:
        model_name: Hub identifier of the checkpoint.
        trust_remote_code: Forwarded to ``from_pretrained``. Enabling it lets
            Python code stored in the Hub repository run locally, so it is
            off by default and only switched on for checkpoints that need it.

    Returns:
        dict with the keys ``model_name``, ``tokenizer`` and ``model``.
    """
    return {
        "model_name": model_name,
        "tokenizer": AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code),
        "model": AutoModel.from_pretrained(model_name, trust_remote_code=trust_remote_code),
    }


def _load_st_entry(model_name):
    """Build a registry entry from a SentenceTransformer checkpoint.

    Returns:
        dict with the keys ``model_name`` and ``model`` (no ``tokenizer`` key;
        generate_embedding() uses that absence to pick the ``encode`` path).
    """
    return {
        "model_name": model_name,
        "model": SentenceTransformer(model_name),
    }


# Registry of embedding models, keyed by a short alias.
# NOTE: every checkpoint is loaded eagerly when this cell runs.
models = {
    "bart": _load_hf_entry("facebook/bart-large"),
    # The GTE checkpoint ships custom modeling code on the Hub, so it is the
    # only entry that opts in to trust_remote_code.
    "gte": _load_hf_entry("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True),
    "MiniLM": _load_st_entry('all-MiniLM-L12-v2'),
    "roberta": _load_st_entry('sentence-transformers/nli-roberta-large'),
    "e5-large": _load_hf_entry('intfloat/e5-large'),
}



: 

: 

In [None]:
# Lazily-filled cache of the English stop-word set (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Left as None if the lookup raises, so a later call retries.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess(text):
    """Remove English stop words from ``text``.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in NLTK's English stop-word list. Surviving tokens keep their
    original casing and are re-joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if every token was a stop word).
    """
    stopwords_set = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stopwords_set)

In [None]:
def normalize_embeddings(embeddings):
    """Scale each row of ``embeddings`` to unit L2 norm.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        Array of the same shape whose non-zero rows have L2 norm 1. Rows that
        are entirely zero are returned unchanged (as zeros) instead of
        becoming NaN through a division by zero.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # `norms` is a fresh array, so patching it in place is safe; dividing a
    # zero row by 1 leaves it as zeros.
    norms[norms == 0] = 1
    return embeddings / norms

In [None]:
def generate_embedding(movies_data, model_key, normalize=True, batch_size=64):
    """Embed the ``primaryTitle`` of every movie with one of the ``models``.

    Titles are first passed through ``preprocess``. For entries in ``models``
    that carry a ``tokenizer``, the titles are tokenized and run through the
    model in mini-batches, and the last hidden state is mean-pooled using the
    attention mask so padding tokens do not contribute. Other entries are
    embedded with the model's own ``encode`` method.

    Args:
        movies_data: Iterable of mappings that each have a ``'primaryTitle'``.
        model_key: Key into the module-level ``models`` registry.
        normalize: When true, pass the result through ``normalize_embeddings``.
        batch_size: Number of titles per forward pass on the tokenizer path.

    Returns:
        2-D NumPy array with one embedding per input movie. For empty input an
        array of shape ``(0, 0)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model_config = models[model_key]
    movie_texts = [preprocess(movie['primaryTitle']) for movie in movies_data]
    if not movie_texts:
        # Nothing to embed; the embedding width is unknown without a forward pass.
        return np.empty((0, 0), dtype=np.float32)

    if 'tokenizer' in model_config:
        tokenizer = model_config['tokenizer']
        model = model_config['model']
        pooled_batches = []
        for start in range(0, len(movie_texts), batch_size):
            batch_texts = movie_texts[start:start + batch_size]
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # Zero out padded positions and divide by the number of real
            # tokens, so a title's vector does not depend on how much padding
            # its batch needed.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (hidden * mask).sum(dim=1) / token_counts
            pooled_batches.append(pooled.cpu().numpy())
        embeddings = np.concatenate(pooled_batches, axis=0)
    else:
        embeddings = model_config['model'].encode(movie_texts)

    if normalize:
        embeddings = normalize_embeddings(embeddings)
    return embeddings

In [None]:
# Read every document of the 'title.basics' collection, projecting only primaryTitle.
title_cursor = db['title.basics'].find(filter={}, projection={'_id': 0, 'primaryTitle': 1})
movies = [doc for doc in title_cursor]

# Embed the titles with one of the entries registered in `models`.
model_key = 'MiniLM'  # Example model
embeddings = generate_embedding(movies, model_key)

preview = embeddings[:5]
print(preview)  # Display the first 5 embeddings