In [1]:
!pip install transformers psycopg2 numpy boto3 torch scikit-learn matplotlib nltk sentence-transformers pandas langchain lark pgvector psycopg2-binary tiktoken langchain_community huggingface_hub replicate keras tf-keras

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import psycopg2
from sentence_transformers import SentenceTransformer
import json
import os
from tqdm import tqdm





In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
models = {
    "bart": {
        "model_name": "facebook/bart-large",
        "tokenizer": AutoTokenizer.from_pretrained("facebook/bart-large", trust_remote_code=True),
        "model": AutoModel.from_pretrained("facebook/bart-large", trust_remote_code=True)
    },
    "gte": {
        "model_name": "Alibaba-NLP/gte-large-en-v1.5",
        "tokenizer": AutoTokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True),
        "model": AutoModel.from_pretrained("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
    },
    "MiniLM": {
        "model_name": 'all-MiniLM-L12-v2',
        "model": SentenceTransformer('all-MiniLM-L12-v2')
    },
    "roberta": {
        "model_name": 'sentence-transformers/nli-roberta-large',
        "model": SentenceTransformer('sentence-transformers/nli-roberta-large')
    },
    "e5-large":{
        "model_name": 'intfloat/e5-large',
        "tokenizer": AutoTokenizer.from_pretrained('intfloat/e5-large', trust_remote_code=True),
        "model": AutoModel.from_pretrained('intfloat/e5-large', trust_remote_code=True)
    }
}

In [5]:
current_directory = os.getcwd()
with open(os.path.join(current_directory, "movies.json"), "r", encoding="utf-8") as f:
    movies = json.load(f)

movies_data = []
for movie in movies["films"]["film"]:

    roles = movie.get("role", [])
    if isinstance(roles, dict):  # If 'roles' is a dictionary, make it a single-item list
        roles = [roles]

    # Extract actor information
    actors = []
    for role in roles:
        actor_info = role.get("acteur", {})
        if "__text" in actor_info:
            actors.append(actor_info["__text"])

    movies_data.append({
        "title": movie.get("titre", ""),
        "year": movie.get("annee", ""),
        "country": movie.get("pays", ""),
        "language": movie.get("langue", ""),
        "duration": movie.get("duree", ""),
        "summary": movie.get("synopsis", ""),
        "genre": movie.get("genre", ""),
        "director": movie.get("realisateur", {"__text": ""}).get("__text", ""),
        "writers": movie.get("scenariste", []),
        "actors": actors,
        "poster": movie.get("poster", ""),
        "id": movie.get("id", "")
    })

In [6]:
def preprocess(text):
    # Example preprocessing step simplified for demonstration
    tokens = text.split()
    # Assuming stopwords are already loaded to avoid loading them in each process
    stopwords_set = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stopwords_set]
    return ' '.join(tokens)

In [7]:
def normalize_embeddings(embeddings):
    """ Normalize the embeddings to unit vectors. """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    normalized_embeddings = embeddings / norms
    return normalized_embeddings

In [8]:
def generate_embedding(movies_data, model_key, normalize=True):
    model_config = models[model_key]
    if 'tokenizer' in model_config:
        # Handle HuggingFace transformer models
        movie_texts = [
            f"{preprocess(movie['title'])} {movie['year']} {' '.join(movie['genre'])} "
            f"{' '.join(movie['actors'])} {movie['director']} "
            f"{preprocess(movie['summary'])} {movie['country']}"
            for movie in movies_data
        ]
        inputs = model_config['tokenizer'](movie_texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model_config['model'](**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    else:
        # Handle Sentence Transformers
        movie_texts = [
            f"{preprocess(movie['title'])} {movie['year']} {' '.join(movie['genre'])} "
            f"{' '.join(movie['actors'])} {movie['director']} "
            f"{preprocess(movie['summary'])} {movie['country']}"
            for movie in tqdm(movies_data, desc="Encoding movie texts and generating embeddings")
        ]
        embeddings = model_config['model'].encode(movie_texts)

    if normalize:
        embeddings = normalize_embeddings(embeddings)

    return embeddings


In [9]:
embeddings_bart = generate_embedding(movies_data, 'bart')
embeddings_bart = np.array(embeddings_bart)
print("BART embeddings shape:", embeddings_bart.shape)
print("BART embeddings:", embeddings_bart[0])

BART embeddings shape: (631, 1024)
BART embeddings: [ 0.02980587  0.01581689 -0.0162874  ... -0.00557002 -0.02872263
 -0.02430014]


In [10]:
embeddings_gte = generate_embedding(movies_data, 'gte')
embeddings_gte = np.array(embeddings_gte)
print("GTE embeddings shape:", embeddings_gte.shape)
print("GTE embeddings:", embeddings_gte[0])

GTE embeddings shape: (631, 1024)
GTE embeddings: [ 0.00271679  0.00554605  0.01195132 ...  0.04074343 -0.00983375
 -0.02142335]


In [11]:
embeddings_MiniLM = generate_embedding(movies_data, 'MiniLM')
embeddings_MiniLM = np.array(embeddings_MiniLM)
print("MiniLM embeddings shape:", embeddings_MiniLM.shape)
print("MiniLM embeddings:", embeddings_MiniLM[0])

Encoding movie texts and generating embeddings: 100%|██████████| 631/631 [00:00<00:00, 1742.70it/s]


MiniLM embeddings shape: (631, 384)
MiniLM embeddings: [-7.04024499e-03  6.07011141e-03  1.28376424e-01 -4.50421683e-02
  4.66892868e-02  1.10597022e-01  5.81973493e-02  2.08956115e-02
  5.28918505e-02  1.14241503e-01 -9.48251039e-02 -4.47546691e-02
 -1.81262288e-02  1.00744896e-01 -4.28932756e-02  5.07416651e-02
 -1.08549166e-02  7.32663646e-02 -2.75690667e-02  8.67665000e-03
 -7.07044676e-02  4.54404689e-02 -3.44804972e-02 -8.39157868e-03
  1.86072644e-02 -7.11285621e-02  2.02321932e-02 -5.20065464e-02
  3.22543341e-03 -1.99356731e-02  5.68822511e-02 -2.74389284e-03
 -2.67803054e-02  2.91455481e-02 -4.49042680e-04  5.57996966e-02
 -1.30207360e-01  1.55704385e-02 -2.01300066e-02 -5.58847636e-02
  1.33942459e-02 -3.11912205e-02  1.25691712e-01 -9.65598375e-02
 -1.80415250e-02 -1.05671175e-02 -5.14752939e-02 -3.55250835e-02
  4.20107394e-02  8.37841257e-02 -7.53990039e-02 -2.90898960e-02
 -1.50384801e-03  8.55621509e-03 -1.42899957e-02 -8.19770396e-02
 -5.21209054e-02  1.44929150e-02  1

In [12]:
embeddings_roberta = generate_embedding(movies_data, 'roberta')
embeddings_roberta = np.array(embeddings_roberta)
print("RoBERTa embeddings shape:", embeddings_roberta.shape)
print("RoBERTa embeddings:", embeddings_roberta[0])

Encoding movie texts and generating embeddings: 100%|██████████| 631/631 [00:00<00:00, 1285.15it/s]


RoBERTa embeddings shape: (631, 1024)
RoBERTa embeddings: [ 0.02298054  0.04175862 -0.01646351 ...  0.01556577 -0.0256803
  0.06172808]


In [13]:
embeddings_e5_large = generate_embedding(movies_data, 'e5-large')
embeddings_e5_large = np.array(embeddings_e5_large)
print("e5-large embeddings shape:", embeddings_e5_large.shape)
print("e5-large embeddings:", embeddings_e5_large[0])

e5-large embeddings shape: (631, 1024)
e5-large embeddings: [-0.01276984 -0.05366214  0.04731817 ...  0.01435485  0.02704632
  0.03876564]


# Create connection to the database

In [14]:
conn = psycopg2.connect(database="postgres", host="localhost", user="postgres", password="postgres", port="5432")
cur = conn.cursor()

In [15]:
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
conn.commit()
cur.execute("CREATE EXTENSION IF NOT EXISTS cube;")
conn.commit()

In [16]:
def setup_database():
    cur.execute('DROP TABLE IF EXISTS movies')
    cur.execute('''
        CREATE TABLE movies (
            id SERIAL PRIMARY KEY,
            title TEXT NOT NULL,
            actors TEXT,
            year INTEGER,
            country TEXT,
            language TEXT,
            duration INTEGER,
            summary TEXT,
            genre TEXT[],
            director TEXT,
            scenarists TEXT[],
            poster TEXT,
            embedding_bart VECTOR(1024),
            embedding_gte VECTOR(1024),
            embedding_MiniLM VECTOR(384),
            embedding_roberta VECTOR(1024),
            embedding_e5_large VECTOR(1024)
        );
    ''')
    conn.commit()

setup_database()


# Insert

In [17]:
def insert_movies(movie_data, embeddings_bart, embeddings_gte, embeddings_MiniLM, embeddings_roberta, embeddings_e5_large):
    for movie, emb_bart, emb_gte, emb_MiniLM , emb_roberta, emb_e5_large in zip(movie_data, embeddings_bart, embeddings_gte, embeddings_MiniLM, embeddings_roberta, embeddings_e5_large):
        # Joining actors into a single string separated by commas
        actor_names = ', '.join(movie['actors'])
        # Convert list of genres into a PostgreSQL array format
        genre_array = '{' + ', '.join([f'"{g}"' for g in movie['genre']]) + '}'
        # Convert list of scenarists into a PostgreSQL array format
        scenarist_array = '{' + ', '.join([f'"{s}"' for s in movie['writers']]) + '}'
        # Convert embeddings to a string properly formatted as a list
        embedding_bart_str = '[' + ', '.join(map(str, emb_bart)) + ']'
        embedding_gte_str = '[' + ', '.join(map(str, emb_gte)) + ']'
        embedding_MiniLM_str = '[' + ', '.join(map(str, emb_MiniLM)) + ']'
        embedding_roberta_str = '[' + ', '.join(map(str, emb_roberta)) + ']'
        embedding_e5_large_str = '[' + ', '.join(map(str, emb_e5_large)) + ']'

        cur.execute('''
            INSERT INTO movies (title, actors, year, country, language, duration, summary, genre, director, scenarists, poster, embedding_bart, embedding_gte, embedding_MiniLM, embedding_roberta, embedding_e5_large)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ''', (
            movie['title'], actor_names, movie['year'], movie['country'], movie['language'],
            movie['duration'], movie['summary'], genre_array, movie['director'],
            scenarist_array, movie['poster'], embedding_bart_str, embedding_gte_str, embedding_MiniLM_str, embedding_roberta_str, embedding_e5_large_str
        ))
    conn.commit()

In [18]:
insert_movies(movies_data, embeddings_bart, embeddings_gte, embeddings_MiniLM, embeddings_roberta, embeddings_e5_large)