In [None]:
pip install --upgrade pip

In [None]:
!pip install transformers psycopg2 numpy boto3 torch scikit-learn matplotlib nltk sentence-transformers pandas langchain lark pgvector psycopg2-binary tiktoken langchain_community huggingface_hub replicate

In [None]:
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import psycopg2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import json
import os
import pandas as pd

In [None]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus; preprocess() reads it through
# stopwords.words('english').
nltk.download('stopwords')

In [None]:
def _load_transformer(model_name, trust_remote_code=False):
    """Load a Hugging Face tokenizer/model pair and return its registry entry.

    Args:
        model_name: Hub identifier passed to both ``from_pretrained`` calls.
        trust_remote_code: Forwarded to ``from_pretrained``. Leave it False unless
            the checkpoint ships its own modelling code, because True lets code
            from the model repository run locally.

    Returns:
        dict with the keys ``model_name``, ``tokenizer`` and ``model``.
    """
    return {
        "model_name": model_name,
        "tokenizer": AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code),
        "model": AutoModel.from_pretrained(model_name, trust_remote_code=trust_remote_code),
    }


def _load_sentence_transformer(model_name):
    """Load a SentenceTransformer and return its registry entry (no tokenizer key)."""
    return {
        "model_name": model_name,
        "model": SentenceTransformer(model_name),
    }


# Registry of embedding models, keyed by the short name used throughout the notebook.
# Entries that carry a "tokenizer" key are raw transformers models; the others are
# SentenceTransformer instances. generate_embedding() branches on that key.
models = {
    "bart": _load_transformer("facebook/bart-large"),
    # This checkpoint is the only one that was left with remote code enabled;
    # NOTE(review): confirm it still requires it with the installed transformers version.
    "gte": _load_transformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True),
    "MiniLM": _load_sentence_transformer('all-MiniLM-L12-v2'),
    "roberta": _load_sentence_transformer('sentence-transformers/nli-roberta-large'),
    "e5-large": _load_transformer('intfloat/e5-large'),
}

In [None]:
def _as_list(value):
    """Normalise a field that may hold one string or a list of values.

    None and '' become [], a lone string becomes a one-item list, and any other
    value (typically a list) is returned unchanged. Without this, a single-string
    field is later iterated character by character.
    """
    if value is None or value == "":
        return []
    if isinstance(value, str):
        return [value]
    return value


current_directory = os.getcwd()
# Read with an explicit encoding so the result does not depend on the platform's
# locale default (the file uses French field names and may contain accented text).
with open(os.path.join(current_directory, "movies.json"), "r", encoding="utf-8") as f:
    movies = json.load(f)

# Flatten every film record into a dict with English keys.
movies_data = []
for movie in movies["films"]["film"]:

    roles = movie.get("role", [])
    if isinstance(roles, dict):  # A single role arrives as a dict, several as a list
        roles = [roles]

    # Collect the actor name of every role that provides one under "__text"
    actors = []
    for role in roles:
        actor_info = role.get("acteur", {})
        if "__text" in actor_info:
            actors.append(actor_info["__text"])

    movies_data.append({
        "title": movie.get("titre", ""),
        "year": movie.get("annee", ""),
        "country": movie.get("pays", ""),
        "language": movie.get("langue", ""),
        "duration": movie.get("duree", ""),
        "summary": movie.get("synopsis", ""),
        "genre": _as_list(movie.get("genre", "")),
        "director": movie.get("realisateur", {"__text": ""}).get("__text", ""),
        "writers": _as_list(movie.get("scenariste", [])),
        "actors": actors,
        "poster": movie.get("affiche", ""),
        "id": movie.get("id", "")
    })

In [None]:
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the English stopword set, building it only on the first call."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def preprocess(text):
    """Remove English stopwords from ``text``.

    The text is split on whitespace, tokens whose lowercase form is a stopword are
    dropped, and the remaining tokens are re-joined with single spaces. Original
    casing of the kept tokens is preserved.

    Args:
        text: Input string. ``None`` or an empty string yields ``''``.

    Returns:
        The filtered text as a single string.
    """
    if not text:
        return ''
    # The stopword list is loaded once and reused, instead of on every call.
    stopwords_set = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stopwords_set)

In [None]:
def normalize_embeddings(embeddings):
    """Scale every row of a 2-D embedding matrix to unit L2 norm.

    Args:
        embeddings: Array-like of shape (n_rows, n_dims).

    Returns:
        numpy array of the same shape whose rows have norm 1. Rows that are all
        zeros are returned unchanged instead of becoming NaN.
    """
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Dividing by 1 leaves an all-zero row as zeros and avoids a 0/0 division.
    norms[norms == 0] = 1
    return embeddings / norms

In [None]:
def _join_field(value):
    """Return ``value`` as one space-separated string; a plain string passes through."""
    if isinstance(value, str):
        return value
    return ' '.join(value)


def _movie_text(movie):
    """Build the single text that represents one movie for embedding."""
    return (
        f"{preprocess(movie['title'])} {movie['year']} {_join_field(movie['genre'])} "
        f"{_join_field(movie['actors'])} {movie['director']} "
        f"{preprocess(movie['summary'])} {movie['country']}"
    )


def generate_embedding(movies_data, model_key, normalize=True, batch_size=32):
    """Embed every movie with the model registered under ``model_key``.

    Each movie is turned into one text (title, year, genres, actors, director,
    summary, country). Registry entries with a ``tokenizer`` are run as raw
    transformers models and mean-pooled over their real tokens; other entries are
    used through their ``encode`` method.

    Args:
        movies_data: Iterable of movie dicts with the keys ``title``, ``year``,
            ``genre``, ``actors``, ``director``, ``summary`` and ``country``.
        model_key: Key into the module-level ``models`` registry.
        normalize: When True, rows are scaled to unit length with
            ``normalize_embeddings``.
        batch_size: Number of texts sent through the model at once.

    Returns:
        2-D numpy array with one row per movie.

    Raises:
        KeyError: ``model_key`` is not in ``models``.
        ValueError: ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model_config = models[model_key]
    movie_texts = [_movie_text(movie) for movie in movies_data]

    if 'tokenizer' in model_config:
        # Handle HuggingFace transformer models
        tokenizer = model_config['tokenizer']
        model = model_config['model']
        pooled_batches = []
        # Batching keeps memory bounded instead of padding the whole corpus at once.
        for start in range(0, len(movie_texts), batch_size):
            batch_texts = movie_texts[start:start + batch_size]
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # Average only over real tokens: padding positions are zeroed out and
            # excluded from the count, so a vector does not depend on how much
            # padding its batch happened to need.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(pooled.cpu().numpy())
        if pooled_batches:
            embeddings = np.concatenate(pooled_batches, axis=0)
        else:
            # No movies: return an empty matrix rather than failing in the tokenizer.
            embeddings = np.empty((0, 0), dtype=np.float32)
    else:
        # Handle Sentence Transformers
        embeddings = model_config['model'].encode(movie_texts, batch_size=batch_size)

    if normalize:
        embeddings = normalize_embeddings(embeddings)

    return embeddings


In [None]:
# Embed all movies with BART, then show the matrix shape and the first vector.
embeddings_bart = np.array(generate_embedding(movies_data, 'bart'))
print("BART embeddings shape:", embeddings_bart.shape)
print("BART embeddings:", embeddings_bart[0])

In [None]:
# Embed all movies with GTE, then show the matrix shape and the first vector.
embeddings_gte = np.array(generate_embedding(movies_data, 'gte'))
print("GTE embeddings shape:", embeddings_gte.shape)
print("GTE embeddings:", embeddings_gte[0])

In [None]:
# Embed all movies with MiniLM, then show the matrix shape and the first vector.
embeddings_MiniLM = np.array(generate_embedding(movies_data, 'MiniLM'))
print("MiniLM embeddings shape:", embeddings_MiniLM.shape)
print("MiniLM embeddings:", embeddings_MiniLM[0])

In [None]:
# Embed all movies with RoBERTa, then show the matrix shape and the first vector.
embeddings_roberta = np.array(generate_embedding(movies_data, 'roberta'))
print("RoBERTa embeddings shape:", embeddings_roberta.shape)
print("RoBERTa embeddings:", embeddings_roberta[0])

In [None]:
# Embed all movies with e5-large, then show the matrix shape and the first vector.
embeddings_e5_large = np.array(generate_embedding(movies_data, 'e5-large'))
print("e5-large embeddings shape:", embeddings_e5_large.shape)
print("e5-large embeddings:", embeddings_e5_large[0])

# Create connection to the database

In [None]:
# Connection settings are read from the PG* environment variables so real
# credentials never have to be written into the notebook. The fallbacks are the
# previous local-development values, so an unconfigured environment behaves as before.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "admin"),
    host=os.environ.get("PGHOST", "localhost"),
    user=os.environ.get("PGUSER", "admin"),
    password=os.environ.get("PGPASSWORD", "admin"),
    port=os.environ.get("PGPORT", "5432"),
)
# Shared cursor used by the database cells below.
cur = conn.cursor()

In [None]:
# Enable the PostgreSQL extensions used by this notebook, committing after each one.
extension_statements = (
    "CREATE EXTENSION IF NOT EXISTS vector;",
    "CREATE EXTENSION IF NOT EXISTS cube;",
)
for statement in extension_statements:
    cur.execute(statement)
    conn.commit()

In [None]:
def setup_database():
    """Drop and recreate the ``movies`` table (any existing rows are lost).

    The table has the movie metadata columns plus one VECTOR column per embedding
    model. Both statements run on the shared ``cur`` and are committed together.

    Raises:
        Exception: Whatever the driver raises. The transaction is rolled back
            first, so the shared connection stays usable for later cells.
    """
    try:
        cur.execute('DROP TABLE IF EXISTS movies')
        cur.execute('''
            CREATE TABLE movies (
                id SERIAL PRIMARY KEY,
                title TEXT NOT NULL,
                actors TEXT,
                year INTEGER,
                country TEXT,
                language TEXT,
                duration INTEGER,
                summary TEXT,
                genre TEXT[],
                director TEXT,
                scenarists TEXT[],
                poster TEXT,
                embedding_bart VECTOR(1024),
                embedding_gte VECTOR(1024),
                embedding_MiniLM VECTOR(384),
                embedding_roberta VECTOR(1024),
                embedding_e5_large VECTOR(1024)
            );
        ''')
        conn.commit()
    except Exception:
        # Without a rollback the connection stays in an aborted transaction and
        # every later statement on it fails.
        conn.rollback()
        raise

setup_database()


# Insert the movies and their embeddings into the database

In [None]:
def _vector_literal(embedding):
    """Format one embedding as the bracketed, comma-separated text '[v1, v2, ...]'."""
    return '[' + ', '.join(map(str, embedding)) + ']'


def _as_text_list(value):
    """Return ``value`` as a list of strings for a TEXT[] column.

    None and '' become [], a lone string becomes a one-item list (instead of being
    split into characters), and any other iterable has its items stringified.
    """
    if value is None or value == '':
        return []
    if isinstance(value, str):
        return [value]
    return [str(item) for item in value]


def _blank_to_none(value):
    """Map '' to None so that a missing number is stored as NULL."""
    return None if value == '' else value


def insert_movies(movie_data, embeddings_bart, embeddings_gte, embeddings_MiniLM, embeddings_roberta, embeddings_e5_large):
    """Insert every movie and its five embeddings into the ``movies`` table.

    All rows are written through the shared ``cur`` in a single transaction.

    Args:
        movie_data: Iterable of movie dicts with the keys ``title``, ``actors``,
            ``year``, ``country``, ``language``, ``duration``, ``summary``,
            ``genre``, ``director``, ``writers`` and ``poster``.
        embeddings_bart, embeddings_gte, embeddings_MiniLM, embeddings_roberta,
        embeddings_e5_large: One embedding per movie, in the same order as
            ``movie_data``.

    Raises:
        ValueError: The inputs do not all have the same number of items
            (zip would otherwise silently drop the surplus movies).
        Exception: Whatever the driver raises; the transaction is rolled back first.
    """
    columns = [
        list(items)
        for items in (movie_data, embeddings_bart, embeddings_gte,
                      embeddings_MiniLM, embeddings_roberta, embeddings_e5_large)
    ]
    lengths = [len(items) for items in columns]
    if len(set(lengths)) > 1:
        raise ValueError(f"movie_data and embeddings must have equal lengths, got {lengths}")

    try:
        for movie, emb_bart, emb_gte, emb_MiniLM, emb_roberta, emb_e5_large in zip(*columns):
            # Joining actors into a single string separated by commas
            actor_names = ', '.join(movie['actors'])

            cur.execute('''
            INSERT INTO movies (title, actors, year, country, language, duration, summary, genre, director, scenarists, poster, embedding_bart, embedding_gte, embedding_MiniLM, embedding_roberta, embedding_e5_large)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ''', (
                movie['title'], actor_names, _blank_to_none(movie['year']), movie['country'], movie['language'],
                _blank_to_none(movie['duration']), movie['summary'],
                # Lists are handed to the driver as-is so it does the quoting;
                # hand-built '{...}' literals break on quotes and backslashes.
                _as_text_list(movie['genre']), movie['director'],
                _as_text_list(movie['writers']), movie['poster'],
                _vector_literal(emb_bart), _vector_literal(emb_gte), _vector_literal(emb_MiniLM),
                _vector_literal(emb_roberta), _vector_literal(emb_e5_large)
            ))
        conn.commit()
    except Exception:
        # Undo the partial batch and leave the shared connection usable.
        conn.rollback()
        raise

In [None]:
# Store every movie together with its five embedding vectors.
insert_movies(
    movies_data,
    embeddings_bart,
    embeddings_gte,
    embeddings_MiniLM,
    embeddings_roberta,
    embeddings_e5_large,
)