In [22]:
!pip install transformers psycopg2 numpy boto3 torch scikit-learn matplotlib nltk sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/76/2c/bd95032aeb087b0706596af0a4518c4bfe0439a1bb149048ece18b617766/sentence_transformers-2.7.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import psycopg2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.pyplot as plt
import json
import os

# Model

In [None]:
# The tokenizer and the network are both taken from the same checkpoint name.
model_name = tokenizer_name = "facebook/bart-large"

# Instantiate the network, then the tokenizer that prepares its inputs.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Remove stopwords to reduce noise

In [None]:
# Download the NLTK stopword corpus and bring the `stopwords` corpus reader
# into scope; preprocess() reads the English list from it.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load movie metadata from movies.json

In [None]:
def _as_list(value):
    """Return `value` as a list: missing values become [], scalars are wrapped."""
    if value is None or value == "":
        return []
    return value if isinstance(value, list) else [value]


def _text_of(node):
    """Return the "__text" payload of a dict node, or the node itself otherwise."""
    if node is None:
        return ""
    if isinstance(node, dict):
        return node.get("__text", "")
    return node


current_directory = os.getcwd()
# Read the file explicitly as UTF-8; relying on the platform default encoding
# can corrupt non-ASCII characters on some systems.
with open(os.path.join(current_directory, "movies.json"), "r", encoding="utf-8") as f:
    movies = json.load(f)

# One flat record per film, with the French source keys mapped to English ones.
# NOTE(review): the source appears to emit a bare value instead of a
# one-element list when a field has a single entry (the original code already
# special-cased a dict-valued "role"); _as_list() applies that same
# normalisation to films, roles, genres and writers -- confirm against the data.
movies_data = []
for movie in _as_list(movies["films"]["film"]):

    # Collect actor names; only "acteur" nodes carrying a "__text" entry count.
    actors = []
    for role in _as_list(movie.get("role", [])):
        actor_info = role.get("acteur", {})
        if "__text" in actor_info:
            actors.append(actor_info["__text"])

    movies_data.append({
        "title": movie.get("titre", ""),
        "year": movie.get("annee", ""),
        "country": movie.get("pays", ""),
        "language": movie.get("langue", ""),
        "duration": movie.get("duree", ""),
        "summary": movie.get("synopsis", ""),
        # Always a list, so downstream ' '.join(...) and array inserts never
        # iterate over the characters of a single string.
        "genre": [_text_of(g) for g in _as_list(movie.get("genre", ""))],
        # Accepts both the {"__text": ...} form and a plain string.
        "director": _text_of(movie.get("realisateur", "")),
        "writers": [_text_of(w) for w in _as_list(movie.get("scenariste", []))],
        "actors": actors,
        "poster": movie.get("affiche", ""),
        "id": movie.get("id", "")
    })

# Generate embeddings for movies

In [23]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Quick sanity check of sentence-level embeddings with a MiniLM checkpoint.
# The names below are deliberately distinct from the rest of the notebook:
# rebinding `model` here would replace the BART `model` that
# generate_embedding() calls, and a variable named `cosine_similarity` would
# shadow the function imported from sklearn.
sentence_model = SentenceTransformer('all-MiniLM-L12-v2')

# Example sentences
sentences = ["This is a fox.", "This is a dog."]

# Generate embeddings
sentence_embeddings = sentence_model.encode(sentences)

# Cosine similarity: dot product divided by the product of the two L2 norms.
first_vec, second_vec = sentence_embeddings[0], sentence_embeddings[1]
sentence_similarity = np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))
print("Cosine Similarity:", sentence_similarity)

Cosine Similarity: 0.4649309


In [None]:
# Lazily-built cache of the English stopword list, shared by all preprocess() calls.
_ENGLISH_STOPWORDS = None


def preprocess(text, stopwords_set=None):
    """Remove stopwords from `text` and return the remaining words space-joined.

    Args:
        text: String to clean. Empty or None input yields ''.
        stopwords_set: Optional collection of lowercase words to drop. When
            omitted, the English list from the `stopwords` corpus reader is
            used; it is built once and cached instead of on every call.

    Returns:
        The whitespace-split words of `text` whose lowercase form is not in
        the stopword collection, joined with single spaces.
    """
    global _ENGLISH_STOPWORDS
    if not text:
        return ''
    if stopwords_set is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stopwords_set = _ENGLISH_STOPWORDS
    return ' '.join(word for word in text.split() if word.lower() not in stopwords_set)

In [None]:
def _movie_to_text(movie):
    """Flatten one movie record into the single string that gets embedded."""
    genre = movie['genre']
    # A lone genre stored as a plain string must not be joined directly:
    # ' '.join("Drama") would yield "D r a m a".
    genres = [genre] if isinstance(genre, str) else genre
    return (
        f"{preprocess(movie['title'])} {movie['year']} {' '.join(genres)} "
        f"{' '.join(movie['actors'])} {movie['director']} "
        f"{preprocess(movie['summary'])} {movie['country']}"
    )


def generate_embedding(text):
    """Embed movie records (or raw strings) with the module-level tokenizer and model.

    Args:
        text: What to embed. Either a sequence of movie dicts shaped like the
            entries of `movies_data`, a sequence of strings, or a single
            dict/string. The argument is now actually used; previously it was
            ignored and the global `movies_data` was always embedded.

    Returns:
        A NumPy array with one row per input item, obtained by averaging
        `last_hidden_state` over the sequence dimension.
    """
    items = [text] if isinstance(text, (str, dict)) else list(text)
    if not items:
        # Nothing to tokenize; return an empty matrix of the right width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    movie_texts = [
        item if isinstance(item, str) else _movie_to_text(item)
        for item in items
    ]
    inputs = tokenizer(movie_texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # NOTE(review): the mean runs over every sequence position, padded ones
    # included, so a row's value depends on the longest text in the batch.
    # Consider attention-mask-weighted pooling if that matters.
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [None]:
embeddings = generate_embedding(movies_data)

# Fail fast if the vectors do not line up with the records, or with the
# `vector(1024)` column they are about to be stored in.
assert embeddings.shape == (len(movies_data), 1024), (
    f"Expected {len(movies_data)} embeddings of 1024 dimensions, got shape {embeddings.shape}"
)

# Create connection to the database

In [None]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook; each fallback is the value previously hardcoded here.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "admin"),
    host=os.environ.get("PGHOST", "localhost"),
    user=os.environ.get("PGUSER", "admin"),
    password=os.environ.get("PGPASSWORD", "admin"),
    port=os.environ.get("PGPORT", "5432"),
)
cur = conn.cursor()

In [None]:
# Enable the `vector` extension (a no-op when it already exists) and persist it.
enable_vector_sql = "CREATE EXTENSION IF NOT EXISTS vector;"
cur.execute(enable_vector_sql)
conn.commit()

In [None]:
def setup_database(embedding_dim=1024):
    """Drop and recreate the `movies` table (all existing rows are lost).

    Args:
        embedding_dim: Number of dimensions of the `embedding` vector column.
            Defaults to 1024, the size previously hard-coded in the DDL.

    Raises:
        Any database error from the DDL statements, re-raised after the
        transaction has been rolled back.
    """
    # int() also guarantees that only a number is interpolated into the DDL.
    embedding_dim = int(embedding_dim)
    try:
        cur.execute('DROP TABLE IF EXISTS movies')
        cur.execute(f'''
            CREATE TABLE movies (
                id SERIAL PRIMARY KEY,
                title TEXT NOT NULL,
                actors TEXT,
                year INTEGER,
                country TEXT,
                language TEXT,
                duration INTEGER,
                summary TEXT,
                genre TEXT[],
                director TEXT,
                scenarists TEXT[],
                poster TEXT,
                embedding vector({embedding_dim})
            );
        ''')
        conn.commit()
    except Exception:
        # Without a rollback the connection stays in an aborted transaction
        # and every later statement in the notebook fails.
        conn.rollback()
        raise


setup_database()


# Insert the movies (metadata and embeddings) into the 'movies' table

In [None]:
def _blank_to_none(value):
    """Map a missing value ('' or None) to None so it is stored as SQL NULL."""
    return None if value is None or value == "" else value


def _as_text_list(value):
    """Coerce a string, a list/tuple or a missing value into a list of strings."""
    if value is None or value == "":
        return []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


def insert_movies(movie_data, embeddings):
    """Insert every movie with its embedding into `movies` in one transaction.

    Args:
        movie_data: Iterable of movie dicts with the keys produced for
            `movies_data` (title, actors, year, country, language, duration,
            summary, genre, director, writers, poster).
        embeddings: Iterable of numeric vectors, paired with `movie_data` by
            position.

    Raises:
        Any database error, re-raised after rolling back so that no partial
        batch is committed and the connection stays usable.
    """
    insert_sql = '''
        INSERT INTO movies (title, actors, year, country, language, duration, summary, genre, director, scenarists, poster, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    try:
        for movie, embedding in zip(movie_data, embeddings):
            # Actors are stored as one comma-separated TEXT value.
            actor_names = ', '.join(movie['actors'])
            # Python lists are handed to psycopg2, which adapts them to
            # PostgreSQL arrays and escapes quotes/backslashes itself; the
            # previous hand-built '{"a", "b"}' literal broke on such characters
            # and split a single string genre into individual letters.
            genres = _as_text_list(movie['genre'])
            scenarists = _as_text_list(movie['writers'])
            # The embedding is sent in the '[v1, v2, ...]' text form.
            embedding_str = '[' + ', '.join(map(str, embedding)) + ']'

            cur.execute(insert_sql, (
                movie['title'], actor_names,
                # '' is not valid input for the INTEGER columns; store NULL instead.
                _blank_to_none(movie['year']),
                movie['country'], movie['language'],
                _blank_to_none(movie['duration']),
                movie['summary'], genres, movie['director'],
                scenarists, movie['poster'], embedding_str
            ))
        conn.commit()
    except Exception:
        conn.rollback()
        raise

In [None]:
insert_movies(movies_data, embeddings)

# Finding similar movies

In [None]:
def get_query_embedding(title):
    """Look up the stored embedding of the movie called `title`.

    Returns:
        A float array of shape (1, n) parsed from the first matching row, or
        None when no row has that title.
    """
    cur.execute('SELECT embedding FROM movies WHERE title = %s', (title,))
    row = cur.fetchone()
    if not row:
        return None

    # The column value is handled as text of the form '[v1,v2,...]'.
    raw_vector = row[0]
    components = [float(piece) for piece in raw_vector.strip('[]').split(',')]
    return np.array(components, dtype=float).reshape(1, -1)

def find_similar_movies(title, threshold=0.5, return_n=25, distance_function='cosine_similarity'):
    """Rank the other stored movies by similarity to `title`, computed in Python.

    Args:
        title: Title of the reference movie; its embedding is read via
            get_query_embedding().
        threshold: Only movies whose similarity score is strictly greater than
            this value are kept.
        return_n: Maximum number of results returned.
        distance_function: One of 'cosine_similarity', 'euclidean_distance',
            'inner_product', 'hamming_distance' or 'jaccard_distance'.

    Returns:
        A list of (title, similarity) tuples sorted by decreasing similarity.
        An empty list is returned (after printing a message) when the title
        has no embedding or the distance function is unknown, and when there
        is no other movie to compare against.
    """
    supported_functions = (
        'cosine_similarity', 'euclidean_distance', 'inner_product',
        'hamming_distance', 'jaccard_distance',
    )

    query_embedding = get_query_embedding(title)
    if query_embedding is None:
        print(f"No embedding found for the movie titled '{title}'.")
        return []

    # Validate before fetching every embedding from the database.
    if distance_function not in supported_functions:
        print("Unsupported distance function.")
        return []

    cur.execute('SELECT title, embedding FROM movies')
    rows = cur.fetchall()

    embeddings = []
    movie_titles = []
    for other_title, embedding_str in rows:
        if other_title != title:
            embedding = np.array([float(x) for x in embedding_str.strip('[]').split(',')])
            embeddings.append(embedding)
            movie_titles.append(other_title)

    # With no candidates the pairwise computations below would fail on an
    # empty matrix, so there is simply nothing to rank.
    if not embeddings:
        return []
    candidates = np.vstack(embeddings)

    if distance_function == 'cosine_similarity':
        similarities = 1 - pairwise_distances(query_embedding, candidates, metric='cosine')
    elif distance_function == 'euclidean_distance':
        # Maps a distance in [0, inf) to a score in (0, 1].
        distances = pairwise_distances(query_embedding, candidates, metric='euclidean')
        similarities = 1 / (1 + distances)
    elif distance_function == 'inner_product':
        # NOTE(review): the inner product is divided by both norms, which
        # makes this score the same quantity as cosine similarity.
        inner_products = np.dot(query_embedding, candidates.T)
        similarities = inner_products / (np.linalg.norm(query_embedding) * np.linalg.norm(candidates, axis=1))
    else:
        # Hamming / Jaccard work on the sign pattern: a component is "on" when > 0.
        query_bits = query_embedding > 0
        candidate_bits = candidates > 0
        if distance_function == 'hamming_distance':
            distances = pairwise_distances(query_bits.astype(int), candidate_bits.astype(int), metric='hamming')
        else:
            # Jaccard is a boolean metric; passing booleans directly avoids
            # the implicit int-to-bool conversion of the inputs.
            distances = pairwise_distances(query_bits, candidate_bits, metric='jaccard')
        similarities = 1 - distances

    similar_movies = [(movie_titles[i], similarities[0][i]) for i in range(len(movie_titles)) if similarities[0][i] > threshold]
    similar_movies.sort(key=lambda x: x[1], reverse=True)
    return similar_movies[:return_n]


# SQL query to find similar movies

In [None]:
def find_similar_movies_sql(title, threshold=0.1, return_n=10, distance_function='<->'):
    """Rank the other stored movies against `title` with a pgvector operator.

    Args:
        title: Title of the reference movie.
        threshold: For '<=>' only results whose similarity (1 - distance) is
            strictly greater than this are kept; for every other operator only
            results whose distance is strictly smaller than this are kept.
        return_n: LIMIT applied in SQL, before the threshold filter.
        distance_function: One of '<->', '<#>', '<=>', '<+>'.

    Returns:
        A list of (title, score) tuples in the order returned by the query
        (ascending distance). The score is 1 - distance for '<=>' and the raw
        distance otherwise. An empty list is returned, after printing a
        message, for an unsupported operator, an unknown title or a database
        error.
    """
    allowed_functions = ['<->', '<#>', '<=>', '<+>']  # L2, negative inner product, cosine, L1
    if distance_function not in allowed_functions:
        print("Unsupported distance function.")
        return []

    try:
        # The operator is interpolated only after the allow-list check above;
        # all values go through query parameters. `LIMIT 1` keeps the scalar
        # subquery valid when several rows share the title, and the unused
        # embedding column is no longer selected.
        cur.execute(f"""
            SELECT title, embedding {distance_function} (SELECT embedding FROM movies WHERE title = %s LIMIT 1) AS distance
            FROM movies
            WHERE title != %s
            ORDER BY distance
            LIMIT %s;
        """, (title, title, return_n))
        results = cur.fetchall()
    except Exception as e:
        print(f"An error occurred: {e}")
        # A failed statement leaves the transaction aborted; roll back so
        # later queries on this connection keep working.
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(f"Rollback failed: {rollback_error}")
        return []

    # When the title is unknown the subquery yields NULL, so every distance is NULL.
    scored = [(other_title, distance) for other_title, distance in results if distance is not None]
    if results and not scored:
        print(f"No embedding found for the movie titled '{title}'.")
        return []

    if distance_function == '<=>':  # Adjust for cosine similarity
        return [(other_title, 1 - distance) for other_title, distance in scored if (1 - distance) > threshold]
    return [(other_title, distance) for other_title, distance in scored if distance < threshold]


# Define a query movie title

In [None]:
# Title of the reference movie: it is passed to the similarity searches and
# shown in the chart titles drawn by plot_similar_movies().
query_movie_title = 'The Incredibles'

# Plot

In [None]:
def plot_similar_movies(similar_movies, title, query_title=None):
    """Draw a vertical bar chart of similarity scores, one bar per movie.

    Args:
        similar_movies: Sequence of (movie_title, score) pairs. Scores are
            multiplied by 100 and rounded to 3 decimals for display.
        title: Label of the metric, used as the first part of the chart title.
        query_title: Title of the reference movie shown in the chart title.
            Defaults to the module-level `query_movie_title`.

    Returns:
        None. When `similar_movies` is empty a message is printed and nothing
        is drawn (unpacking an empty result previously raised ValueError).
    """
    if not similar_movies:
        print(f"No similar movies to plot for '{title}'.")
        return

    if query_title is None:
        query_title = query_movie_title

    # Prepare data
    titles, similarities = zip(*similar_movies)
    similarities = [round(sim * 100, 3) for sim in similarities]  # Convert to percentage and round off

    # Create a vertical bar chart
    plt.figure(figsize=(12, 8))
    plt.bar(titles, similarities, color='skyblue')
    plt.ylabel('Similarity Score (%)')
    plt.title(f"{title} - Similar Movies for '{query_title}'")
    plt.xticks(rotation=45, ha='right')  # slanted labels so long titles stay readable

    plt.tight_layout()
    plt.show()

# Perform a similarity search

In [None]:
# Cosine similarity, computed by the database through the `<=>` operator.
similar_movies = find_similar_movies_sql(
    query_movie_title,
    threshold=0.9,
    return_n=10,
    distance_function='<=>',
)
plot_similar_movies(similar_movies, 'Cosine Similarity')

In [None]:
# Run the in-Python search once per metric and chart each result.
# Each entry is (distance_function, threshold); the function name doubles as
# the chart label. Order: cosine, L2 (Euclidean), inner product, Hamming, Jaccard.
metric_thresholds = [
    ('cosine_similarity', 0.9),
    ('euclidean_distance', 0.1),
    ('inner_product', 0.9),
    ('hamming_distance', 0.1),
    ('jaccard_distance', 0.1),
]

for metric_name, metric_threshold in metric_thresholds:
    similar_movies = find_similar_movies(
        query_movie_title,
        threshold=metric_threshold,
        distance_function=metric_name,
    )
    plot_similar_movies(similar_movies, metric_name)
