In [1]:
import json
import math
import os
import random
import re
import warnings
from io import BytesIO

import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import mplcursors
import numpy as np
import pandas as pd
import requests
from PIL import Image
from sentence_transformers import SentenceTransformer

# MongoDB
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

warnings.filterwarnings("ignore")


In [2]:
# MongoDB target used by the search cells below: a local instance on port 27018.
# For the hosted cluster use "mongodb+srv://<user>:<password>@kal-media.ggmvds5.mongodb.net/"
# and load the credentials from the environment instead of committing them to the notebook.
CONNECTION_STRING = "mongodb://localhost:27018/"
DB_NAME = "images"  # database that holds the image collection
COLLECTION_NAME = "press_office"  # collection queried by mongo_search

In [3]:
def build_db(sql_query_filepath="kalos-media-tagging.sql", header_lines=4):
    """
    Parse a SQL dump of image rows into a DataFrame.
    Every line after the first `header_lines` lines is matched against a tuple of
    four single-quoted values: ('url','hebrew description','english description','tags').
    Lines that do not match (including rows whose values contain a single quote)
    get NaN in the four extracted columns.
    Args:
        sql_query_filepath(str): Path to the UTF-8 encoded SQL file.
        header_lines(int): Number of leading lines to skip before the data rows.
    Returns:
        pd.DataFrame: Columns 'sql_query' (the raw line), 'url', 'descriptionHebrew',
            'descriptionEnglish' and 'tags', indexed from 0.
    """
    with open(sql_query_filepath, 'r', encoding="utf8") as file:
        lines = file.readlines()

    # Slice the plain list before building the frame: this avoids assigning new
    # columns onto an `iloc` view and also copes with files shorter than the header.
    df = pd.DataFrame({'sql_query': pd.Series(lines[header_lines:], dtype=object)})

    # Capture the four values that sit between single quotes
    pattern = r"\('([^']*)','([^']*)','([^']*)','([^']*)'\)"
    extracted = df['sql_query'].str.extract(pattern)
    extracted.columns = ['url', 'descriptionHebrew', 'descriptionEnglish', 'tags']
    return pd.concat([df, extracted], axis=1)


# Use regular expression to extract English words
def extract_english_words(text):
    """
    Collect the English words and numbers found in `text` into one string.
    Args:
        text(str): Raw text; every occurrence of the substring "tags" is removed first.
    Returns:
        str: Space-separated runs of ASCII letters, digits and dots. Empty string
            when `text` is not a string (e.g. NaN or None).
    """
    if not isinstance(text, str):
        # Always return a string: a list here would be stringified to "[]" by
        # callers that do `.astype(str)` and end up inside the embedded text.
        return ''
    text = text.replace("tags", "")
    return ' '.join(re.findall(r'\b[a-zA-Z0-9.]+\b', text))


# Use regular expression to extract Hebrew words
def extract_hebrew_words(text):
    """
    Collect the Hebrew words and numbers found in `text` into one string.
    Args:
        text(str): Raw text; every occurrence of the substring "tags" is removed first.
    Returns:
        str: Space-separated runs of Hebrew letters (א-ת), digits and dots. Empty
            string when `text` is not a string (e.g. NaN or None).
    """
    if not isinstance(text, str):
        # Always return a string: a list here would be stringified to "[]" by
        # callers that do `.astype(str)` and end up inside the embedded text.
        return ''
    text = text.replace("tags", "")
    return ' '.join(re.findall(r'[א-ת0-9.]+', text))


def english_embeddings(df, embedding_model=None):
    """
    Add English token and embedding columns to `df` in place.
    Adds 'english_tags', 'english_words', 'english_tokens' (tags + description
    words) and 'embeddings_en' (one numpy vector per row).
    Args:
        df(pd.DataFrame): Frame with 'tags' and 'descriptionEnglish' columns.
        embedding_model: Object with a SentenceTransformer-style `encode` method.
            When None, 'all-mpnet-base-v2' is loaded on the first call rather than
            when this function is defined.
    Returns:
        None
    """
    if embedding_model is None:
        # Lazy load: a model built in the signature would be constructed as soon as
        # the defining cell runs, even if the function is never called.
        embedding_model = SentenceTransformer('all-mpnet-base-v2')

    df['english_tags'] = df['tags'].apply(extract_english_words)
    df['english_words'] = df['descriptionEnglish'].apply(extract_english_words)
    df['english_tokens'] = df['english_tags'].astype(str) + ' ' + df['english_words'].astype(str)

    # Embed df tokens - done only once
    corpus_english = df['english_tokens'].values
    corpus_embeddings_en = embedding_model.encode(corpus_english, convert_to_tensor=True)
    df['embeddings_en'] = [embedding.cpu().detach().numpy() for embedding in corpus_embeddings_en]


def hebrew_embeddings(df, embedding_model=None):
    """
    Add Hebrew token and embedding columns to `df` in place.
    Adds 'hebrew_tags', 'hebrew_words', 'hebrew_tokens' (tags + description
    words) and 'embeddings_he' (one numpy vector per row).
    Args:
        df(pd.DataFrame): Frame with 'tags' and 'descriptionHebrew' columns.
        embedding_model: Object with a SentenceTransformer-style `encode` method.
            When None, 'paraphrase-multilingual-MiniLM-L12-v2' is loaded on the
            first call rather than when this function is defined.
    Returns:
        None
    """
    if embedding_model is None:
        # Lazy load: a model built in the signature would be constructed as soon as
        # the defining cell runs, even if the function is never called.
        embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    df['hebrew_tags'] = df['tags'].apply(extract_hebrew_words)
    df['hebrew_words'] = df['descriptionHebrew'].apply(extract_hebrew_words)
    df['hebrew_tokens'] = df['hebrew_tags'].astype(str) + ' ' + df['hebrew_words'].astype(str)

    # Embed df tokens - done only once
    corpus_hebrew = df['hebrew_tokens'].values
    corpus_embeddings_he = embedding_model.encode(corpus_hebrew, convert_to_tensor=True)
    df['embeddings_he'] = [embedding.cpu().detach().numpy() for embedding in corpus_embeddings_he]


def cos_sim(row, query_embeddings, language_code='en'):
    """
    Calculate the cosine similarity between the image embedding and the query embedding.
    Args:
        row(pd.Series): Row of the DataFrame containing the image embedding under
            'embeddings_<language_code>', as an array, a list or a JSON string.
        query_embeddings: Query embedding (array-like, e.g. a CPU tensor or a list).
        language_code(str): Language code ('en' for English, 'he' for Hebrew).
    Returns:
        float: Cosine similarity between the image and the query embeddings.
            0 when the stored embedding is not valid JSON or when either vector
            has zero length.
    """
    img_embedding = row[f'embeddings_{language_code}']

    if isinstance(img_embedding, str):
        try:
            img_embedding = json.loads(img_embedding)
        except json.JSONDecodeError:
            print("Failed to convert description embedding from string to list.")
            return 0

    # Work on real arrays: comparing a plain list with 0 is always False, so the
    # zero-vector guard would be skipped and the division would produce NaN.
    img_vector = np.asarray(img_embedding, dtype=float)
    query_vector = np.asarray(query_embeddings, dtype=float)

    norm_product = np.linalg.norm(img_vector) * np.linalg.norm(query_vector)
    if norm_product == 0:
        return 0

    return float(np.dot(img_vector, query_vector) / norm_product)


def check_input(input_string):
    """
    Check if the input string contains only English or Hebrew letters, signs, and numbers.
    Accepted signs are whitespace and . , ! ? @ # $ % ^ & * ( ) - _ + =
    Any other character (e.g. ':' or '/') makes the input invalid.
    Args:
        input_string(str): Input string to check.
    Returns:
        str | bool: 'en' if the input uses only English letters (an empty or
            letter-free string also yields 'en'), 'he' if it uses only Hebrew
            letters, False otherwise (including mixed-language input).
    """
    # The hyphen is escaped on purpose: an unescaped `)-_` inside a character class
    # is a range that also covers A-Z, which let the Hebrew pattern accept
    # uppercase English letters.
    allowed_signs = r"0-9\s.,!?@#$%^&*()\-_+="
    english_pattern = rf'[a-zA-Z{allowed_signs}]*'
    hebrew_pattern = rf'[א-ת{allowed_signs}]*'

    if re.fullmatch(english_pattern, input_string):
        return 'en'
    if re.fullmatch(hebrew_pattern, input_string):
        return 'he'
    return False


def display_images_in_grid(urls, timeout=10):
    """
    Display images in a square grid layout.
    Args:
        urls(list): List of image URLs.
        timeout(float): Seconds to wait for each image request.
    Returns:
        None. Images that cannot be downloaded or decoded are reported with a
        printed message and their grid cell is left empty.
    """
    num_images = len(urls)
    if num_images == 0:
        # plt.subplots cannot build a 0x0 grid
        print("No images to display.")
        return

    side = math.ceil(math.sqrt(num_images))
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten() works
    fig, axes = plt.subplots(side, side, figsize=(side * 3, side * 3), squeeze=False)
    axes = axes.flatten()  # Flatten the axes array for easy iteration

    for ax, url in zip(axes, urls):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            ax.imshow(img)
        except Exception as e:
            print(f"Error loading image from {url}: {e}")

    # Hide ticks and frames everywhere, including failed and unused cells
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

def display_images_in_grid2(urls, descriptions, timeout=10):
    """
    Display images one after the other with their descriptions.
    Note: a later cell in this notebook defines a function with the same name,
    which replaces this one once that cell has run.
    Args:
        urls (list): List of image URLs.
        descriptions (list): List of descriptions corresponding to each image URL.
        timeout (float): Seconds to wait for each image request.
    Returns:
        None. Images that cannot be downloaded or decoded are reported with a
        printed message and their slot is left empty.
    """
    num_images = len(urls)
    if num_images == 0:
        # plt.subplots cannot build a figure with zero rows
        print("No images to display.")
        return

    # squeeze=False always returns a 2-D array, so the single-image case needs no special handling
    fig, axes = plt.subplots(num_images, 1, figsize=(6, num_images * 5), squeeze=False)
    axes = axes.flatten()

    for ax, url, desc in zip(axes, urls, descriptions):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            ax.imshow(img)
            ax.set_title(desc, fontsize=12, pad=10)  # Description is the title of this image's plot
        except Exception as e:
            print(f"Error loading image from {url}: {e}")

    # Hide ticks and frames everywhere, including failed slots and slots without a description
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

def search(df, n=5):
    """
    Prompt for a query, rank the rows of `df` by similarity to it and show the top images.
    The query language decides which embedding column and model are used:
    English -> 'embeddings_en' with 'all-mpnet-base-v2', Hebrew -> 'embeddings_he'
    with 'paraphrase-multilingual-MiniLM-L12-v2'.
    Args:
        df(pd.DataFrame): Frame with 'url' and the embedding columns. A 'cos_sim'
            column is added to it in place.
        n(int): Number of images to display.
    Returns:
        pd.DataFrame: `df` sorted by 'cos_sim' in descending order.
    """
    query = input("Please write a sentence which describes the image you want to fetch from DB:\n")
    language_code = check_input(query)
    while not language_code:
        print(
            "Search sentence you have entered is invalid. Please note that it must include only letters of one language, signs, and numbers.")
        query = input(
            "Please enter a sentence in english which describes the image you want to fetch from DB:\n")
        # Validate the query that was just typed; checking before prompting would
        # judge the previous query and reject a valid retry.
        language_code = check_input(query)

    if language_code == 'en':
        model_name = 'all-mpnet-base-v2'
    else:
        model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
    embedding_model = SentenceTransformer(model_name)
    query_embeddings = embedding_model.encode(query, convert_to_tensor=True).cpu()

    df['cos_sim'] = df.apply(lambda row: cos_sim(row, query_embeddings, language_code), axis=1)
    df = df.sort_values(by='cos_sim', ascending=False)
    display_images_in_grid(df['url'].values[:n].tolist())
    return df


def mongo_search(connection, db_name, collection_name, query, filters, n=5,
                 index_name='hebrew_search_index', path='embeddings_he', num_candidates=3000):
    """
    Search for similar images in MongoDB with a `$vectorSearch` aggregation.
    Args:
        connection(str): MongoDB connection string.
        db_name(str): Name of the database.
        collection_name(str): Name of the collection.
        query(list): List of embedding values.
        filters(dict): Pre-filter for the vector search. When empty or None the
            stage is sent without a filter.
        n(int): Number of results to return.
        index_name(str): Name of the vector search index.
        path(str): Document field that holds the indexed embedding.
        num_candidates(int): Nearest-neighbour candidates to consider; raised to
            `n` when smaller, since the stage needs at least `limit` candidates.
    Returns:
        tuple: (urls, descriptions) - two lists of equal length with the 'url' and
            'descriptionHebrew' of the most similar images, best match first.
    """
    vector_search = {
        'exact': False,
        'index': index_name,
        'path': path,
        'queryVector': query,
        'numCandidates': max(num_candidates, n),
        'limit': n
    }
    if filters:
        vector_search['filter'] = filters

    pipeline = [
        {'$vectorSearch': vector_search},
        {
            '$project': {
                '_id': 0,
                'url': 1,
                'descriptionHebrew': 1,
            }
        }
    ]

    # The context manager closes the client's connections; the cursor is drained
    # inside the block so no result is read after the client is closed.
    with MongoClient(connection) as client:
        collection = client[db_name][collection_name]
        results = list(collection.aggregate(pipeline))

    urls = [result['url'] for result in results]
    description = [result['descriptionHebrew'] for result in results]
    return (urls, description)



In [5]:
# Parse the SQL dump and keep only rows where all four fields were extracted
df = build_db(sql_query_filepath="kalos-media-tagging.sql")
df = df.drop(columns=['sql_query']).dropna()

# Hebrew embeddings use the function's default multilingual model
hebrew_embeddings(df)

# Load the English model once and hand it to the embedding step, so the loaded
# instance is the one that is actually used
embedding_model = SentenceTransformer('all-mpnet-base-v2')
english_embeddings(df, embedding_model)

# Drop the intermediate token columns; only url, descriptions, tags and embeddings remain
df = df.drop(columns=['hebrew_tags', 'hebrew_words', 'hebrew_tokens',
                      'english_tags', 'english_words', 'english_tokens'])

In [6]:
# Convert embeddings to plain Python lists so the records can be stored in MongoDB.
# np.asarray makes the cell safe to re-run: it accepts both the original numpy
# vectors and the lists produced by a previous run.
df['embeddings_he'] = df['embeddings_he'].apply(lambda x: np.asarray(x).tolist())
df['embeddings_en'] = df['embeddings_en'].apply(lambda x: np.asarray(x).tolist())

df_dict = df.to_dict('records')
# Insertion is left disabled; no `collection` object is created in the cells above.
# collection.insert_many(df_dict)


In [7]:
# Embed one fixed Hebrew query with the multilingual model and keep it as a plain list
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
v = embedding_model.encode(
    "בנימין נתניהו",
    convert_to_tensor=True,
).tolist()

# Search filter: only documents whose `filter_attempt` field equals 1
filters = {"$and": [{"filter_attempt": {"$eq": 1}}]}

In [8]:
# Fetch the 100 closest matches for the query vector `v` from the configured collection
urls, description = mongo_search(
    connection=CONNECTION_STRING,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    query=v,
    filters=filters,
    n=100,
)

In [69]:
# Inspect the image URLs returned by the mongo_search call above
urls

In [73]:
# Inspect the Hebrew descriptions returned by the mongo_search call above
description

In [99]:

def display_images_in_grid2(urls, descriptions, timeout=10):
    """
    Display images one after the other with their descriptions.
    Replaces the earlier function of the same name: titles are right-aligned and
    drawn with a white outline.
    Args:
        urls (list): List of image URLs.
        descriptions (list): List of descriptions corresponding to each image URL.
        timeout (float): Seconds to wait for each image request.
    Returns:
        None. Images that cannot be downloaded or decoded are reported with a
        printed message and their slot is left empty.
    """
    num_images = len(urls)
    if num_images == 0:
        # plt.subplots cannot build a figure with zero rows
        print("No images to display.")
        return

    # squeeze=False always returns a 2-D array, so the single-image case needs no special handling
    fig, axes = plt.subplots(num_images, 1, figsize=(6, num_images * 5), squeeze=False)
    axes = axes.flatten()

    for ax, url, desc in zip(axes, urls, descriptions):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            ax.imshow(img)
            # Description is the title of this image's plot, aligned to the right
            title = ax.set_title(desc, fontsize=12, pad=10, loc='right')
            # White stroke around the glyphs; `path_effects` is matplotlib.patheffects,
            # imported at the top of the notebook
            title.set_path_effects([path_effects.withStroke(linewidth=3, foreground='white')])
        except Exception as e:
            print(f"Error loading image from {url}: {e}")

    # Hide ticks and frames everywhere, including failed slots and slots without a description
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()



In [128]:
def search_and_plot(query, n=5, embedding_model=None, filters=None, timeout=10):
    """
    Embed `query`, run a vector search in MongoDB and show each hit with its description.
    Uses the notebook-level CONNECTION_STRING, DB_NAME and COLLECTION_NAME.
    Args:
        query(str): Search text.
        n(int): Number of results to fetch and plot.
        embedding_model: Object with a SentenceTransformer-style `encode` method.
            When None, 'paraphrase-multilingual-MiniLM-L12-v2' is loaded for this
            call; pass a loaded model to avoid reloading it on every search.
        filters(dict): Search filter. When None, only documents whose
            `filter_attempt` field equals 1 are searched.
        timeout(float): Seconds to wait for each image request.
    Returns:
        None. An image that cannot be downloaded or decoded is reported with a
        printed message and skipped; the remaining results are still shown.
    """
    if embedding_model is None:
        embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    v = embedding_model.encode(query, convert_to_tensor=True).tolist()

    if filters is None:
        filters = {"$and": [{"filter_attempt": {"$eq": 1}}]}

    urls, description = mongo_search(CONNECTION_STRING, DB_NAME, COLLECTION_NAME, v, filters, n=n)
    for url, desc in zip(urls, description):
        print(f"Description: {desc}")
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except Exception as e:
            # Same best-effort handling as the grid helpers: one bad URL must not
            # stop the remaining results from being shown
            print(f"Error loading image from {url}: {e}")
            continue
        # plot the image
        plt.imshow(img)
        plt.axis('off')
        plt.show()
        plt.close()

In [133]:
# Hebrew query: "people praying at the Western Wall"
search_and_plot(query="אנשים מתפללים בכותל", n=5)

In [129]:
# Hebrew query: "prayer"
search_and_plot(query="תפילה", n=10)

In [130]:
# Hebrew query: "Bibi"
search_and_plot(query="ביבי", n=10)

In [131]:
# Hebrew query: "Jerusalem"
search_and_plot(query="ירושלים", n=5)

In [132]:
# Hebrew query: "Benjamin Netanyahu"
search_and_plot(query="בנימין נתניהו", n=5)


In [None]:
# Hebrew query: "at the memorial ceremony" (this cell has no execution count;
# the following cell runs the same query)
search_and_plot(query="בטקס האזכרה", n=5)

In [134]:
# Hebrew query: "at the memorial ceremony"
search_and_plot(query="בטקס האזכרה", n=5)


In [108]:
# Show every image in the notebook-level `urls` list as one thumbnail grid
display_images_in_grid(urls=urls)



# United Hatzala attempt

In [None]:
# Connection to the hosted cluster for the United Hatzala data. The connection string
# carries a username and password, so it is read from the environment instead of
# being stored in the notebook.
# Note: this rebinds CONNECTION_STRING / DB_NAME / COLLECTION_NAME from the config cell.
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
if not CONNECTION_STRING:
    raise RuntimeError(
        "Set the MONGODB_ATLAS_URI environment variable to the cluster connection string, "
        "e.g. mongodb+srv://<user>:<password>@kal-media.ggmvds5.mongodb.net/"
    )
DB_NAME = "kal-media"
COLLECTION_NAME = "united-hatsala"

client = MongoClient(CONNECTION_STRING)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]  # source documents
new_collection = db['united_hatzala_embeddings']  # destination for documents with embeddings

In [None]:
# Load embeddings models
english_model = SentenceTransformer('all-mpnet-base-v2')
hebrew_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Fetch the first 50,000 documents from the original collection
cursor = collection.find().limit(50000)
documents = list(cursor)

# Convert to DataFrame for easier processing
df = pd.DataFrame(documents)

# Extract English and Hebrew words. For a missing (non-string) description the
# extractor may hand back a non-string value; normalise it to "" so the encoders
# only ever receive text.
df['english_words'] = (
    df['descriptionEnglish']
    .apply(extract_english_words)
    .apply(lambda words: words if isinstance(words, str) else "")
)
df['hebrew_words'] = (
    df['descriptionHebrew']
    .apply(extract_hebrew_words)
    .apply(lambda words: words if isinstance(words, str) else "")
)

# # Create tokens
# df['english_tokens'] = df['tags'].apply(extract_english_words) + ' ' + df['english_words']
# df['hebrew_tokens'] = df['tags'].apply(extract_hebrew_words) + ' ' + df['hebrew_words']

# Generate English embeddings
corpus_english = df['english_words'].values
corpus_embeddings_en = english_model.encode(corpus_english, convert_to_tensor=True)
df['embeddings_en'] = [embedding.cpu().detach().numpy() for embedding in corpus_embeddings_en]

# Generate Hebrew embeddings
corpus_hebrew = df['hebrew_words'].values
corpus_embeddings_he = hebrew_model.encode(corpus_hebrew, convert_to_tensor=True)
df['embeddings_he'] = [embedding.cpu().detach().numpy() for embedding in corpus_embeddings_he]

# Insert documents with embeddings into the new collection
new_documents = df.to_dict(orient='records')
for doc in new_documents:
    # numpy arrays cannot be stored as-is; convert to plain lists
    doc['embeddings_en'] = doc['embeddings_en'].tolist()
    doc['embeddings_he'] = doc['embeddings_he'].tolist()

# One bulk write instead of one round trip per document. The documents keep their
# original `_id`, so re-running this cell against a populated collection will fail
# on duplicate keys.
if new_documents:
    new_collection.insert_many(new_documents)