In [118]:
from gensim.models import Word2Vec
import multiprocessing
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [119]:
def load_data():
    #read data
    with open('labelled_bedrooms_data_sample.txt.txt') as f:
        bedrooms_lines = f.readlines()
    with open('labelled_living_rooms_data_sample.txt.txt') as f:
        living_rooms_lines = f.readlines()
        
    #adding all bedrooms and living rooms to list of vectors and to list of filenames
    rooms_files = []
    rooms = []
    for bedroom_line in bedrooms_lines:
        rooms_files.append(bedroom_line.split("; ")[0])
        rooms.append(bedroom_line.split("; ")[1:])

    for living_room_line in living_rooms_lines:
        rooms_files.append(living_room_line.split("; ")[0])
        rooms.append(living_room_line.split("; ")[1:])
        
    return rooms_files, rooms

In [130]:
def get_embedding(sentence,w2v_model):
    vectors = []
    for word in sentence:
        if word in w2v_model.wv.index_to_key:
            vector = w2v_model.wv[word]
            vectors.append(vector)
    #embedding of the whole sentence is average vector        
    sentence_embedding = np.mean(vectors, axis=0)
    return sentence_embedding

In [121]:
def get_w2v_model(rooms):
    #word to vec CBOW
    cores = multiprocessing.cpu_count() 
    w2v_model = Word2Vec(min_count=2,
                         window=2,
                         sg = 0,
                         sample=6e-5, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         negative=20,
                         workers=cores-1)
    w2v_model.build_vocab(rooms)
    return w2v_model

In [128]:
#the mothods creating embeddigs for the whole dataset
def create_embeddings():
    rooms_files, rooms = load_data()
    w2v_model = get_w2v_model(rooms)
    
    embeddings_of_sentences = []
    for room in rooms:
        sentence_embedding = get_embedding(room,w2v_model)
        embeddings_of_sentences.append(sentence_embedding)
    return embeddings_of_sentences, rooms_files

In [123]:
# Function to calculate cosine similarity between two vectors
def calculate_cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [131]:
embeddings_of_sentences, rooms_files = create_embeddings()

In [132]:
embeddings_of_sentences[2]

array([-0.00551843,  0.00092605,  0.0005839 ,  0.0045524 , -0.00184452,
       -0.00077599,  0.0023034 ,  0.00353621,  0.0016551 , -0.00301923,
        0.00549416,  0.00022849, -0.00219101,  0.00144941, -0.00133725,
       -0.00429091,  0.00084675,  0.00133931, -0.00413802,  0.0014893 ,
        0.00629102, -0.00145136,  0.0067666 ,  0.00435929,  0.00091054,
       -0.00142936,  0.00484697,  0.00128054,  0.00051112,  0.00114805,
       -0.00240997,  0.0001031 ,  0.00301417, -0.0026215 ,  0.00298622,
        0.00309654,  0.00527194, -0.00150414, -0.00155672,  0.0006128 ,
       -0.00150922,  0.00334447,  0.00082867, -0.00420284,  0.00103433,
        0.00053205, -0.00032278,  0.00021761,  0.00323315,  0.00125839,
        0.0006195 ,  0.00247863, -0.00229962, -0.00443576, -0.00028388,
       -0.00051117,  0.0012467 ,  0.00286251, -0.00130612,  0.00256649,
       -0.00067403,  0.00155715,  0.00203206, -0.00367074, -0.0004807 ,
       -0.00396573,  0.00026535,  0.00631947, -0.00365675,  0.00

In [125]:
rooms_files[2]

'bed_3.jpg'

In [97]:
def get_prompt_embedding(prompt):
    text = prompt.lower()
    
    # tokenization
    tokens = word_tokenize(text)
    
    # delete stop_words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    
    prompt_embedding = get_embedding(tokens,w2v_model)
    
    return prompt_embedding

In [77]:
prompt = "I want to have bedroom with bedroom chair and mirrow"

In [91]:
prompt_embedding = get_prompt_embedding(prompt)
    

In [138]:
similiarity = calculate_cosine_similarity(embeddings_of_sentences, prompt_embedding)

In [141]:
rooms_files[np.argmax(similiarity)]

'bed_75.jpg'