In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import warnings
import os
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [25]:
from pprint import pprint
import json
import copy

import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

import multiprocessing
import random
import xxhash

In [40]:
def combine_word2vec_models(models):
    """
    Merges several Word2Vec models into one by averaging their word vectors.

    Every word found in at least one input model is present in the result.
    Its vector is the arithmetic mean of the vectors of all models that
    contain the word, so each contributing model carries the same weight no
    matter where it sits in ``models``. Words are matched by exact key
    equality.

    Args:
    models (list of gensim.models.Word2Vec): The models to merge. They are
        expected to share one vector size. The hyper-parameters of the result
        (vector_size, window, min_count, sg) are copied from the first model.

    Returns:
    gensim.models.Word2Vec: A new model whose ``wv`` holds the merged
        vocabulary in order of first appearance. Only the vectors are filled
        in; no vocabulary statistics are built, so the result is meant for
        lookup and saving rather than for further training.

    Raises:
    IndexError: If ``models`` is an empty sequence.
    """
    reference = models[0]
    unified_model = Word2Vec(
        vector_size=reference.vector_size,
        window=reference.window,
        min_count=reference.min_count,
        sg=reference.sg,
    )

    # Accumulate a running sum and an occurrence count per word. Dividing once
    # at the end gives a true mean; repeatedly halving (old + new) / 2 would
    # weight later models more heavily than earlier ones.
    vector_sums = {}
    occurrences = {}
    for model in models:
        for word in model.wv.index_to_key:
            # np.array always copies, so the input models are never mutated;
            # float64 keeps the accumulation accurate.
            vector = np.array(model.wv[word], dtype=np.float64)
            if word in vector_sums:
                vector_sums[word] += vector
                occurrences[word] += 1
            else:
                vector_sums[word] = vector
                occurrences[word] = 1

    if vector_sums:
        words = list(vector_sums)
        averaged = np.vstack([vector_sums[word] / occurrences[word] for word in words])
        # One bulk insert: does not depend on min_count pruning in build_vocab
        # and avoids growing the vector matrix one row at a time.
        unified_model.wv.add_vectors(words, averaged)

    return unified_model

In [41]:
from cryptography.fernet import Fernet
import gensim
import base64

def generate_key():
    """
    Creates a fresh encryption key via ``Fernet.generate_key``.

    Returns:
    bytes: The newly generated key.
    """
    new_key = Fernet.generate_key()
    return new_key

def encrypt_word2vec_model(word2vec_model, encryption_key):
    """
    Builds a copy of a Word2Vec model whose tokens are Fernet-encrypted.

    Each word of the input vocabulary is replaced by its Fernet token; the
    vectors are copied unchanged and keep their original order.

    Note: Fernet tokens embed a timestamp and a random IV, so encrypting the
    same word twice produces two different tokens. Models encrypted
    separately therefore do not share vocabulary keys, even when the same
    key and the same words are used.

    Args:
    word2vec_model (gensim.models.Word2Vec): The Word2Vec model to encrypt.
    encryption_key (bytes): The Fernet key used to encrypt the tokens.

    Returns:
    gensim.models.Word2Vec: A new model (same vector size, min_count=1) with
        encrypted tokens as vocabulary keys.
    """
    f = Fernet(encryption_key)
    encrypted_model = gensim.models.Word2Vec(
        vector_size=word2vec_model.vector_size,  # keep the original dimensionality
        min_count=1,
    )

    source_vectors = word2vec_model.wv
    words = list(source_vectors.index_to_key)
    if words:
        encrypted_words = [f.encrypt(word.encode()).decode() for word in words]
        vectors = np.vstack([source_vectors.get_vector(word) for word in words])
        # Insert everything in one call: assigning wv[token] = vector per word
        # re-allocates the whole vector matrix for every new key.
        encrypted_model.wv.add_vectors(encrypted_words, vectors)

    return encrypted_model

def decrypt_word2vec_model(word2vec_model, encryption_key):
    """
    Builds a copy of an encrypted Word2Vec model with plain-text tokens.

    Every vocabulary key of the input is treated as a Fernet token and
    decrypted with ``encryption_key``. Several tokens can decrypt to the same
    word, because Fernet produces a different token each time a word is
    encrypted. The vectors of such tokens are averaged into one entry, so
    none of them is silently dropped.

    Args:
    word2vec_model (gensim.models.Word2Vec): The encrypted Word2Vec model to decrypt.
    encryption_key (bytes): The Fernet key that was used for encryption.

    Returns:
    gensim.models.Word2Vec: A new model (same vector size, min_count=1) keyed
        by the original words, in order of first appearance.

    Raises:
    cryptography.fernet.InvalidToken: If a key of the model is not a valid
        token for ``encryption_key``.
    """
    f = Fernet(encryption_key)
    decrypted_model = gensim.models.Word2Vec(
        vector_size=word2vec_model.vector_size,  # keep the original dimensionality
        min_count=1,
    )

    encrypted_vectors = word2vec_model.wv
    vector_sums = {}
    occurrences = {}
    for token in encrypted_vectors.index_to_key:
        # Decrypt the token back to the original word
        decrypted_word = f.decrypt(token.encode()).decode()
        # np.array copies, so the source model's vectors are never modified.
        vector = np.array(encrypted_vectors.get_vector(token), dtype=np.float64)
        if decrypted_word in vector_sums:
            vector_sums[decrypted_word] += vector
            occurrences[decrypted_word] += 1
        else:
            vector_sums[decrypted_word] = vector
            occurrences[decrypted_word] = 1

    if vector_sums:
        words = list(vector_sums)
        averaged = np.vstack([vector_sums[word] / occurrences[word] for word in words])
        # Add the decrypted words and their vectors to the new model in one call.
        decrypted_model.wv.add_vectors(words, averaged)

    return decrypted_model

In [42]:
# A new key is generated each time this cell runs. It is not written to disk in
# the visible cells, so models encrypted with it can only be decrypted while
# this variable is still alive in the kernel.
encryption_key = generate_key()

# Load each saved Word2Vec model and build its encrypted counterpart with the shared key.
encrypted_models = []
for path in ["051.txt.model", "201.txt.model", "501.txt.model"]:
    word2vec_model = gensim.models.Word2Vec.load(f"Content_FL_Exp/{path}")
    
    # Encrypt the Word2Vec model
    encrypted_model = encrypt_word2vec_model(word2vec_model, encryption_key)
    encrypted_models.append(encrypted_model)

In [43]:
# Merge the encrypted models into a single model and save it to disk.
final_model = combine_word2vec_models(encrypted_models)
final_model.save("Content_FL_Exp/encrypted_word2vec.model")

In [44]:
# Turn the merged model's encrypted tokens back into plain-text words, using the
# same key that encrypted them, and save the result.
decrypted_model = decrypt_word2vec_model(final_model, encryption_key)
decrypted_model.save("Content_FL_Exp/decrypted_word2vec.model")

In [49]:
# Show the first vocabulary key of each encrypted model (an encrypted token, not the plain word).
print(encrypted_models[0].wv.index_to_key[0])
print(encrypted_models[1].wv.index_to_key[0])
print(encrypted_models[2].wv.index_to_key[0])

gAAAAABlHz7S60-hZnmgRNwnRGm6uQ9pJ2PyYyC_CTf2I3sgl3ucsgCAdLNP96cOgwMQST07L2Ot0k7ZwE81VBkKStI2rR64mA==
gAAAAABlHz7V9YJPTFuBgzdMNDAnIi59A2QTUPK__jXGBvy58wvf0hArQfPa72u4SDvDNYkHJs0dJLOuBcfFQxEnl6lt8nhPQA==
gAAAAABlHz7lANrRUR7DU9spkuVsCeUGwNHfPguyyuzTPBKzqyt64LVz5n1PKCm9WwBQ_MeNns4fMq_qgIG9A5LVBUOSssrGyg==


In [55]:
# Show the first vocabulary key of the merged (still encrypted) model.
# NOTE(review): the recorded output of this cell is `True`, which does not look
# like a vocabulary key, and the execution counts around it are out of order —
# the cell was probably edited after it last ran. Re-run on a fresh kernel to confirm.
print(final_model.wv.index_to_key[0])

True


In [52]:
# Spot-check one entry (index 100) of the decrypted vocabulary to confirm it is readable text again.
print(decrypted_model.wv.index_to_key[100])

\Device\HarddiskVolume1\WINDOWS\SYSTEM32\WPDBUSENUM.DLL
