In [1]:
# Import Zone
import base64
import json
import math
import os
# import sys
import random
import subprocess
import warnings

import nltk
import numpy as np
import pandas as pd
from Crypto.Cipher import AES
from nltk.stem import WordNetLemmatizer
from openpyxl import load_workbook

## Levenshtein Distance Algorithm

In [2]:
# Levenshtein Distance Algorithm

def damerau_levenshtein_distance(string1, string2):
    """Return the transposition-aware edit distance between two strings.

    The distance is the cell at ``[len(string1), len(string2)]`` of the
    matrix produced by ``_levenshtein_distance_matrix`` with the Damerau
    (adjacent transposition) rule switched on.
    """
    matrix = _levenshtein_distance_matrix(string1, string2, True)
    return matrix[len(string1), len(string2)]

def get_ops(string1, string2, is_damerau=False):
    """Recover the edit script that turns ``string1`` into ``string2``.

    The distance matrix is walked backwards from its bottom-right cell and
    every edit found on the way is prepended to the result, so the returned
    list is ordered from the start of the strings to their end.

    Parameters:
        string1: source string.
        string2: target string.
        is_damerau: when true, adjacent transpositions are recognised too.

    Returns:
        A list of 3-tuples ``(operation, a, b)``:
          * ``('replace', i, j)``   - ``string1[i]`` becomes ``string2[j]``
          * ``('insert', i, j)``    - ``string2[j]`` is inserted after position ``i``
          * ``('delete', i, i)``    - ``string1[i]`` is removed
          * ``('transpose', i, k)`` - positions ``i`` and ``k`` of ``string1`` swap
    """
    # Build the matrix here. The function used to read a name `dist_matrix`
    # that it never bound itself, so it only worked when some other cell had
    # left a matching global behind.
    dist_matrix = _levenshtein_distance_matrix(string1, string2, is_damerau)
    i, j = dist_matrix.shape
    i -= 1
    j -= 1
    ops = list()
    # NOTE: when i or j reaches 0 the `i-1` / `j-1` look-ups below use index
    # -1, which numpy resolves to the last row / column of the matrix.
    while i != -1 and j != -1:
        if is_damerau:
            if i > 1 and j > 1 and string1[i-1] == string2[j-2] and string1[i-2] == string2[j-1]:
                if dist_matrix[i-2, j-2] < dist_matrix[i, j]:
                    ops.insert(0, ('transpose', i - 1, i - 2))
                    i -= 2
                    j -= 2
                    continue
        # Cheapest predecessor: 0 = diagonal, 1 = left, 2 = up.
        index = np.argmin([dist_matrix[i-1, j-1], dist_matrix[i, j-1], dist_matrix[i-1, j]])
        if index == 0:
            # A diagonal step only counts as an edit when the cost went up.
            if dist_matrix[i, j] > dist_matrix[i-1, j-1]:
                ops.insert(0, ('replace', i - 1, j - 1))
            i -= 1
            j -= 1
        elif index == 1:
            ops.insert(0, ('insert', i - 1, j - 1))
            j -= 1
        elif index == 2:
            ops.insert(0, ('delete', i - 1, i - 1))
            i -= 1
    return ops

def execute_ops(ops, string1, string2):
    """Replay an edit script on ``string1`` and record every intermediate string.

    ``ops`` holds tuples ``(operation, a, b)`` as produced by ``get_ops``.
    The returned list starts with ``string1`` itself and gains one entry per
    operation (an unrecognised operation leaves the string untouched but is
    still recorded).
    """
    chars = list(string1)
    history = [string1]
    # Net number of characters added (+) or removed (-) so far; it maps
    # indices expressed against the original string1 onto `chars`.
    offset = 0
    for step in ops:
        kind, first, second = step[0], step[1], step[2]
        if kind == 'delete':
            chars.pop(first + offset)
            offset -= 1
        elif kind == 'insert':
            chars.insert(first + offset + 1, string2[second])
            offset += 1
        elif kind == 'replace':
            chars[first + offset] = string2[second]
        elif kind == 'transpose':
            left, right = first + offset, second + offset
            chars[left], chars[right] = chars[right], chars[left]
        history.append(''.join(chars))
    return history
#Levenshtein Distance 
def _levenshtein_distance_matrix(string1, string2, is_damerau=False):
    n1 = len(string1)
    n2 = len(string2)
    d = np.zeros((n1 + 1, n2 + 1), dtype=int)
    for i in range(n1 + 1):
        d[i, 0] = i
    for j in range(n2 + 1):
        d[0, j] = j
    for i in range(n1):
        for j in range(n2):
            if string1[i] == string2[j]:
                cost = 0
            else:
                cost = 1
            d[i+1, j+1] = min(d[i, j+1] + 1, # insert
                              d[i+1, j] + 1, # delete
                              d[i, j] + cost) # replace
            if is_damerau:
                if i > 0 and j > 0 and string1[i] == string2[j-1] and string1[i-1] == string2[j]:
                    d[i+1, j+1] = min(d[i+1, j+1], d[i-1, j-1] + cost) # transpose
    return d

## Converting measures

In [3]:
# convert bit to megabyte
def convert_Bit_to_Megabyte(bit):
    """Convert a size in bits to megabytes (1 MB = 1024 * 1024 bytes of 8 bits)."""
    bits_per_megabyte = 8 * 1024 * 1024
    return bit / bits_per_megabyte
def CalCharsbit(word):
    """Return the size of ``word`` in bits, counting 8 bits per element."""
    return 8 * len(word)
# storage of a list
def calstorageofList(alist):
    """Return the summed bit size (via ``CalCharsbit``) of every element of ``alist``.

    Any iterable is accepted; a plain string is therefore measured
    character by character.
    """
    return sum(CalCharsbit(element) for element in alist)

## Parameters

In [4]:
# Reading the required parameters from the args.json file

# Load the run configuration into a plain dictionary.
with open('args.json', 'r') as json_file:
    parameters = json.load(json_file)

# Pull the individual settings out of the configuration in one go.
(threshold, Memorysize, Hashsize,
 encSize, thestartrow, AES_key) = (
    parameters[setting]
    for setting in ("threshold", "Memorysize", "Hashsize",
                    "encSize", "thestartrow", "AES_key")
)

# The Damerau (transposition-aware) distance is switched off for this run.
ifdemaru = False

In [5]:
# Echo the active configuration between two separator rows.
config_fields = (
    "data config: threshold: ", threshold,
    " DL(F/T): ", ifdemaru,
    " Memorysize: ", Memorysize, " MB ",
    " Hash size: ", Hashsize,
    " Encryption size ", encSize,
)
print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
print(*config_fields)
print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
data config: threshold:  6  DL(F/T):  False  Memorysize:  0.005  MB   Hash size:  8  Encryption size  128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


## Reading dataset

In [6]:
# Reading data from dataset
# train.json is read as JSON lines: one JSON object per line.
# The context manager closes the file handle (the bare open() inside the
# comprehension never did), and whitespace-only lines are skipped so a stray
# blank line does not abort the load.
with open('train.json', 'r', encoding='utf-8') as dataset_file:
    data = [json.loads(line) for line in dataset_file if line.strip()]

# Keep only the 'navigation_text' field of every record, in file order.
DataSet = [item['navigation_text'] for item in data]

In [7]:
# Number of entries in the dataset
SampleSize = len(DataSet)

# Raw storage of every entry in bits (character-by-character calculation)
liststr = [calstorageofList(entry_text) for entry_text in DataSet]

## Tokenization

In [8]:
# Tokenization

#nltk.download('punkt')
def tokenize(sampleEmail):
    """Split one dataset entry into word tokens.

    The entry is converted with ``str()`` and handed to
    ``nltk.tokenize.word_tokenize``; its result is returned unchanged.
    """
    text = str(sampleEmail)
    return nltk.tokenize.word_tokenize(text)
    
# One token list per dataset entry, in the same order as DataSet
tokenized = [tokenize(entry) for entry in DataSet]

# Total number of tokens over all entries
NumofTokenizedWords = sum(len(tokens) for tokens in tokenized)

## Extracting Bases & devs

In [9]:
def string_to_binary(astring):
    """Concatenate the binary form of every character's code point.

    Each character contributes ``format(ord(ch), 'b')``, i.e. no leading
    zeros, so the number of bits per character is not fixed.
    """
    pieces = [f"{ord(ch):b}" for ch in astring]
    return ''.join(pieces)

In [10]:
def crc_remainder(input_string, polynomial_bitstring, initial_filler):
    """Compute the CRC remainder of ``input_string`` for a generator polynomial.

    The string is first turned into bits with ``string_to_binary``; the
    polynomial is given as a bit string (leading zeros are ignored).
    ``initial_filler`` should be '1' or '0' and fills the positions appended
    after the message before the division starts.

    Returns the remainder as a bit string one character shorter than the
    stripped polynomial.
    """
    message_bits = string_to_binary(input_string)
    divisor = polynomial_bitstring.lstrip('0')
    message_length = len(message_bits)
    work = list(message_bits + initial_filler * (len(divisor) - 1))
    # Long division over GF(2): XOR the divisor in at the first remaining
    # '1' until the message part is all zeros.
    while '1' in work[:message_length]:
        start = work.index('1')
        for offset, divisor_bit in enumerate(divisor):
            position = start + offset
            work[position] = '0' if divisor_bit == work[position] else '1'
    return ''.join(work[message_length:])

In [11]:
def binary_random_polynomial(bitsNumber):
    """Return a random bit string made of a leading '1' plus ``bitsNumber`` random bits.

    The result is therefore ``bitsNumber + 1`` characters long; the random
    bits are drawn one at a time with ``random.randint(0, 1)``.
    """
    random_bits = ''.join(str(random.randint(0, 1)) for _ in range(bitsNumber))
    return '1' + random_bits

In [12]:
def encryption_AES(text):
    """Encrypt ``text`` with AES in EAX mode using the module-level ``AES_key``.

    Both the key and the text are UTF-8 encoded first. Only the ciphertext
    bytes are returned.

    NOTE(review): neither the cipher's nonce nor the authentication tag is
    returned or stored, so a receiver presumably cannot decrypt or verify
    the result - confirm whether that is intended.
    """
    key_bytes = AES_key.encode("utf8")
    plaintext = text.encode("utf8")
    cipher = AES.new(key_bytes, AES.MODE_EAX)
    ciphertext, _tag = cipher.encrypt_and_digest(plaintext)
    return ciphertext

In [13]:
def to_base64(byte_cipher):
    """Return the Base64 encoding of ``byte_cipher`` as a ``bytes`` object."""
    encoded = base64.b64encode(byte_cipher)
    return encoded

In [14]:
def sending_crc_devs(devs, crc, host="test.mosquitto.org", topic="/zvrqrh"):
    """Publish ``(devs, crc)`` over MQTT with the ``mosquitto_pub`` client.

    Parameters:
        devs: deviations (edit operations) to send.
        crc: CRC bit string identifying the base.
        host: MQTT broker host name.
        topic: MQTT topic to publish on.

    The payload is ``str((devs, crc))``. The command is passed as an
    argument list, not as a shell string, so quotes, ``$`` or backticks in
    the payload cannot break or alter the command line. As before, a failing
    publish does not stop the caller: a non-zero exit status is ignored and
    a missing ``mosquitto_pub`` executable only raises a warning.
    """
    message = str((devs, crc))
    command = ["mosquitto_pub", "-h", host, "-t", topic, "-m", message]
    try:
        subprocess.run(command, check=False)
    except FileNotFoundError:
        warnings.warn("mosquitto_pub executable not found; message was not published")

In [15]:
def sending_crc_devs_encyption(devs, crc, encrypted_base64, host="test.mosquitto.org", topic="/zvrqrh"):
    """Publish ``(devs, crc, encrypted_base64)`` over MQTT with ``mosquitto_pub``.

    Parameters:
        devs: deviations (edit operations) to send.
        crc: CRC bit string identifying the base.
        encrypted_base64: Base64 form of the encrypted base.
        host: MQTT broker host name.
        topic: MQTT topic to publish on.

    The payload is ``str((devs, crc, encrypted_base64))``. The command is
    passed as an argument list, not as a shell string, so quotes, ``$`` or
    backticks in the payload cannot break or alter the command line. As
    before, a failing publish does not stop the caller: a non-zero exit
    status is ignored and a missing ``mosquitto_pub`` executable only raises
    a warning.
    """
    message = str((devs, crc, encrypted_base64))
    command = ["mosquitto_pub", "-h", host, "-t", topic, "-m", message]
    try:
        subprocess.run(command, check=False)
    except FileNotFoundError:
        warnings.warn("mosquitto_pub executable not found; message was not published")

In [16]:
# Shared state used by the base/deviation extraction loop

wordnet_lemmatizer = WordNetLemmatizer()

# How many edit operations of each kind were emitted
numberofDels = numberofIns = numberofRep = numberoftrans = 0

# Random generator polynomial used for every CRC of this run
# NOTE(review): no random seed is set, so the polynomial differs between runs.
polynomial_bitstring = binary_random_polynomial(Hashsize)

# Base chosen for the word currently being processed
FinalBase = ""

# Local (device-side) store: bases, their hit counts and size in bits
Base_Freq = {}
localBases = []
localstorageHASHbased = 0

# Traffic towards the server
HashListSenttoServer = []
SendServerencryptedbased = 0
Sendmorehashes = 0
numodfDuplicateSentBases = 0

# Server-side storage simulation
notmergedBases = []
mergedBases = []
mergedduplicates = 0
mergedBasesSize = 0
notmergedBasesSize = 0
pointertonewlyadded = 0

# Size in bits of every deviation list that was emitted
DeviationsStorage = []

In [17]:
# Extracting Bases & devs

#nltk.download('wordnet')
# For every token: pick a base (the lemma or the word itself), work out the
# deviations that rebuild the word from that base, then simulate the local
# store, the messages sent to the server and the server-side storage.
for singleEntry in tokenized:
    for word in singleEntry:
        devsize = 0
        ops = []

        # ---- base extraction starts here ----
        # Only words that fit into encSize bits (8 bits per character) are
        # lemmatized.
        if len(word)*8 <= int(encSize):
            # The lemma of the word is the candidate base.
            extractedBase = wordnet_lemmatizer.lemmatize(word)

            # Edit script that rebuilds `word` from the candidate base.
            # The matrix is kept under the module-level name `dist_matrix`.
            dist_matrix = _levenshtein_distance_matrix(extractedBase, word, is_damerau=ifdemaru)
            ops = get_ops(extractedBase, word, is_damerau=ifdemaru)

            numofedits = len(ops)

            # Fewer edits than the threshold (and at least one): keep the
            # lemma as base and send `ops` as the deviation.
            if numofedits > 0 and numofedits < int(threshold):
                FinalBase = extractedBase
                # Bits needed to address one character position of the base.
                position_bits = math.ceil(math.log2(len(extractedBase)))
                for op in ops:
                    if op[0] == 'delete':
                        devsize += (2 + 8 + position_bits)
                        numberofDels += 1
                    elif op[0] == 'replace':
                        devsize += (2 + 8 + position_bits)
                        numberofRep += 1
                    elif op[0] == 'insert':
                        devsize += (2 + 8 + position_bits)
                        numberofIns += 1
                    elif op[0] == 'transpose':
                        devsize += (2 + position_bits)
                        numberoftrans += 1
                DeviationsStorage.append(devsize)

            # No edits, or at least `threshold` of them: the word itself is
            # the base.
            else:
                FinalBase = word
                # The base already equals the word, so there is no deviation
                # to send. Leaving the lemma-to-word edit script in `ops`
                # would make the receiver apply it to the full word.
                ops = []

        # The word does not fit into encSize bits: use it verbatim as base.
        else:
            FinalBase = word
        # ---- base extraction ends here ----

        # Base already in the local store: only its hash is sent.
        if FinalBase in localBases:
            # Count the bases that were sent as a hash only.
            numodfDuplicateSentBases += 1

            # The CRC of the base serves as its identifier.
            crc = crc_remainder(FinalBase, polynomial_bitstring, '0')

            # Sending: (ops, hash)
            sending_crc_devs(ops, crc)
            HashListSenttoServer.append(crc)

            # Increase the frequency of the base.
            if FinalBase in Base_Freq:
                Base_Freq[FinalBase] += 1
            else:
                Base_Freq[FinalBase] = 1

        # Base not in the local store: check whether there is room left.
        else:
            # Storage needed for one hash, in MB.
            OnehashsizeinMB = convert_Bit_to_Megabyte(int(Hashsize))
            # Local store still has room for one more hash.
            if convert_Bit_to_Megabyte(localstorageHASHbased) <= (float(Memorysize)-OnehashsizeinMB):

                # The raw value is stored to keep the membership test simple.
                localBases.append(FinalBase)

                # Account for the hash in the local store size.
                # ToDo[tag.2]: we should calculate hashes, create a storage(dict) for local hashes
                # and store hashes there
                crc = crc_remainder(FinalBase, polynomial_bitstring, '0')
                localstorageHASHbased += int(Hashsize)

                # Increase the frequency of the base.
                if FinalBase in Base_Freq:
                    Base_Freq[FinalBase] += 1
                else:
                    Base_Freq[FinalBase] = 1

                # ToDo[tag.1]: We should really send it to server (publisher)
                # Encrypt the base and convert it to Base64 before sending;
                # its size depends on the encryption method.
                encrypted_text = encryption_AES(FinalBase)
                encrypted_base64_text = to_base64(encrypted_text)

                # Ask: should we do the server(publisher) side procedure?
                sending_crc_devs_encyption(ops, crc, encrypted_base64_text)
                Sendmorehashes += int(Hashsize)
                SendServerencryptedbased += int(encSize)

                # Server-side storage simulation.
                if FinalBase not in mergedBases:
                    mergedBasesSize += int(encSize)
                    mergedBases.append(FinalBase)
                    pointertonewlyadded += 1
                else:
                    mergedduplicates += 1

            # Local store is full.
            else:
                # Evict the base with the minimum frequency.
                min_freq = min(Base_Freq, key=Base_Freq.get)
                localBases.remove(min_freq)
                Base_Freq.pop(min_freq)
                # The newcomer starts with frequency zero.
                Base_Freq[FinalBase] = 0
                localBases.append(FinalBase)

                # Encrypt the base for the server; its size depends on the
                # encryption method.
                encrypted_text = encryption_AES(FinalBase)
                encrypted_base64_text = to_base64(encrypted_text)

                # ToDo[tag.2]: we should calculate hashes, create a storage(dict) for local hashes
                # and store hashes there
                # (Ask: the raw FinalBase is stored and compared here, not its hash (CRC);
                # should we compare against the CRC instead?)
                crc = crc_remainder(FinalBase, polynomial_bitstring, '0')

                sending_crc_devs_encyption(ops, crc, encrypted_base64_text)
                Sendmorehashes += int(Hashsize)
                SendServerencryptedbased += int(encSize)

                # Server-side storage simulation.
                # Ask: should we do the server(publisher) side procedure?
                if FinalBase not in mergedBases:
                    pointertonewlyadded += 1
                    mergedBases.append(FinalBase)
                    mergedBasesSize += int(encSize)
                else:
                    mergedduplicates += 1

## Saving the results

In [18]:
# Server storage estimate (pointer based)
totaldups = numodfDuplicateSentBases + mergedduplicates

# Bit totals that the storage figures below are built from
deviation_bits = sum(DeviationsStorage)
bits_per_pointer = math.log2(len(mergedBases))
duplicate_pointer_bits = totaldups * bits_per_pointer
new_pointer_bits = pointertonewlyadded * bits_per_pointer
merged_hash_bits = int(Hashsize) * len(mergedBases)

ServerStorage = convert_Bit_to_Megabyte(
    deviation_bits + mergedBasesSize + duplicate_pointer_bits + merged_hash_bits
)

pointerofnewlyaddedsize = convert_Bit_to_Megabyte(new_pointer_bits)
ServerStoragewithpointer = convert_Bit_to_Megabyte(
    deviation_bits + mergedBasesSize + duplicate_pointer_bits + new_pointer_bits + merged_hash_bits
)

# Every figure of the run; the values are turned into strings just below.
raw_results = {
    'Memory size ': Memorysize,
    'SampleSize ': SampleSize,
    'raw  storage (MB) ': convert_Bit_to_Megabyte(sum(liststr)),
    'number of tokenized words': NumofTokenizedWords,
    'hash size': Hashsize,
    'threshold': threshold,
    'enc size': encSize,
    'ifdemaru': ifdemaru,
    'local storage (MB)': convert_Bit_to_Megabyte(localstorageHASHbased),
    'ServerStorage (MB)': ServerStorage,
    'pointers to newly (MB)': pointerofnewlyaddedsize,
    'Total ServerStoragewithpointer (MB)': ServerStoragewithpointer,
    'encrypted Sent bases (MB)': convert_Bit_to_Megabyte(SendServerencryptedbased),
    'encrypted deviations (MB)': convert_Bit_to_Megabyte(deviation_bits),
    'Send Server Hash  (MB)': convert_Bit_to_Megabyte(len(HashListSenttoServer) * Hashsize),
    'Sendmorehashes (MB)': convert_Bit_to_Megabyte(Sendmorehashes),
    'size of merged stored bases (MB)': convert_Bit_to_Megabyte(mergedBasesSize),
    'numodfDuplicateSentBases (hashes)': numodfDuplicateSentBases,
    'number of merged duplicates (final bases)': mergedduplicates,
    'number of merged bases': len(mergedBases),
    'len(deviationList)': len(DeviationsStorage),
    'tot pointer storage for merged': convert_Bit_to_Megabyte(duplicate_pointer_bits),
}
results = {label: str(value) for label, value in raw_results.items()}

# Store the results as one JSON object
path = 'touchdownResultsJuly_1402_7_3'
serialized_results = json.dumps(results)
with open(path, 'w') as f:
    f.write(serialized_results)

In [19]:
# Server storage estimate (pointer based)
totaldups = numodfDuplicateSentBases + mergedduplicates

# Bit totals that the storage figures below are built from
deviation_bits = sum(DeviationsStorage)
bits_per_pointer = math.log2(len(mergedBases))
duplicate_pointer_bits = totaldups * bits_per_pointer
new_pointer_bits = pointertonewlyadded * bits_per_pointer
merged_hash_bits = int(Hashsize) * len(mergedBases)

ServerStorage = convert_Bit_to_Megabyte(
    deviation_bits + mergedBasesSize + duplicate_pointer_bits + merged_hash_bits
)

pointerofnewlyaddedsize = convert_Bit_to_Megabyte(new_pointer_bits)
ServerStoragewithpointer = convert_Bit_to_Megabyte(
    deviation_bits + mergedBasesSize + duplicate_pointer_bits + new_pointer_bits + merged_hash_bits
)

# One value per column; each becomes a single-row string column below.
result_row = {
    'Memory size ': Memorysize,
    'SampleSize ': SampleSize,
    'raw  storage (MB) ': convert_Bit_to_Megabyte(sum(liststr)),
    'number of tokenized words': NumofTokenizedWords,
    'hash size': Hashsize,
    'threshold': threshold,
    'enc size': encSize,
    'ifdemaru': ifdemaru,
    'local storage (MB)': convert_Bit_to_Megabyte(localstorageHASHbased),
    'ServerStorage (MB)': ServerStorage,
    'pointers to newly (MB)': pointerofnewlyaddedsize,
    'Total ServerStoragewithpointer (MB)': ServerStoragewithpointer,
    'encrypted Sent bases (MB)': convert_Bit_to_Megabyte(SendServerencryptedbased),
    'encrypted deviations (MB)': convert_Bit_to_Megabyte(deviation_bits),
    'Send Server Hash  (MB)': convert_Bit_to_Megabyte(len(HashListSenttoServer) * Hashsize),
    'Sendmorehashes (MB)': convert_Bit_to_Megabyte(Sendmorehashes),
    'size of merged stored bases (MB)': convert_Bit_to_Megabyte(mergedBasesSize),
    'numodfDuplicateSentBases (hashes)': numodfDuplicateSentBases,
    'number of merged duplicates (final bases)': mergedduplicates,
    'number of merged bases': len(mergedBases),
    'len(deviationList)': len(DeviationsStorage),
    'tot pointer storage for merged': convert_Bit_to_Megabyte(duplicate_pointer_bits),
}
df = pd.DataFrame({label: [str(value)] for label, value in result_row.items()})

# Write the DataFrame to an Excel file
excel_file_path = 'touchdownResultsJuly_1402_7_2.xlsx'
df.to_excel(excel_file_path, index=False)

print(f"DataFrame successfully written to {excel_file_path}")

DataFrame successfully written to touchdownResultsJuly_1402_7_2.xlsx
