Solution :

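1. For this we will use a mapping file: we generate a random unsigned 64-bit integer (0 to 2**64 - 1) for each email id and record the (number, email id) pair in that file.
2. We generate a unique number for each email id and write that number in place of the email id in the encoded file.
3. Limitation: This can generate at most 2**64 unique numbers, i.e. it can encode at most 2**64 distinct email ids.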

### **IMPORTANT: Do not use any existing coder / encryptor for this problem**

Encoder.py

In [None]:
import random
import os

def encode_file_usingmap(input_file_name, output_file_name, mapping_file_name):
    """Replace the e-mail column of a CSV file with random 64-bit numbers.

    The first comma-separated field of every data line is treated as the
    e-mail address.  Each distinct address is assigned one unique random
    number; repeated addresses reuse the number assigned on first sight.

    Args:
        input_file_name: Path of the CSV to encode.  A line whose first
            field is ``user_email_id`` is treated as a header and skipped.
        output_file_name: Path the encoded CSV is written to.  It always
            starts with the header ``user_id,product_id,score``.
        mapping_file_name: Path the ``number,email`` pairs are written to,
            under the header ``user_id_number,user_id``.

    Blank (or whitespace-only) input lines are ignored so that a trailing
    empty line does not produce a bogus entry for an empty e-mail address.

    NOTE(review): ``random`` is not a cryptographically secure generator;
    if the numbers must be unpredictable, ``secrets.randbits(64)`` would be
    the safer source -- confirm the requirement before switching.
    """
    used_numbers = set()
    user_email_ids = {}

    def get_unique_number():
        """Return a random 64-bit integer not yet handed out in this run."""
        while True:
            num = random.getrandbits(64)
            if num not in used_numbers:
                used_numbers.add(num)
                return num

    with open(input_file_name, 'r') as in_file, \
         open(output_file_name, 'w') as out_file, \
         open(mapping_file_name, 'w') as mapping_file:

        out_file.write("user_id,product_id,score\n")
        mapping_file.write("user_id_number,user_id\n")

        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # An empty line would otherwise be encoded as an e-mail of "".
                continue

            values = stripped.split(',')

            if values[0] == "user_email_id":
                continue  # Skip header

            email = values[0]
            if email in user_email_ids:
                unique_id = user_email_ids[email]
            else:
                unique_id = get_unique_number()
                user_email_ids[email] = unique_id
                # Written once per distinct e-mail, in first-seen order.
                mapping_file.write(f"{unique_id},{email}\n")

            values[0] = str(unique_id)
            out_file.write(','.join(values) + "\n")


if __name__ == "__main__":
    # Script entry point: encode the sample data set, writing the encoded CSV
    # and the number-to-email mapping next to it.
    encode_file_usingmap(
        input_file_name="data/our_data.csv",
        output_file_name="data/encoded_data.csv",
        mapping_file_name="data/email_mapping.csv",
    )


Decoder.py

In [None]:
import os

def decode_file_usingmap(input_file_name, output_file_name, mapping_file_name):
    """Restore e-mail addresses in an encoded CSV using a mapping file.

    The first comma-separated field of every data line is looked up in the
    mapping and replaced by the corresponding e-mail address.

    Args:
        input_file_name: Path of the encoded CSV.  A line whose first field
            is ``user_id`` is treated as a header and skipped.
        output_file_name: Path the decoded CSV is written to.  It always
            starts with the header ``user_email_id,product_id,score``.
        mapping_file_name: Path of the ``number,email`` mapping file; its
            first line is skipped as a header.

    Raises:
        ValueError: If a non-blank mapping row contains no comma.

    Numbers missing from the mapping are written as ``UNKNOWN_EMAIL``.
    Blank lines in either file are ignored.
    """
    def get_userid_mapping(mapping_file_name):
        """Load the mapping file into a ``{number_string: email}`` dict."""
        mapping = {}
        with open(mapping_file_name, 'r') as map_file:
            next(map_file, None)  # skip header (no-op on an empty file)
            for line_no, line in enumerate(map_file, start=2):
                row = line.strip()
                if not row:
                    continue  # tolerate blank / trailing empty lines
                # Split on the first comma only so that a comma inside the
                # e-mail field cannot break the two-column unpacking.
                key, sep, value = row.partition(',')
                if not sep:
                    raise ValueError(
                        f"{mapping_file_name}:{line_no}: expected "
                        f"'number,email' but got {row!r}"
                    )
                mapping[key] = value
        return mapping

    user_map = get_userid_mapping(mapping_file_name)

    with open(input_file_name, 'r') as in_file, open(output_file_name, 'w') as out_file:
        out_file.write("user_email_id,product_id,score\n")

        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # An empty line carries no record; do not emit UNKNOWN_EMAIL.
                continue

            values = stripped.split(',')

            if values[0] == "user_id":
                continue  # Skip header

            values[0] = user_map.get(values[0], "UNKNOWN_EMAIL")
            out_file.write(','.join(values) + "\n")


if __name__ == "__main__":
    # Script entry point: decode the sample encoded file back to e-mail
    # addresses using the mapping produced by the encoder.
    decode_file_usingmap(
        input_file_name="data/encoded_data.csv",
        output_file_name="data/decoded_data.csv",
        mapping_file_name="data/email_mapping.csv",
    )


Sample Input File



In [None]:
user_email_id,product_id,score
alice@example.com,10,4.0
bob_the_builder@fixit.com,12,3.8
charlie123@nowhere.org,14,2.5


Encoder Output:

In [None]:
user_id,product_id,score
10293847561028374658,10,4.0
9988776655443322110,12,3.8
12345678901234567890,14,2.5


Mapping File:

In [None]:
user_id_number,user_id
10293847561028374658,alice@example.com
9988776655443322110,bob_the_builder@fixit.com
12345678901234567890,charlie123@nowhere.org


Decoded Output:

In [None]:
user_email_id,product_id,score
alice@example.com,10,4.0
bob_the_builder@fixit.com,12,3.8
charlie123@nowhere.org,14,2.5
