In [2]:
import json
import copy
import pandas as pd
import random
from datetime import datetime

In [3]:
# Load the raw contact records. JSON text is UTF-8 by specification, so pin the
# encoding instead of relying on the platform-dependent default of open().
with open("contacts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Get data with id as index

In [4]:
# Map each record's "Id" to the record itself so records can be fetched by id.
data_with_index = {record["Id"]: record for record in data}

In [5]:
# Preview the records whose id is one of 0..4.
{
    record_id: record
    for record_id, record in data_with_index.items()
    if record_id in range(5)
}

{0: {'Id': 0,
  'Email': 'gkzAbIy@qq.com',
  'Phone': '',
  'Contacts': 1,
  'OrderId': ''},
 1: {'Id': 1,
  'Email': '',
  'Phone': '329442681752',
  'Contacts': 4,
  'OrderId': 'vDDJJcxfLtSfkooPhbYnJdxov'},
 2: {'Id': 2,
  'Email': '',
  'Phone': '9125983679',
  'Contacts': 0,
  'OrderId': ''},
 3: {'Id': 3,
  'Email': 'mdllpYmE@gmail.com',
  'Phone': '',
  'Contacts': 0,
  'OrderId': 'bHquEnCbbsGLqllwryxPsNOxa'},
 4: {'Id': 4, 'Email': '', 'Phone': '300364407', 'Contacts': 2, 'OrderId': ''}}

# Build a lookup from each email / phone / order id to the ids of the records that contain it

In [6]:
# Inverted index: every non-empty email, phone number or order id maps to the
# list of record ids carrying that value, in the order the records appear in
# `data`. All three fields share a single key space.
index = {}
for record in data:
    for field in ("Email", "Phone", "OrderId"):
        value = record[field]
        if value:
            # setdefault creates the list the first time a value is seen and
            # returns the existing list afterwards.
            index.setdefault(value, []).append(record["Id"])

In [7]:
# Peek at five random index entries. random.sample() requires a sequence:
# set-like populations (dict views included) were deprecated in Python 3.9 and
# raise TypeError from 3.11, so materialise the items into a list first.
dict(random.sample(list(index.items()), 5))

{'35076664440': [188008, 498646],
 'gcCqUghAnTlnUGgaakU@hotmail.com': [35057],
 '80388936333': [426517],
 'wrCLVabZAglAwSS@hotmail.com': [36081, 499309],
 '32396091600': [127706]}

# Process data row by row to add linked ids and sum contacts

In [8]:
# Link every record to the records that share its email, phone or order id.
#
# `searched` maps record id -> shallow copy of the record, extended with:
#   "Ids"            - set of the ids of all records linked to it so far
#   "Contacts_total" - sum of "Contacts" over that set of records
# Records are processed in the iteration order of `data_with_index`; each new
# record is merged with the groups of the already-processed records it touches.
searched = {}
new = data_with_index  # alias of the same dict, kept so the name stays defined

for record_id, record in new.items():

    # Already-processed records that share any non-empty identifier with this
    # one. The current record is not in `searched` yet, so it never matches
    # itself here.
    matched = [
        searched[linked_id]
        for field in ("Email", "Phone", "OrderId")
        if record[field]
        for linked_id in index[record[field]]
        if linked_id in searched
    ]

    # Start the record off as a group of its own.
    entry = record.copy()
    entry["Ids"] = {entry["Id"]}
    entry["Contacts_total"] = entry["Contacts"]

    searched[record_id] = entry

    if matched:
        matched.append(entry)
        # Merge the groups of every matched record with the new record.
        group_ids = set.union(*(member["Ids"] for member in matched))
        group_contacts = sum(
            searched[member_id]["Contacts"] for member_id in group_ids
        )

        # Every member of the merged group shares the same id set and total.
        for member_id in group_ids:
            searched[member_id]["Ids"] = group_ids
            searched[member_id]["Contacts_total"] = group_contacts

# Extract wanted columns

In [9]:
def get_wanted_data(dic):
    """Build the output row for one linked record.

    Args:
        dic: mapping with the keys "Id", "Ids" (an iterable of sortable ids)
            and "Contacts_total".

    Returns:
        A dict with "ticket_id" set to ``dic["Id"]`` and
        "ticket_trace/contact" set to the sorted ids joined by "-", followed
        by ", " and the contacts total.
    """
    trace = "-".join(str(linked_id) for linked_id in sorted(dic["Ids"]))
    summary = ", ".join([trace, str(dic["Contacts_total"])])
    return {"ticket_id": dic["Id"], "ticket_trace/contact": summary}
# One output row per processed record, in the iteration order of `searched`.
df = pd.DataFrame([get_wanted_data(entry) for entry in searched.values()])

In [10]:
# Display the result table (the rendered output below is truncated by pandas).
df

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,2,"2-159312-322639-348955, 4"
3,3,"3, 0"
4,4,"4, 2"
...,...,...
499995,499995,"499995, 2"
499996,499996,"499996, 4"
499997,499997,"499997, 2"
499998,499998,"121111-499998, 5"


# Compare with output from method using networkx library

In [11]:
# Load the reference output saved in output_networkx.csv and display it.
df_nx = pd.read_csv("output_networkx.csv")
df_nx

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,2,"2-159312-322639-348955, 4"
3,3,"3, 0"
4,4,"4, 2"
...,...,...
499995,499995,"499995, 2"
499996,499996,"499996, 4"
499997,499997,"499997, 2"
499998,499998,"121111-499998, 5"


In [12]:
# DataFrame.compare shows only the cells that differ between the two frames.
df.compare(df_nx)

In [13]:
# Element-wise equality between this notebook's result and the networkx one;
# `==` requires both frames to carry identical row and column labels.
compare = (df == df_nx)
compare

Unnamed: 0,ticket_id,ticket_trace/contact
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
...,...,...
499995,True,True
499996,True,True
499997,True,True
499998,True,True


In [14]:
# Collapse each comparison column to its set of distinct values; a column that
# matches on every row prints {True}.
for column in ("ticket_id", "ticket_trace/contact"):
    print(set(compare[column]))

{True}
{True}


# Compare with the output from eko's solution

In [31]:
# Load the answer file from the sibling solution_eko directory and display it.
df_eko = pd.read_csv("../solution_eko/ans.csv")
df_eko

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,2,"2-159312-322639-348955, 4"
3,3,"3, 0"
4,4,"4, 2"
...,...,...
499995,499995,"499995, 2"
499996,499996,"499996, 4"
499997,499997,"499997, 2"
499998,499998,"121111-499998, 5"


In [32]:
# DataFrame.compare shows only the cells that differ between the two frames.
df.compare(df_eko)