In [255]:
import json
import networkx 
import pandas as pd
from networkx.algorithms.components.connected import connected_components
from datetime import datetime

In [256]:
# Load the raw records from disk. JSON interchange text is UTF-8, so the
# encoding is pinned here instead of inheriting the platform default, which is
# not UTF-8 on every OS and would mis-decode non-ASCII characters.
with open("contacts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Get data with id as index

In [257]:
# Map each record's "Id" to the record itself for direct lookup by id.
# If two records share an Id, the later one in `data` wins.
data_with_index = {record["Id"]: record for record in data}

In [258]:
# Preview only the records whose id is one of 0..9.
{
    record_id: record
    for record_id, record in data_with_index.items()
    if record_id in range(10)
}

{0: {'Id': 0,
  'Email': 'gkzAbIy@qq.com',
  'Phone': '',
  'Contacts': 1,
  'OrderId': ''},
 1: {'Id': 1,
  'Email': '',
  'Phone': '329442681752',
  'Contacts': 4,
  'OrderId': 'vDDJJcxfLtSfkooPhbYnJdxov'},
 2: {'Id': 2,
  'Email': '',
  'Phone': '9125983679',
  'Contacts': 0,
  'OrderId': ''},
 3: {'Id': 3,
  'Email': 'mdllpYmE@gmail.com',
  'Phone': '',
  'Contacts': 0,
  'OrderId': 'bHquEnCbbsGLqllwryxPsNOxa'},
 4: {'Id': 4, 'Email': '', 'Phone': '300364407', 'Contacts': 2, 'OrderId': ''},
 5: {'Id': 5, 'Email': '', 'Phone': '840113148', 'Contacts': 0, 'OrderId': ''},
 6: {'Id': 6,
  'Email': 'hESiBgYTxMnj@hotmail.com',
  'Phone': '',
  'Contacts': 4,
  'OrderId': ''},
 7: {'Id': 7,
  'Email': 'hqyunTiaXfIjhO@yahoo.com',
  'Phone': '',
  'Contacts': 1,
  'OrderId': 'vxjYZtMUvNhtEBDruzQfjpsmX'},
 8: {'Id': 8,
  'Email': 'XGUjcSiDpFKWEUCs@qq.com',
  'Phone': '223604620644',
  'Contacts': 2,
  'OrderId': ''},
 9: {'Id': 9,
  'Email': 'osIIQgAiZX@hotmail.com',
  'Phone': '',
  'Contac

# Get ids linked by either email/phone/orderid

In [261]:
# Group record ids by every identifier they carry.
# Keys are (field, value) pairs rather than bare values, so an identical
# string appearing in two different fields (e.g. a phone number that equals an
# order id) cannot merge two unrelated groups.
index = {}
for record in data:
    for field in ("Email", "Phone", "OrderId"):
        value = record[field]
        if value:  # an empty value means the record has nothing to link on here
            index.setdefault((field, value), []).append(record["Id"])

# One list of ids per distinct identifier; ids in the same list share it.
linked_ids = list(index.values())

In [262]:
# Number of id groups produced by the identifier index (one group per indexed key).
len(linked_ids)

629135

# Get ids linked transitively through any chain of shared email/phone/orderid (connected components)

In [263]:
def to_graph(l):
    """Build an undirected graph that connects the ids inside each sublist.

    Every element of every sublist becomes a node. Within a sublist, each
    element is joined to the one that follows it, so the whole sublist lands
    in a single connected component without adding every pairwise edge.

    Args:
        l: Iterable of sliceable sequences (e.g. lists) of hashable node ids.

    Returns:
        A ``networkx.Graph`` holding those nodes and chain edges.
    """
    graph = networkx.Graph()
    for members in l:
        # Register the nodes first so single-element sublists are still present.
        graph.add_nodes_from(members)
        # Chain consecutive members: n items need only n - 1 edges.
        consecutive_pairs = zip(members[:-1], members[1:])
        graph.add_edges_from(consecutive_pairs)
    return graph

# Link every id list into one graph; each connected component is then a set
# of ids tied together directly or through intermediate records.
G = to_graph(linked_ids)
linked_ids_combined = [component for component in connected_components(G)]

In [264]:
# Report the number of connected groups, then preview the first ten.
print(len(linked_ids_combined))
linked_ids_combined[:10]

291904


[{0},
 {1, 2458, 98519, 115061, 140081, 165605, 476346},
 {2, 159312, 322639, 348955},
 {3},
 {4},
 {5, 50, 212533, 215197, 226720, 383605, 404324, 458692, 482810},
 {6, 38, 32871, 142067, 236367},
 {7},
 {8, 183160, 406623},
 {9, 13, 16708, 33415, 343161, 417916, 468927, 484896}]

# Add linked_ids and total contacts to data

In [265]:
# Annotate every record with its group's trace string:
#   "<sorted member ids joined by '-'>, <sum of Contacts over the group>"
# The loop variables are deliberately not named `linked_ids` or `id`: reusing
# `linked_ids` would overwrite the list of id groups built earlier (so
# re-running the graph-building cell afterwards would fail), and `id` would
# shadow the builtin.
for group_ids in linked_ids_combined:
    contacts_total = sum(
        data_with_index[member_id]["Contacts"] for member_id in group_ids
    )
    output = "-".join(map(str, sorted(group_ids))) + ", " + str(contacts_total)
    for member_id in group_ids:
        data_with_index[member_id]["ticket_trace/contact"] = output

In [266]:
# Preview only the records whose id is one of 0..9.
{
    record_id: record
    for record_id, record in data_with_index.items()
    if record_id in range(10)
}

{0: {'Id': 0,
  'Email': 'gkzAbIy@qq.com',
  'Phone': '',
  'Contacts': 1,
  'OrderId': '',
  'ticket_trace/contact': '0, 1'},
 1: {'Id': 1,
  'Email': '',
  'Phone': '329442681752',
  'Contacts': 4,
  'OrderId': 'vDDJJcxfLtSfkooPhbYnJdxov',
  'ticket_trace/contact': '1-2458-98519-115061-140081-165605-476346, 12'},
 2: {'Id': 2,
  'Email': '',
  'Phone': '9125983679',
  'Contacts': 0,
  'OrderId': '',
  'ticket_trace/contact': '2-159312-322639-348955, 4'},
 3: {'Id': 3,
  'Email': 'mdllpYmE@gmail.com',
  'Phone': '',
  'Contacts': 0,
  'OrderId': 'bHquEnCbbsGLqllwryxPsNOxa',
  'ticket_trace/contact': '3, 0'},
 4: {'Id': 4,
  'Email': '',
  'Phone': '300364407',
  'Contacts': 2,
  'OrderId': '',
  'ticket_trace/contact': '4, 2'},
 5: {'Id': 5,
  'Email': '',
  'Phone': '840113148',
  'Contacts': 0,
  'OrderId': '',
  'ticket_trace/contact': '5-50-212533-215197-226720-383605-404324-458692-482810, 15'},
 6: {'Id': 6,
  'Email': 'hESiBgYTxMnj@hotmail.com',
  'Phone': '',
  'Contacts': 4,
 

# Extract wanted columns

In [267]:
# Keep only the two output columns: the ticket id and its trace string.
df = pd.DataFrame(
    [
        {"ticket_id": record["Id"], 'ticket_trace/contact': record['ticket_trace/contact']}
        for record in data_with_index.values()
    ]
)

In [268]:
# Show the assembled frame for a visual check.
df

Unnamed: 0,ticket_id,ticket_trace/contact
0,0,"0, 1"
1,1,"1-2458-98519-115061-140081-165605-476346, 12"
2,2,"2-159312-322639-348955, 4"
3,3,"3, 0"
4,4,"4, 2"
...,...,...
499995,499995,"499995, 2"
499996,499996,"499996, 4"
499997,499997,"499997, 2"
499998,499998,"121111-499998, 5"


In [269]:
# Write the result to CSV; index=False leaves the row index out of the file.
df.to_csv("output_networkx.csv", index=False)