In [1]:
import pandas as pd
import json
import pickle
from tqdm import tqdm

In [2]:
# Original files:
# - Fib-25: Fib-25/Fib25.txt
# - LGN: LGN/LGN.txt
# - Janelia: Janelia/Janelia.json

Fib_org = 'Fib-25/Fib25.txt'
LGN_org = 'LGN/LGN.txt'
Janelia_org = 'Janelia/Janelia.json'
PATH = 'PDM/PART1/datasets/'

# Check Fib-25

In [39]:
f = open(Fib_org, 'r')
n_nodes = f.readline()
n_edges = f.readline()

edge_list_f = pd.DataFrame(columns=['source', 'target', 'weight'])
for l in tqdm(f.readlines()):
    _list = l.split(',')
    if _list[0] != _list[1]:
        edge_list_f = edge_list_f.append({k: int(_list[i].rstrip('\n')) for i, k in enumerate(edge_list_f.columns)}, ignore_index=True)


100%|██████████| 9515/9515 [00:15<00:00, 611.47it/s]


In [40]:
# Make unique fragment IDs from 1-n+1
set_frag = list(set(edge_list_f.source.unique())|set(edge_list_f.target.unique()))
n_frag = len(set_frag)
dict_frag = {set_frag[i]: i+1 for i in tqdm(range(n_frag))}
edge_list_f.source = edge_list_f.source.map(dict_frag)
edge_list_f.target = edge_list_f.target.map(dict_frag)

100%|██████████| 749/749 [00:00<00:00, 868066.79it/s]


In [41]:
# Check for repeated connections
edge_list_f[edge_list_f.duplicated(keep=False)]

Unnamed: 0,source,target,weight


In [42]:
# Check for connections that are both ways
edge_list_f[(edge_list_f[['target', 'source']].values == edge_list_f[['source', 'target']]).source]

Unnamed: 0,source,target,weight


In [43]:
print('edges : {}'.format(len(edge_list_f)))
print('nodes : {}'.format(max(edge_list_f.source.max(), edge_list_f.target.max())))
print('mean weight : {}'.format(edge_list_f.weight.mean()))
print('max weight : {}'.format(edge_list_f.weight.max()))

edges : 9515
nodes : 749
mean weight : 3.282816605359958
max weight : 188


In [51]:
reduced_edge_list_f = edge_list_f[edge_list_f.weight>5]

In [52]:
# Make unique fragment IDs from 1-n+1
set_frag = list(set(reduced_edge_list_f.source.unique())|set(reduced_edge_list_f.target.unique()))
n_frag = len(set_frag)
dict_frag = {set_frag[i]: i+1 for i in tqdm(range(n_frag))}
reduced_edge_list_f.source = reduced_edge_list_f.source.map(dict_frag)
reduced_edge_list_f.target = reduced_edge_list_f.target.map(dict_frag)

100%|██████████| 474/474 [00:00<00:00, 889530.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [53]:
print('edges : {}'.format(len(reduced_edge_list_f)))
print('nodes : {}'.format(max(reduced_edge_list_f.source.max(), reduced_edge_list_f.target.max())))

edges : 1141
nodes : 474


# Check LGN

In [15]:
f = open(LGN_org, 'r')
n_nodes = f.readline()
n_edges = f.readline()

edge_list_l = pd.DataFrame(columns=['source', 'target', 'weight'])
for l in tqdm(f.readlines()):
    _list = l.split('\t')
    if _list[0] != _list[1]:
        edge_list_l = edge_list_l.append({k: _list[i].rstrip('\n') for i, k in enumerate(edge_list_l.columns)}, ignore_index=True)

100%|██████████| 818/818 [00:01<00:00, 448.35it/s]


In [16]:
# Make unique fragment IDs from 1-n+1
set_frag = list(set(edge_list_l.source.unique())|set(edge_list_l.target.unique()))
n_frag = len(set_frag)
dict_frag = {set_frag[i]: i+1 for i in tqdm(range(n_frag))}
edge_list_l.source = edge_list_l.source.map(dict_frag)
edge_list_l.target = edge_list_l.target.map(dict_frag)

100%|██████████| 420/420 [00:00<00:00, 862687.40it/s]


In [17]:
# Check for repeated connections
edge_list_l[edge_list_l.duplicated(keep='first')]

Unnamed: 0,source,target,weight


In [18]:
# Check for connections that are both ways
tmp = pd.DataFrame(edge_list_l)
tmp.columns = ['target', 'source', 'weight']
tmp.append(edge_list_l, ignore_index=True)
tmp[tmp.duplicated(keep=False)]

Unnamed: 0,target,source,weight


In [None]:
edge_list_l.source

In [19]:
print('edges : {}'.format(len(edge_list_l)))
print('nodes : {}'.format(max(edge_list_l.source.max(), edge_list_l.target.max())))

edges : 818
nodes : 420


# Check Janelia

In [20]:
f = json.load(open(Janelia_org, 'r'))

In [None]:
edge_list = pd.DataFrame(columns=['source', 'target'])
n = len(f['data'])
for d in tqdm(f['data']):
    #d = f['data'][i]
    e = {'source': d['T-bar']['body ID']}
    if d['partners']:
        for p in d['partners']:
            e['target'] = p['body ID']
            edge_list = edge_list.append(e, ignore_index=True)
backup = pd.DataFrame(edge_list)
pickle.dump(backup, open('Janelia/pickled_df.p', 'wb+'))

In [21]:
# Load pickled df
edge_list = pickle.load(open('Janelia/pickled_df.p', 'rb'))

In [22]:
# Make unique fragment IDs from 1-n+1
set_frag = list(set(edge_list.source.unique())|set(edge_list.target.unique()))
n_frag = len(set_frag)
dict_frag = {set_frag[i]: i+1 for i in tqdm(range(n_frag))}
edge_list.source = edge_list.source.map(dict_frag)
edge_list.target = edge_list.target.map(dict_frag)

100%|██████████| 92133/92133 [00:00<00:00, 1413711.55it/s]


In [23]:
source_not_target_bool = ~(edge_list.source == edge_list.target)
edge_list = edge_list[source_not_target_bool]
edge_list['weight'] = 1
weighted_edge_list = edge_list.groupby(['source', 'target']).count().reset_index()

In [24]:
# Check for repeated connections
weighted_edge_list[weighted_edge_list.duplicated(keep=False)]

Unnamed: 0,source,target,weight


In [25]:
# Check for connections that are both ways
tmp = pd.DataFrame(weighted_edge_list)
tmp.columns = ['target', 'source', 'weight']
tmp.append(weighted_edge_list, ignore_index=True)
tmp[tmp.duplicated(keep=False)]

Unnamed: 0,target,source,weight


In [28]:
reduced_edge_list = weighted_edge_list.loc[weighted_edge_list.weight>15]

In [29]:
# Make unique fragment IDs from 1-n+1
set_frag = list(set(reduced_edge_list.source.unique())|set(reduced_edge_list.target.unique()))
n_frag = len(set_frag)
dict_frag = {set_frag[i]: i+1 for i in tqdm(range(n_frag))}
reduced_edge_list.source = reduced_edge_list.source.map(dict_frag)
reduced_edge_list.target = reduced_edge_list.target.map(dict_frag)

100%|██████████| 460/460 [00:00<00:00, 699811.33it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [30]:
print('edges : {}'.format(len(reduced_edge_list)))
print('nodes : {}'.format(max(reduced_edge_list.source.max(), reduced_edge_list.target.max())))

edges : 889
nodes : 460
