In [13]:
%matplotlib inline
import pandas as pd

In [14]:
df = pd.read_csv("./small_set/TTS1.csv", header = None, encoding = "big5")

- TT TT1 郵件狀態代碼 X(2) | Status_code
- TT TT2 掛號號碼 X(20) | Mail_num
- TT TT3 處理日期 X(10) | Mail_date
- TT TT4 處理時間 X(8) | Mail_time
- TT TT5 處理局號 X(6) | OP_office // operation office
- TT TT6 其它 X(42) | other

In [15]:
df.columns = ["Status_code", "Mail_num", "Mail_date", "Mail_time", "OP_office", "other"]

In [16]:
df

Unnamed: 0,Status_code,Mail_num,Mail_date,Mail_time,OP_office,other
0,Y4,00000000000000,2018-01-01,09:49:04,330031,003
1,Y4,00000000000000,2018-01-01,09:58:08,330031,002
2,I4,00000000000000,2018-01-01,14:11:51,330031,
3,Y4,00000000000000,2018-01-01,14:23:32,330031,006
4,I4,00000000000000,2018-01-01,14:52:09,330031,
5,I4,00000000000000,2018-01-01,16:15:52,330031,
6,Y4,58668700100170,2018-01-01,11:06:44,100250,001
7,I3,59928400100170,2018-01-01,21:10:45,220014,
8,P5,75233300100170,2018-01-01,05:30:09,704583,
9,A3,19491400101070,2018-01-01,21:40:46,320008,65


# 定義：狀態碼為**點**

In [17]:
df.Status_code.unique()

array(['Y4', 'I4', 'I3', 'P5', 'A3', 'Z4', 'P4', 'V4', 'H4', 'W2', 'Z2',
       'I2', 'T2', 'A1', 'V2', 'I7', 'A2', 'G2', 'T5', 'X2', 'VL', 'H7',
       'Z1', 'G1', 'I1', 'Y7', 'V3', 'T4', 'S2', 'G3', 'I5', 'T1'],
      dtype=object)

In [18]:
nodes = list(df.Status_code.unique())

In [19]:
head_nodes = sorted(list(set([node[0] for node in nodes])))

In [20]:
dic_nodes = dict( zip(head_nodes, range(len(head_nodes))) )

In [21]:
label_node = lambda x:dic_nodes[x[0]]

In [22]:
label_node("T4")

6

In [23]:
data_nodes = []
for node in nodes:
    data = { "id":node, "group": label_node(node)}
    data_nodes.append(data)

In [24]:
data_nodes

[{'group': 10, 'id': 'Y4'},
 {'group': 3, 'id': 'I4'},
 {'group': 3, 'id': 'I3'},
 {'group': 4, 'id': 'P5'},
 {'group': 0, 'id': 'A3'},
 {'group': 11, 'id': 'Z4'},
 {'group': 4, 'id': 'P4'},
 {'group': 7, 'id': 'V4'},
 {'group': 2, 'id': 'H4'},
 {'group': 8, 'id': 'W2'},
 {'group': 11, 'id': 'Z2'},
 {'group': 3, 'id': 'I2'},
 {'group': 6, 'id': 'T2'},
 {'group': 0, 'id': 'A1'},
 {'group': 7, 'id': 'V2'},
 {'group': 3, 'id': 'I7'},
 {'group': 0, 'id': 'A2'},
 {'group': 1, 'id': 'G2'},
 {'group': 6, 'id': 'T5'},
 {'group': 9, 'id': 'X2'},
 {'group': 7, 'id': 'VL'},
 {'group': 2, 'id': 'H7'},
 {'group': 11, 'id': 'Z1'},
 {'group': 1, 'id': 'G1'},
 {'group': 3, 'id': 'I1'},
 {'group': 10, 'id': 'Y7'},
 {'group': 7, 'id': 'V3'},
 {'group': 6, 'id': 'T4'},
 {'group': 5, 'id': 'S2'},
 {'group': 1, 'id': 'G3'},
 {'group': 3, 'id': 'I5'},
 {'group': 6, 'id': 'T1'}]

# 定義：狀態的改變**線** 
- 要依據狀態、郵件號碼及時間去決定線的連接
- 時間的轉換，請參考 [pandas.to_datetime()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html)


# 定義：**線**，狀態的改變
- 要依據狀態、郵件號碼及時間去決定**線** 的連接
- 時間的轉換，請參考 [pandas.to_datetime()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html)

In [25]:
df["Mail_datetime"] = df.Mail_date + " " + df.Mail_time

In [26]:
df.Mail_datetime = pd.to_datetime(df.Mail_datetime)

```
all_mail["58668700100170"] = [ ("Y4", "2018-01-01 09:49:04"),
                           ("I4", "2018-01-01 14:11:51"), ... ] 

```

In [27]:
all_mail ={}
for idx, row in df.head(5000).iterrows():
#     print(idx, row.Status_code, row.Mail_num.strip(), row.Mail_datetime)
    
    if not row.Mail_num.strip() in all_mail:
        all_mail[row.Mail_num.strip()] = []
        
    all_mail[row.Mail_num.strip()].append( (row.Status_code, row.Mail_datetime) )

In [28]:
len(all_mail.keys())

3159

In [29]:
def convert_2_edge(mail_status):
    edges = []
    for idx in range(len(mail_status)-1):
        edges.append( (mail_status[idx][0], mail_status[idx+1][0]) )
        
    return edges
# "Y4" -> "H4"
# "H4" -> "Z2"

mail_status = all_mail['96410700000070']
convert_2_edge(mail_status)

[('Y4', 'Y4'), ('Y4', 'I4'), ('I4', 'I4')]

In [30]:
all_edges = [] 
for mail_code in all_mail:
    status_num = len(all_mail[mail_code])
    
    if (status_num) > 2:
        mail_status = all_mail[mail_code]
        all_edges.extend(convert_2_edge(mail_status))

In [31]:
import collections
import numpy as np

In [32]:
value_list = {}
for edge, value in collections.Counter(all_edges).most_common(1000):
    value_list[edge] = value

list_max = np.max(list(value_list.values()))
list_min = np.min(list(value_list.values()))
list_diff = float(list_max - list_min)

k = 20

normal_val = lambda x: int(k/3+k*2*(x-list_min)/list_diff)

In [33]:
normal_val(1700)

1484

In [34]:
data_edges = []
for edge, value in collections.Counter(all_edges).most_common(1000):
    edge = {"source": edge[0], "target":edge[1], "value":normal_val(value)}
    data_edges.append(edge)

In [35]:
all_data = {"nodes":data_nodes, "links":data_edges}

In [36]:
import json

In [37]:
open("data_4_d3.json", "w").write(json.dumps(all_data, indent = 2))

4471

# 標準化

In [38]:
from sklearn import preprocessing
import numpy as np

In [39]:
data_edges = []
sss = []
for edge, value in collections.Counter(all_edges).most_common(1000):
    edge = { "source": edge[0], "target":edge[1], "value":value }
    data_edges.append(edge)
    sss.append(value) 

In [40]:
print(sss)

[47, 36, 21, 15, 11, 10, 9, 6, 4, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [41]:
X = preprocessing.scale(sss)

In [42]:
print(X)

[ 4.3887404   3.22939949  1.64848008  1.01611231  0.5945338   0.48913917
  0.38374454  0.06756066 -0.1432286  -0.1432286  -0.1432286  -0.24862322
 -0.24862322 -0.35401785 -0.35401785 -0.35401785 -0.35401785 -0.35401785
 -0.35401785 -0.35401785 -0.35401785 -0.35401785 -0.35401785 -0.45941248
 -0.45941248 -0.45941248 -0.45941248 -0.45941248 -0.45941248 -0.45941248
 -0.45941248 -0.45941248 -0.45941248 -0.45941248 -0.45941248 -0.45941248
 -0.45941248 -0.45941248 -0.45941248]


In [7]:
df['Mail_num'].value_counts()

00000000000000          13
02620770702618           6
02074210701218           5
88026300100270           5
47101000100270           5
96410700000070           4
04874230237110           4
44702310028418807913     4
02604970702618           4
07051010701218           4
05450600100170           4
44619310028418557000     4
44619810028418557555     4
83362900100170           4
09020900100170           4
28280000100070           4
03093910701218           4
58608200100170           4
07020010701218           4
07475730205628           4
24547800100270           4
04871830237110           4
18863110028418           3
18959130245016           3
31509500100170           3
58688400100170           3
07476830205628           3
00631040100878           3
44778910028418950557     3
12684340100816           3
                        ..
03573640100816           1
14206210701018           1
07705330205628           1
75607140000070           1
20792930245018           1
07705130205628           1
2

In [11]:
X = preprocessing.scale(df['Mail_num'])



In [12]:
print(X)

[-0.20302762 -0.20302762 -0.20302762 ... -0.20302719 -0.20302719
 -0.20302719]
