In [11]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

def walkdir(root, filterOut=())->list:
    """Recursively collect the paths of all files below ``root``.

    Args:
        root: Directory to walk (passed straight to ``os.walk``).
        filterOut: File name(s) to skip. May be a single file name given as a
            string, any iterable of file names, or ``None`` for "skip nothing".
            Names are compared for exact equality with the bare file name
            (not the full path).

    Returns:
        A list of ``os.path.join(dirpath, filename)`` for every file that is
        not filtered out, in ``os.walk`` order.
    """
    # A bare string used to be tested with ``f in filterOut``, which is a
    # substring test (e.g. a file called "kw" was dropped by "all.kw").
    # Normalise every accepted form to a set of exact names instead.
    if filterOut is None:
        excluded = set()
    elif isinstance(filterOut, str):
        excluded = {filterOut}
    else:
        excluded = set(filterOut)

    allfile = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for f in filenames:
            if f in excluded:
                continue
            allfile.append(os.path.join(dirpath, f))
    return allfile


def write2dlist(filepath, twoDlst:list)->None:
    """Dump a list of lists to ``filepath``, one inner list per text line.

    Each element is rendered with f-string formatting and followed by a single
    space, so every line (including an empty inner list's) ends with "\\n" and
    non-empty lines carry a trailing space. The file is truncated first.
    A tqdm progress bar tracks the outer list.
    """
    with open(filepath, "w+") as sink:
        for row in tqdm(twoDlst):
            sink.write("".join(f"{item} " for item in row))
            sink.write("\n")
    

def read2dlist(filepath)->list:
    """Load a space-separated text file as a list of token lists.

    Every line of the file yields one inner list. Lines are split on the
    single space character; each piece is stripped of surrounding whitespace
    and pieces that end up empty are discarded, so a blank line produces an
    empty inner list.
    """
    with open(filepath, "r") as source:
        rows = [
            [piece.strip() for piece in raw.split(" ") if piece.strip()]
            for raw in source
        ]
    return rows

### Path 

In [13]:
# Run selection: every path used below is derived from these three values.
data = "attraction"
clusternum = 7
clusters = f"K_5_{clusternum}"

# Root folder holding the clustering output of the selected run.
resultroot = os.path.join(
    "result", data, clusters, "clustering_result"
)
# Folder of keyword (.kw) files and folder of per-cluster listings.
kwdir = os.path.join(resultroot, "kw")
clusterlstdir = os.path.join(resultroot, "eachC")

# Fail fast when an input folder is missing. FileNotFoundError is the
# exception for a path that does not exist (FileExistsError means the
# opposite: something is already there).
for required_dir in (kwdir, clusterlstdir):
    if not os.path.exists(required_dir):
        raise FileNotFoundError(required_dir)
    print(required_dir)

result/attraction/K_5_7/clustering_result/kw
result/attraction/K_5_7/clustering_result/eachC


### Get clustering data

#### file list generating

In [14]:
# Collect the input files, skipping the corpus-wide "all.kw" and the
# "nonsen.txt" listing. The names are passed as one-element lists: a bare
# string would be matched by substring inside walkdir (`f in "all.kw"`).
kwfiles = sorted(walkdir(kwdir, filterOut=["all.kw"]))
clusterfiles = sorted(walkdir(clusterlstdir, filterOut=["nonsen.txt"]))

# Pairing is positional, so it relies on both folders sorting into the same
# cluster order. NOTE(review): the sort is lexical on the full path; confirm
# the file naming keeps both lists aligned once there are 10+ clusters.
cluster_with_kw = zip(clusterfiles, kwfiles)

#### duplicate keywords for each cluster

In [15]:
# For every (cluster listing, keyword file) pair, count how often each keyword
# occurs across the cluster's documents and write the topK most frequent
# keywords to <resultroot>/dupkw/<cid>.kw, one keyword per line.
topK = 15
dupdir = os.path.join(resultroot ,"dupkw")
if not os.path.exists(dupdir):
    os.mkdir(dupdir)

cid = 0
pbar = tqdm(zip(clusterfiles, kwfiles), total=len(clusterfiles))
for cdf, kwf in pbar:
    # keyword -> [occurrence count, names of the documents it occurred in]
    kwbag = {}
    attraction_name_lst = pd.read_csv(cdf, encoding='utf-8')['name'].tolist()
    kws_lst = read2dlist(filepath=kwf)
    # Rows of the listing and lines of the keyword file are matched by
    # position; stop the whole loop if they cannot line up.
    if len(kws_lst) != len(attraction_name_lst):
        print(f"Not the same len !\n{cdf}, {kwf}")
        break

    for name, kws in zip(attraction_name_lst, kws_lst):
        for k in kws:
            entry = kwbag.setdefault(k, [0, []])
            entry[0] += 1
            entry[1].append(name)

    # Most frequent first; ties keep their first-seen order (stable sort).
    kwlst = sorted(
        ([kw, hits, names] for kw, (hits, names) in kwbag.items()),
        key=lambda row: row[1],
        reverse=True,
    )

    with open(os.path.join(dupdir,f"{cid}.kw") ,"w+") as f:
        f.writelines(f"{row[0]}\n" for row in kwlst[:topK])

    cid += 1
    

100%|██████████| 7/7 [00:00<00:00, 26.65it/s]


### Logistic regression to find important keywords for each cluster

In [6]:
# Load the label tensor of the run selected in the path cell. The path is
# built from `data` / `clusters` so it follows the configuration instead of
# silently pointing at a different run when those values change.
# Presumably one row per document and one column per cluster (it is later
# used as the BCE target) — TODO confirm.
# NOTE: torch.load unpickles the file; only load files you trust.
labels = torch.load(os.path.join("result", data, clusters, "cluster.pt"))

In [8]:
# Keyword lists for the whole corpus: one inner list per line of all.kw.
all_kw_path = os.path.join(kwdir, "all.kw")
kwlist = read2dlist(all_kw_path)
kwlist[:3]

[['巧克力',
  '建築',
  '展場',
  '工廠',
  '宏亞',
  '主題',
  '陳列物',
  '外觀',
  '歷史',
  '寓教',
  '於樂',
  '專業',
  '廠館',
  '外景',
  '造型',
  '文字',
  '食品'],
 ['金屬',
  '工廠',
  '鋼金',
  '部門',
  '創意館',
  '板金',
  '產品',
  '流程',
  '品牌',
  '館區',
  '鋼鐵',
  '協力',
  '思維',
  '決議',
  '專業板',
  '參觀',
  '志鋼',
  '電腦',
  '作業',
  '系統',
  '手動',
  '氣動',
  '知性',
  '光化',
  '雷射',
  '化生',
  '文化',
  '成形'],
 ['生態',
  '泛舟',
  '出海口',
  '虹影',
  '步道',
  '阿美族',
  '溪畔',
  '長虹橋',
  '姑巒',
  '交際',
  '地標',
  '水鳥',
  '季節',
  '風景',
  '時光',
  '奚卜蘭',
  '獅球嶼',
  '綠意',
  '小島',
  '聖地',
  '集塊',
  '岩構',
  '島嶼',
  '鳥類',
  '魚類',
  '地點',
  '下午茶',
  '拱形',
  '靠岸',
  '山海']]

In [10]:
# Build the vocabulary: each distinct keyword gets the next free integer id,
# in order of first appearance. kw_idx maps keyword -> id, idx_kw is the
# inverse, and idx ends up as the vocabulary size.
kw_idx = {}
idx_kw = {}
for doc_keywords in tqdm(kwlist):
    for keyword in doc_keywords:
        if keyword in kw_idx:
            continue
        new_id = len(kw_idx)
        kw_idx[keyword] = new_id
        idx_kw[new_id] = keyword
idx = len(kw_idx)
print(idx)

100%|██████████| 4674/4674 [00:00<00:00, 119803.57it/s]

28887





In [11]:
# Zero-initialised matrix with one row per row of `labels` and one column
# per vocabulary entry.
num_rows = labels.shape[0]
features = torch.zeros(num_rows, idx, dtype=torch.float32)

print(features.shape)

torch.Size([4674, 28887])


In [12]:
# Binary bag-of-words: set column kw_idx[keyword] to 1 in the row of every
# document whose keyword list contains that keyword.
for docid, doc_keywords in enumerate(kwlist):
    for keyword in doc_keywords:
        features[docid, kw_idx[keyword]] = 1
print(features[:5])

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [17]:
# Persist the bag-of-words matrix next to the other clustering results.
bow_path = os.path.join(resultroot, "bow.pt")
torch.save(features, bow_path)

In [13]:
import torch.nn.functional as F

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class LinearR(torch.nn.Module):
    """A single linear layer followed by an element-wise sigmoid.

    Args:
        in_dim: Size of each input sample.
        out_dim: Size of each output sample.
    """

    def __init__(self, in_dim, out_dim) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Return ``sigmoid(layer(x))``, i.e. values in (0, 1) per output unit."""
        return torch.sigmoid(self.layer(x))



In [18]:
# One input per vocabulary column, one sigmoid output per label column.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and the exact ranking below) can differ between runs.
n_inputs = features.shape[1]
n_outputs = labels.shape[1]
model = LinearR(in_dim=n_inputs, out_dim=n_outputs).to(device)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam.
loss = torch.nn.BCELoss().to(device)
optr = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
# Full-batch training: every step feeds the whole feature matrix through the
# model and takes one Adam step on the BCE loss.
features = features.to(device)
labels = labels.to(device)

# BCELoss needs floating-point targets. `.float()` keeps the tensor on
# whatever device it already lives on, unlike the previous
# `.type(torch.cuda.FloatTensor)`, which fails when `device` is the CPU.
# The cast is loop-invariant, so it is done once here.
targets = labels.float()

model.train()  # the mode never changes inside the loop, so set it once
bar = tqdm(range(2000))
for e in bar :
    emd = model(x = features)
    l = loss(emd, targets)
    # Show the current loss next to the progress bar.
    bar.set_postfix(ordered_dict={'l':f"{l.item():.2f}"})
    optr.zero_grad()
    l.backward()
    optr.step()
bar.close()

100%|██████████| 2000/2000 [00:11<00:00, 168.60it/s, l=0.01]


In [20]:
# Copy the linear layer's weight matrix (one row per output unit) to the CPU
# for inspection.
weightlist = model.layer.weight.cpu()
print(weightlist[:2])

tensor([[-0.4619, -0.2177, -0.3714,  ..., -0.5320, -0.5333, -0.5314],
        [-0.5270,  0.5075,  0.1233,  ..., -0.5062, -0.5130, -0.5108]],
       grad_fn=<SliceBackward0>)


In [21]:
# Use the weight magnitude, regardless of sign, as the importance score.
abswlist = weightlist.abs()
print(abswlist[:2])

tensor([[0.4619, 0.2177, 0.3714,  ..., 0.5320, 0.5333, 0.5314],
        [0.5270, 0.5075, 0.1233,  ..., 0.5062, 0.5130, 0.5108]],
       grad_fn=<SliceBackward0>)


In [24]:
# Rank the vocabulary for output unit 0: `v` holds the magnitudes in
# descending order, `i` the matching vocabulary ids.
c0_w = abswlist[0]
v, i = c0_w.sort(descending=True)
print(v[:3])


tensor([1.6713, 1.4881, 1.4856], grad_fn=<SliceBackward0>)


In [25]:
# Turn the sorted index tensor into plain ints so they can key into idx_kw,
# then print the ten highest-ranked keywords.
# NOTE: `i` is rebound from a tensor to a list here, so this cell cannot be
# re-run without first re-running the sort cell.
i = i.tolist()
top_word_ids = i[:10]
for wordid in top_word_ids:
    print(idx_kw[wordid])

遊樂行
位在菓葉村的
植面
酸菜
匯德
嬸婆
採獨棟
航空公司
卻是全
甲區
