In [1]:
from tqdm import *
import numpy as np
import time, jieba, os, json, csv, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from score_functions import twostage
from itertools import starmap
from bm25 import BM25Transformer

# --- directories (relative to the notebook's working directory) ---
_DATA_DIR = os.path.join('..', 'data')
_TOKEN_DIR = os.path.join('..', 'tokens')
_SUBMIT_DIR = os.path.join('..', 'submit')

# --- inputs living under ../data ---
queryFile = os.path.join(_DATA_DIR, 'QS_1.csv')        # query CSV (first row is skipped when read)
stopwordFile = os.path.join(_DATA_DIR, "stopword.txt")  # whitespace-separated stop words
titleJson = os.path.join(_DATA_DIR, "title.json")      # JSON mapping looked up by document key
queryDictFile = os.path.join(_DATA_DIR, 'dict.txt')    # user dictionary handed to jieba

# --- pre-tokenised corpus: one entry per line, the two files are read in parallel ---
tokenFile = os.path.join(_TOKEN_DIR, 'search_dict_token.txt')
tokeyFile = os.path.join(_TOKEN_DIR, 'search_dict_tokey.txt')

# --- output ---
outputFile = os.path.join(_SUBMIT_DIR, 'current.csv')

# Segmentation function used for both titles and queries.
cut_method = jieba.cut_for_search

# Register the user dictionary with jieba before any text is segmented.
jieba.load_userdict(queryDictFile)

def retain_chinese(line):
    """Keep only the characters of `line` in the range U+4E00..U+9FA5.

    Every other character (Latin letters, digits, punctuation, whitespace)
    is removed, and the variant character '臺' is rewritten as '台'.
    """
    chinese_only = re.sub(r"[^\u4e00-\u9fa5]", '', line)
    return chinese_only.replace('臺', '台')

def get_screen_len(line):
    """Return the display width of `line` in terminal columns.

    Characters kept by `retain_chinese` count as two columns each and
    every other character counts as one, i.e. the result is
    ``len(line)`` plus the number of retained characters.
    """
    wide_chars = len(retain_chinese(line))
    return len(line) + wide_chars

if __name__ == '__main__':

    def read_text(path):
        """Return the whole content of the text file at `path`, closing it afterwards."""
        # NOTE(review): the platform default encoding is used, as in the
        # original code; pass encoding= explicitly if the data files are
        # ever read on a machine whose default differs.
        with open(path, 'r') as handle:
            return handle.read()

    def trim(lines):
        """Strip every entry of `lines` and drop the entries that end up empty."""
        return [t.strip() for t in lines if t.strip()]

    # Stop words: kept as a list under the original name, plus a frozenset
    # for O(1) membership tests inside the title loop below.
    stopwords = read_text(stopwordFile).split()
    stopword_set = frozenset(stopwords)

    # Queries: every CSV row after the header becomes one dict entry
    # (dict() requires each row to have exactly two columns).
    # newline='' is what the csv module documents for files given to csv.reader.
    with open(queryFile, 'r', newline='') as query_fh:
        queries = dict(list(csv.reader(query_fh))[1:])

    with open(titleJson, "r") as title_fh:
        titles = json.load(title_fh)

    # Pre-tokenised documents and their keys; line i of one file belongs to
    # line i of the other.
    token = trim(read_text(tokenFile).split('\n'))
    tokey = trim(read_text(tokeyFile).split('\n'))

    # Validate BEFORE the loop below indexes token[i] for every key: a short
    # token file would otherwise surface as an unrelated IndexError, and the
    # old exit(0) reported success to the caller.
    if len(token) != len(tokey):
        raise ValueError(
            'token len should eq to tokey len (got {} tokens, {} keys)'.format(
                len(token), len(tokey)))

    # append title to doc
    print("""
appending title to document...
""")

    # Number of times the segmented title is appended to its document.
    title_weight = 1

    for i, key in enumerate(tqdm(tokey)):
        # retain_chinese() keeps only characters in U+4E00..U+9FA5, so the
        # result can never contain whitespace or equal the ASCII string
        # "Non"; an emptiness test is all that is needed here.
        title = retain_chinese(titles.get(key, ''))
        if not title:
            continue
        title_words = [w for w in cut_method(title) if w not in stopword_set]
        token[i] += ' {}'.format(' '.join(title_words)) * title_weight

    bm25 = BM25Transformer()
    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
    # of two or more word characters, so single-character words are dropped
    # from the vocabulary; and BM25Transformer is fed TF-IDF weights rather
    # than raw counts. Both may be intentional -- confirm.
    vectorizer = TfidfVectorizer()
    print("""
    building corpus vector space...
        """)

    doc_tf = vectorizer.fit_transform(tqdm(token))

    bm25.fit(doc_tf)
    doc_bm25 = bm25.transform(doc_tf)

    print('\ncorpus vector space - ok\n')

    
    

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/bc/jsywdsrx5jd3sh37pspc1mhh0000gn/T/jieba.cache
Loading model cost 0.974 seconds.
Prefix dict has been built succesfully.
  0%|          | 198/99917 [00:00<00:50, 1970.64it/s]


appending title to document...



100%|██████████| 99917/99917 [00:39<00:00, 2556.44it/s]
  1%|          | 1142/99917 [00:00<00:17, 5589.91it/s]


    building corpus vector space...
        


100%|██████████| 99917/99917 [00:19<00:00, 5148.84it/s]



corpus vector space - ok



In [4]:
# Break every space-joined document string into its list of tokens.
tmptoken = []
for doc_text in token:
    tmptoken.append(doc_text.split(" "))


'北市'

In [7]:

# The documents have different token counts, so the nested list is ragged.
# Recent NumPy releases refuse to build an array from ragged input unless
# dtype=object is requested explicitly (older releases only warned), so ask
# for it: the result is an object array whose elements are the token lists.
tt = np.array(tmptoken, dtype=object)

In [8]:
from gensim import corpora

# Build the gensim token <-> id mapping from the tokenised documents ...
dictionary = corpora.Dictionary(tt)
# ... and convert every document to its bag-of-words form via doc2bow.
corpus = list(map(dictionary.doc2bow, tt))


[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 2),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 2),
 (25, 2),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 2),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 3),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 2),
 (62, 4),
 (63, 2),
 (64, 2),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 5),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 3),
 (82, 3),
 (83, 2),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 3),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),

In [12]:
from gensim.models import Phrases
from gensim.models import Word2Vec
# bigram_transformer = Phrases(tt)
# model = Word2Vec(bigram_transformer[tt], min_count=1)

In [15]:
# model.train(tt, total_examples=len(tt), epochs=100)

(2786895119, 2810294600)

In [16]:
# model.save("./model.w2v")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
# Load the Word2Vec model from ./model.w2v. The cells that would train and
# save it are commented out in this notebook, so the file must already exist
# next to the notebook.
# NOTE(review): gensim model files are presumably pickle-based -- only load
# files from a trusted source; confirm against the gensim documentation.
model = Word2Vec.load("./model.w2v")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [40]:
def cosSim(v1, v2):
    """Return the cosine similarity of the 1-D vectors `v1` and `v2`.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)``.

    If either vector has zero norm the similarity is undefined; ``0.0`` is
    returned in that case instead of ``nan`` so that a single empty vector
    cannot poison scores that are later added to other similarities.
    """
    norm_product = np.sqrt((v1 * v1).sum()) * np.sqrt((v2 * v2).sum())
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

In [21]:
# Display the embedding vector the loaded model stores for the token "支持".
model.wv["支持"]

array([ 6.5576663 ,  1.1192425 ,  1.5257726 , -2.9596596 , -2.0510314 ,
        3.5871036 ,  0.413788  , -0.7508377 , -2.3556578 , -3.4461    ,
        1.4267297 , -0.21273573, -1.4951439 , -1.0722139 ,  5.6647687 ,
       -0.2764437 ,  0.51061463,  1.975527  ,  1.1960163 , -3.9791973 ,
        2.8652143 , -3.17638   ,  4.322652  ,  1.3153908 , -3.5056844 ,
        3.4308507 ,  0.60897857,  3.1287992 , -3.9260778 ,  4.308882  ,
        4.4180827 ,  2.2497222 ,  2.1650782 , -0.40933117,  2.160309  ,
       -3.3779986 , -2.163458  ,  1.3997703 , -1.3266774 ,  0.19308692,
        3.073745  ,  0.2268004 , -3.1317358 ,  0.4952132 , -0.21560237,
       -3.2652175 ,  3.0387008 ,  1.5051608 , -4.8596516 , -1.9798121 ,
        1.7529259 ,  1.5999805 ,  4.234009  , -1.6427894 , -0.666432  ,
        0.03241699,  0.38046533,  4.646892  ,  1.2129058 , -1.4001262 ,
       -1.7795942 , -5.3961287 , -0.7493347 ,  1.7067924 , -3.6159568 ,
        4.4780188 , -1.8129102 ,  2.1592896 ,  0.10626329,  1.82

In [174]:
# One row per document: the sum of the embeddings of all of its tokens.
# The embedding width is read off the vector of the very first token.
embedding_dim = len(model.wv[tmptoken[0][0]])
tmpVec = np.zeros((len(tmptoken), embedding_dim))
for doc_vec, doc_words in zip(tmpVec, tmptoken):
    # doc_vec is a view onto the matching row of tmpVec, so the in-place
    # addition accumulates directly into the matrix.
    for word in doc_words:
        doc_vec += model.wv[word]

In [191]:
# Display the summed embedding of the first document.
tmpVec[0]

array([-8.39992290e+01,  6.82515966e+01, -1.36086924e+02, -3.71622288e+02,
       -1.92190721e+02,  3.55742338e+02,  1.78321830e+02, -2.06715327e+01,
        2.15504038e+02, -3.01888997e+02,  2.66091637e+02, -9.64820034e+01,
       -4.01495618e+01,  3.30316797e+02,  3.16274124e+02,  1.46375351e+02,
       -2.19227789e+02,  3.03366870e+02,  5.07105749e+01, -1.00007415e+02,
        3.38908023e+02, -2.66795773e+02,  6.11915658e+01, -3.46390806e+01,
       -1.95351017e+02, -3.04292442e+01, -1.28320564e+02,  1.42307523e+02,
       -1.13359275e+02,  2.12413983e+02,  2.77743363e+01,  1.97688530e+02,
        1.64288004e+02, -3.46583797e+02, -5.09196611e+01, -3.29654030e+02,
       -2.77223392e+02,  3.57462996e+02, -4.57433445e+02,  4.50802575e+01,
        5.96218434e+00,  1.59113969e+02, -4.86398919e+01,  2.82505946e+02,
       -2.19712350e+02, -7.01310140e+01, -1.30387921e+02, -3.60528659e+02,
       -6.04555957e+01,  1.83274254e+02,  3.63330139e+01, -6.13793417e+01,
        3.00048565e+02, -

In [184]:
# One row per query: the sum of the embeddings of the query's tokens.
# The width comes from the model itself rather than from an arbitrary
# document token, so this cell no longer depends on `tmptoken`.
qVec = np.zeros((len(queries), model.wv.vector_size))

# Tokens the original code skipped unconditionally. Presumably 'ECFA' was
# excluded because it has no embedding -- TODO confirm; it is kept here so
# results for existing queries do not change.
skip_tokens = {'ECFA'}
# Tokens dropped because the model has no vector for them (reported below
# instead of raising KeyError in the middle of the loop).
missing_tokens = set()

for idx, q_id in enumerate(tqdm(queries)):
    question = queries[q_id]
    query = " ".join(w for w in cut_method(question.replace('臺', '台'))
                     if w not in stopwords)

    # Hand-written query expansion for two specific topics.
    if '中國學生' in question:
        query += ' 陸生 中生 大陸 學生'
    if '證所' in question:
        query += ' 證交稅 證交'

    for word in query.split(" "):
        if word in skip_tokens:
            continue
        if word not in model.wv:
            missing_tokens.add(word)
            continue
        qVec[idx] += model.wv[word]

if missing_tokens:
    print('query tokens without an embedding were skipped:', sorted(missing_tokens))

100%|██████████| 20/20 [00:00<00:00, 2208.69it/s]


In [192]:
# Display the summed embedding of the first query.
qVec[0]

array([  3.56810975,   4.00773245,  10.46025255, -17.80170852,
        -4.08235262,  -1.69459467,  18.36074734,   5.39938687,
        -9.23470613,  -9.42770457,  10.00529775, -16.03083272,
        -3.14838126,  20.22437549,  -3.20608684,   0.93645483,
         2.73487355,   4.40819226,   5.20818944,  17.2538532 ,
        -4.44364959,   9.43236125,   8.26833332,   7.14559656,
       -12.07207733,  -1.41059649,  -1.25651303,  -4.35024491,
         4.96286345,   7.47638   ,   3.955522  ,  -1.12594549,
        -3.11910835,  -4.110855  ,   3.38317847, -14.93816489,
         2.42774957,  19.25571698, -15.27299738,  -4.42327173,
        14.05682182,  -4.58578908,  -3.85652122,   2.89296973,
        -5.85860375,  -2.53099322,  10.76760435,   3.80943641,
        -3.24338518,  -2.05608559,   0.623032  ,  -4.30167055,
        21.68389463,   9.63797355,  -2.69032785, -10.84806818,
        -3.29620367,   8.74891186, -18.90895998,   9.19774485,
        -9.11076358, -20.73060441,  -8.32248308,  -0.81

In [190]:
# Cosine similarity between every query embedding (rows of qVec) and every
# document embedding (rows of tmpVec); the result has shape
# (number of queries, number of documents), the same as the former
# element-by-element Python loop but computed in one vectorised call.
# Rows with zero norm yield a similarity of 0 instead of nan.
scores = cosine_similarity(qVec, tmpVec)


array([[ 0.30048128, -0.10606141,  0.22084023, ...,  0.17933733,
         0.16357617,  0.21927577],
       [ 0.21159386,  0.0025054 ,  0.06804307, ...,  0.20432211,
         0.24401997,  0.09188853],
       [ 0.3701137 ,  0.16762266,  0.48283084, ...,  0.31538718,
         0.32561444,  0.3156602 ],
       ...,
       [ 0.30597629,  0.09100647,  0.22675069, ...,  0.3161897 ,
         0.29753776,  0.21291079],
       [ 0.13481879,  0.04326903,  0.20431324, ...,  0.15971502,
        -0.02473682,  0.02022542],
       [ 0.21906409,  0.06239096,  0.32394048, ...,  0.16355286,
         0.13485201,  0.26400552]])

In [None]:

    # ---- tunables for ranking and pseudo-relevance feedback ----
    top_k = 300                     # documents written per query (Rank_001 ... Rank_300)
    stages = [20, 40, 60, 80, 100]  # feedback-set size used by each successive round
    feedback_weight = 0.5           # weight of every feedback document added to the query

    def rank_rows(query_vec, extra_scores):
        """Return the row indices of `doc_bm25` ordered from best to worst match.

        The score of a document is its cosine similarity to `query_vec` plus
        the matching entry of `extra_scores` (one value per document). The
        sort is stable, so equally scored documents keep their corpus order.
        """
        sims = cosine_similarity(query_vec, doc_bm25)[0]
        sims += extra_scores
        return np.argsort(-sims, kind='stable')

    # Make sure the target directory exists before opening the file.
    out_dir = os.path.dirname(outputFile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(outputFile, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        headers = ['Query_Index'] + ['Rank_{:03d}'.format(i) for i in range(1, top_k + 1)]
        writer.writerow(headers)

        # scores[idx] is used for the idx-th query in iteration order of
        # `queries`.
        for idx, q_id in enumerate(tqdm(queries)):
            question = queries[q_id]

            query = ' '.join([w for w in cut_method(question.replace('臺', '台'))
                              if w not in stopwords])

            # Hand-written query expansion for two specific topics.
            if '中國學生' in question:
                query += ' 陸生 中生 大陸 學生'
            if '證所' in question:
                query += ' 證交稅 證交'

            init_bar = '[ stage 0/{} ] Query{}: {}'.format(len(stages), idx + 1, query)
            print(init_bar)
            qry_tf = vectorizer.transform([query])
            qry_bm25 = bm25.transform(qry_tf)

            # Initial retrieval.
            order = rank_rows(qry_bm25, scores[idx])

            for stage, fb_n in enumerate(stages):

                # "\033[F" moves the cursor up one line on ANSI terminals so
                # the stage counter overwrites itself.
                print("\033[F[ stage {}/{} ]".format(stage + 1, len(stages)))

                # Relevance feedback: add the weighted vectors of the current
                # top fb_n documents to the query. Row indices come straight
                # from the ranking (no per-document key lookup), the slice
                # cannot run past the end of a small corpus, and the builtin
                # sum replaces the deprecated np.sum(<generator>).
                feedback = sum(doc_bm25[row] * feedback_weight
                               for row in order[:fb_n])
                qry_bm25 = qry_bm25 + feedback

                order = rank_rows(qry_bm25, scores[idx])

            entry = [q_id] + [tokey[row] for row in order[:top_k]]
            writer.writerow(entry)

            # Blank out the status line and move the cursor back up.
            print("\033[F" + ' ' * get_screen_len(init_bar))
            print("\033[F" * 3)



  0%|          | 0/20 [00:00<?, ?it/s]

[ stage 0/5 ] Query1: 通姦 刑法 應該 除罪 除罪化
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


  5%|▌         | 1/20 [00:13<04:13, 13.36s/it]

[F                                                
[F[F[F
[ stage 0/5 ] Query2: 應該 取消 機車 強制 二段 段式 二段式 左轉 待轉
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 10%|█         | 2/20 [00:25<03:56, 13.14s/it]

[F                                                                    
[F[F[F
[ stage 0/5 ] Query3: 支持 博弈 特區 台灣 合法 合法化
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 15%|█▌        | 3/20 [00:38<03:39, 12.94s/it]

[F                                                     
[F[F[F
[ stage 0/5 ] Query4: 中華 航空 空服 空服員 罷工 合理
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 20%|██        | 4/20 [00:48<03:14, 12.14s/it]

[F                                                     
[F[F[F
[ stage 0/5 ] Query5: 性交 交易 性交易 應該 合法 合法化
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 25%|██▌       | 5/20 [01:00<02:59, 11.98s/it]

[F                                                       
[F[F[F
[ stage 0/5 ] Query6: ECFA 早收 清單 達到 預期 成效
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 30%|███       | 6/20 [01:11<02:45, 11.80s/it]

[F                                                   
[F[F[F
[ stage 0/5 ] Query7: 應該 減免 證所 證所稅 證交稅 證交
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 35%|███▌      | 7/20 [01:22<02:28, 11.44s/it]

[F                                                       
[F[F[F
[ stage 0/5 ] Query8: 贊成 中油 觀塘 興建 第三 天然 天然氣 接收 接收站
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 40%|████      | 8/20 [01:32<02:13, 11.14s/it]

[F                                                                      
[F[F[F
[ stage 0/5 ] Query9: 支持 中國 學生 納入 健保 陸生 中生 大陸 學生
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 45%|████▌     | 9/20 [01:46<02:11, 11.98s/it]

[F                                                                  
[F[F[F
[ stage 0/5 ] Query10: 支持 台灣 中小 中小學 含 高職 專科 服儀 規定 含 髮 襪 鞋 給予 學生 自主
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 50%|█████     | 10/20 [01:59<02:02, 12.29s/it]

[F                                                                                              
[F[F[F
[ stage 0/5 ] Query11: 不 支持 使用 加密 貨幣
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 55%|█████▌    | 11/20 [02:14<01:56, 12.90s/it]

[F                                             
[F[F[F
[ stage 0/5 ] Query12: 不 支持 雜費 學雜費 調漲
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 60%|██████    | 12/20 [02:26<01:42, 12.83s/it]

[F                                               
[F[F[F
[ stage 0/5 ] Query13: 同意 政府 舉債 發展 前瞻 建設 計畫
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 65%|██████▌   | 13/20 [02:38<01:28, 12.66s/it]

[F                                                         
[F[F[F
[ stage 0/5 ] Query14: 支持 電競 列入 體育 競技
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 70%|███████   | 14/20 [02:51<01:15, 12.53s/it]

[F                                               
[F[F[F
[ stage 0/5 ] Query15: 反對 台鐵 東移 徵收 徵收案
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 75%|███████▌  | 15/20 [03:04<01:03, 12.77s/it]

[F                                                 
[F[F[F
[ stage 0/5 ] Query16: 支持 陳 前 總統 保外 就醫
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 80%|████████  | 16/20 [03:21<00:56, 14.18s/it]

[F                                                
[F[F[F
[ stage 0/5 ] Query17: 年金 改革 應 取消 或應 調降 公教 軍公教 月退 優存 利率 十八 趴
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 85%|████████▌ | 17/20 [03:36<00:42, 14.17s/it]

[F                                                                                     
[F[F[F
[ stage 0/5 ] Query18: 同意 動物 實驗
[F[ stage 1/5 ]
[F[ stage 2/5 ]
[F[ stage 3/5 ]
[F[ stage 4/5 ]
[F[ stage 5/5 ]


 90%|█████████ | 18/20 [03:49<00:27, 13.88s/it]

[F                                     
[F[F[F
[ stage 0/5 ] Query19: 油價 應該 凍漲 緩漲
[F[ stage 1/5 ]
[F[ stage 2/5 ]
