In [16]:
import numpy
import os  

class LDA:
    """Latent Dirichlet Allocation fitted by collapsed Gibbs sampling.

    Documents are sequences of integer word ids in ``range(V)``. The model
    keeps three count tables, each pre-loaded with its prior pseudo-count:

    * ``n_m_z`` -- (documents x K) topic counts per document, plus ``alpha``
    * ``n_z_t`` -- (K x V) word counts per topic, plus ``beta``
    * ``n_z``   -- (K,) total word count per topic, plus ``V * beta``

    ``z_m_n[m][n]`` holds the topic currently assigned to the n-th word of
    document m, and ``N`` is the total number of words in ``docs``.
    """

    def __init__(self, K, alpha, beta, docs, V, smartinit=True):
        """Allocate the counters and assign an initial topic to every word.

        Args:
            K: number of topics.
            alpha: parameter of the topics prior.
            beta: parameter of the words prior.
            docs: sequence of documents, each a sequence of word ids.
            V: vocabulary size (word ids index the second axis of ``n_z_t``).
            smartinit: when true, each initial topic is drawn from the
                distribution implied by the counts accumulated so far;
                otherwise it is drawn uniformly from ``range(K)``.
        """
        self.K = K
        self.alpha = alpha  # parameter of topics prior
        self.beta = beta    # parameter of words prior
        self.docs = docs
        self.V = V

        self.z_m_n = []  # topics of words of documents
        self.n_m_z = numpy.zeros((len(self.docs), K)) + alpha  # word count of each document and topic
        self.n_z_t = numpy.zeros((K, V)) + beta                # word count of each topic and vocabulary
        self.n_z = numpy.zeros(K) + V * beta                   # word count of each topic

        self.N = 0
        for m, doc in enumerate(docs):
            self.N += len(doc)
            z_n = []
            for t in doc:
                if smartinit:
                    z = self._sample_topic(self.n_m_z[m], t)
                else:
                    z = numpy.random.randint(0, K)
                z_n.append(z)
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1
            self.z_m_n.append(numpy.array(z_n))

    def _sample_topic(self, n_m_z, t):
        """Draw one topic index for word id ``t`` given a document's topic counts."""
        p_z = self.n_z_t[:, t] * n_m_z / self.n_z
        # a single multinomial trial yields a one-hot vector; argmax is the drawn index
        return numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

    def inference(self):
        """Run one sampling sweep: resample the topic of every word once."""
        for m, doc in enumerate(self.docs):
            z_n = self.z_m_n[m]
            n_m_z = self.n_m_z[m]  # row view: updates below write through to self.n_m_z
            for n, t in enumerate(doc):
                # discount for n-th word t with topic z
                z = z_n[n]
                n_m_z[z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1

                # sampling topic new_z for t
                new_z = self._sample_topic(n_m_z, t)

                # set z the new topic and increment counters
                z_n[n] = new_z
                n_m_z[new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1

    def worddist(self):
        """Return the (K x V) topic-word distribution; every row sums to 1."""
        return self.n_z_t / self.n_z[:, numpy.newaxis]

    def perplexity(self, docs=None):
        """Return the perplexity of ``docs`` under the current counts.

        Args:
            docs: documents to score; defaults to the training documents.
                Document ``m`` is scored with the topic counts and length of
                training document ``m``, so ``docs`` must be index-aligned
                with ``self.docs``.

        Raises:
            ZeroDivisionError: if ``docs`` contains no words at all.
        """
        # `is None` rather than `== None`: comparing a numpy array with `==`
        # is element-wise and cannot be used as an `if` condition.
        if docs is None:
            docs = self.docs
        phi = self.worddist()
        log_per = 0
        N = 0
        Kalpha = self.K * self.alpha
        for m, doc in enumerate(docs):
            theta = self.n_m_z[m] / (len(self.docs[m]) + Kalpha)
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:, w], theta))
            N += len(doc)
        return numpy.exp(log_per / N)

def lda_learning(lda, iteration, voca, fw):
    """Train ``lda`` for ``iteration`` sweeps and write topic listings to ``fw``.

    The starting perplexity is printed. After each sweep the perplexity is
    recomputed; the first time it rises above the previous value the current
    topic-word listing is written and tracking stops. A final listing is
    always written once all sweeps are done.
    """
    last_perp = lda.perplexity()
    print ("initial perplexity=%f" % last_perp)
    for _ in range(iteration):
        lda.inference()
        current_perp = lda.perplexity()
        if not last_perp:
            # tracking was switched off after the first increase
            continue
        if current_perp > last_perp:
            output_word_topic_dist(lda, voca, fw)
            last_perp = None
            continue
        last_perp = current_perp
    output_word_topic_dist(lda, voca, fw)

def output_word_topic_dist(lda, voca, fw, top_n=15):
    """Write the highest-probability words of every topic to ``fw``.

    For each topic a header line with the topic index and the number of words
    currently assigned to it is written, followed by one word per line in
    descending order of ``lda.worddist()``. A separator line closes the
    listing and ``OK`` is printed to stdout.

    Args:
        lda: model exposing ``K``, ``docs``, ``z_m_n`` and ``worddist()``.
        voca: indexable mapping from word id to the word's text.
        fw: object with a ``write(str)`` method.
        top_n: how many words to list per topic (default 15).
    """
    # words currently assigned to each topic
    zcount = numpy.zeros(lda.K, dtype=int)
    for xlist, zlist in zip(lda.docs, lda.z_m_n):
        for _, z in zip(xlist, zlist):
            zcount[z] += 1

    phi = lda.worddist()

    for k in range(lda.K):
        fw.write("\n-- topic: %d (%d words)" % (k, zcount[k]))
        fw.write("\n")
        # negate so argsort yields ids from most to least probable
        for w in numpy.argsort(-phi[k])[:top_n]:
            fw.write("%s" % (voca[w]))
            fw.write("\n")
    fw.write("\n")
    fw.write("－－－－－－－－－－－－－－－－－－－－－－－－")
    fw.write("\n")
    print('OK')

def main(test, fw):
    """Parse options from ``test``, build the corpus and model, then train.

    Args:
        test: list of command-line style arguments for optparse, e.g.
            ``['-f', path, '--alpha', 1, '--beta', 0.5]``. Entries are
            converted to ``str`` before parsing. ``None`` is passed through
            unchanged, in which case optparse reads ``sys.argv[1:]``.
        fw: object with a ``write(str)`` method; receives the run header and
            the topic listings produced during training.
    """
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=6)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=True)
    # -s defaults to True, so on its own it can never be switched off; this
    # additive flag makes the uniform random initialisation reachable.
    parser.add_option("--random-init", dest="smartinit", action="store_false", help="initialize topics uniformly at random instead of smart initialize")
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)

    # optparse treats arguments as strings; callers may hand in raw numbers.
    argv = None if test is None else [str(arg) for arg in test]
    (options, args) = parser.parse_args(argv)
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)

    print (voca.vocas)
    # the same summary line goes to stdout and to the output file
    header = "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    print (header)
    fw.write(header)
    fw.write("\n")

    lda_learning(lda, options.iteration, voca, fw)

    

In [17]:
# Sweep alpha over 1..10 with beta fixed, writing one output file per setting.
a=1
b=0.5
time = 1  # run counter; incremented per run but not read by the code in this cell

if __name__ == "__main__":

    while a<=10:
        out_path = "C:/Users/USER/Desktop/專題/K=6/smart_int/a="+str(a)+",b="+str(b)+".txt"
        # command-line style arguments are strings; convert the numeric settings
        test = ['-f','C:/Users/USER/Desktop/專題/corpus_16.txt','--alpha',str(a),'--beta',str(b)]
        # `with` closes the file even if main() raises part-way through a run
        with open(out_path, 'a', encoding = 'utf8') as fw:
            main(test, fw)
        time+=1
        a=round(a+1,1)
    

['春夏', '復刻', '目前', '鞋身', '經典', '定價', '品牌', '聯名', '後續', '潮流', 'nike', 'logo', 'nba', '消息', '元素', '作品', '街頭', '球鞋', '主題', '特色', '文化', '服飾', '創意', '話題', '全球', 'instagram', '造型', '風格', '低調', '靈感', '面料', '標誌', '鞋面', '朋友', '細節', '系列', '質感', '鞋履', 'cool', '鞋帶', '生活', 'jordan', '材質', '字樣', '售價', '運動', '鞋型', '門市', '外型', '市場', '旗下', '科技', '穿搭', '跑鞋', 'boost', '官方', '白色', 'adidas', 'primeknit', '高筒', '整體', '鞋迷', '方式', '視覺', '世界', '概念', '色調', '顏色', '氣墊', '單品', '網站', '皮革', '輪廓', '機會', '雙方', '年代', '圖案', '臺灣', '款式', '店鋪', '官網', '實體', '聯乘', '外觀', '麂皮', '資訊', '版本', '粉絲', '黑色', '藍本', '效果', '時間', '設計師', '美國', '紐約', '腳感', '日本', '興趣', 'converse', '人氣', '女鞋', '低筒', '紅色', 'timberland', '鞋店', 'outlet', '鞋子', '評價', '鞋墊', '企劃', '標誌性', '價格', '秋冬', '色彩', '型號', '襪子', '鞋舌', '技術', '折扣', '皮鞋', '店家', '高雄', '牌子', '靴子', '信息', '預算', '習慣', '專櫃', '店面', '休閒鞋', '價位', '價錢', '意見', '優惠', 'ptt', 'van', '店員', '正品', '賣家', '網拍']
corpus=49067, words=140, K=6, a=1.000000, b=0.500000
initial perplexity=107.983697
OK
OK
['春夏', '復刻', '目

initial perplexity=109.440219
OK
OK
['春夏', '復刻', '目前', '鞋身', '經典', '定價', '品牌', '聯名', '後續', '潮流', 'nike', 'logo', 'nba', '消息', '元素', '作品', '街頭', '球鞋', '主題', '特色', '文化', '服飾', '創意', '話題', '全球', 'instagram', '造型', '風格', '低調', '靈感', '面料', '標誌', '鞋面', '朋友', '細節', '系列', '質感', '鞋履', 'cool', '鞋帶', '生活', 'jordan', '材質', '字樣', '售價', '運動', '鞋型', '門市', '外型', '市場', '旗下', '科技', '穿搭', '跑鞋', 'boost', '官方', '白色', 'adidas', 'primeknit', '高筒', '整體', '鞋迷', '方式', '視覺', '世界', '概念', '色調', '顏色', '氣墊', '單品', '網站', '皮革', '輪廓', '機會', '雙方', '年代', '圖案', '臺灣', '款式', '店鋪', '官網', '實體', '聯乘', '外觀', '麂皮', '資訊', '版本', '粉絲', '黑色', '藍本', '效果', '時間', '設計師', '美國', '紐約', '腳感', '日本', '興趣', 'converse', '人氣', '女鞋', '低筒', '紅色', 'timberland', '鞋店', 'outlet', '鞋子', '評價', '鞋墊', '企劃', '標誌性', '價格', '秋冬', '色彩', '型號', '襪子', '鞋舌', '技術', '折扣', '皮鞋', '店家', '高雄', '牌子', '靴子', '信息', '預算', '習慣', '專櫃', '店面', '休閒鞋', '價位', '價錢', '意見', '優惠', 'ptt', 'van', '店員', '正品', '賣家', '網拍']
corpus=49067, words=140, K=6, a=10.000000, b=0.500000
initial perple

In [8]:
# Add 0.1 four times starting from 0.1's first term (left to right, as separate
# steps would), rounding to one decimal at the end to hide float drift.
b = round(0.1 + 0.1 + 0.1 + 0.1, 1)
print(b)

0.4


In [2]:
# Word counts for four topics (presumably copied from a K=4 run's output -- TODO confirm)
zero, one, two, three = 25155, 22928, 22673, 23016

al = sum((zero, one, two, three))

# percentage of all counted words that falls in each topic
print("\n".join(
    "topic" + str(idx) + " : " + str(count / al * 100)
    for idx, count in enumerate((zero, one, two, three))
))

topic0 : 26.825704901249843
topic1 : 24.450795546645054
topic2 : 24.178859361003287
topic3 : 24.54464019110182


In [9]:
# NOTE(review): this cell raises NameError. In the visible cells `phi` is only
# ever bound as a local variable inside LDA.perplexity and
# output_word_topic_dist; nothing assigns a module-level `phi`.
print(phi)

NameError: name 'phi' is not defined