In [1]:
import pandas as pd
import re
from gensim.models import doc2vec, ldamodel
from gensim import corpora

In [2]:
df = pd.read_csv("./resources/HillaryEmails.csv")
df = df[['Id', 'ExtractedBodyText']].dropna()
print(df.head())
print(df.shape)                 

   Id                                  ExtractedBodyText
1   2  B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...
2   3                                                Thx
4   5  H <hrod17@clintonemail.com>\nFriday, March 11,...
5   6  Pis print.\n-•-...-^\nH < hrod17@clintonernail...
7   8  H <hrod17@clintonemail.corn>\nFriday, March 11...
(6742, 2)


In [3]:
def clean_email_text(text):
    # 数据清洗
    text = text.replace('\n', " ")  # 新行，我们是不需要的
    text = re.sub(r"-", " ", text)  # 把 "-" 的两个单词，分开。（比如：july-edu ==> july edu）
    text = re.sub(r"\d+/\d+/\d+", "", text)  # 日期，对主体模型没什么意义
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # 时间，没意义
    text = re.sub(r"[\w]+@[\.\w]+", "", text)  # 邮件地址，没意义
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)  # 网址，没意义
    pure_text = ''
    # 以防还有其他特殊字符（数字）等等，我们直接把他们loop一遍，过滤掉
    for letter in text:
        # 只留下字母和空格
        if letter.isalpha() or letter == ' ':
            pure_text += letter
    # 再把那些去除特殊字符后落单的单词，直接排除。
    # 我们就只剩下有意义的单词了。
    text = ' '.join(word for word in pure_text.split() if len(word) > 1)  # 而且单词长度必须是2以上
    return text

In [4]:
docs = df['ExtractedBodyText']
docs = docs.apply(lambda s: clean_email_text(s))

# print(docs.head(1).values)

doclist = docs.values
print(docs)

1       Thursday March PM Latest How Syria is aiding Q...
2                                                     Thx
4       Friday March PM Huma Abedin Fw Latest How Syri...
5       Pis print Wednesday September PM Fw Meet The R...
7       Friday March PM Huma Abedin Fw Latest How Syri...
                              ...                        
7938    Hi Sorry havent had chance to see you but did ...
7939    assume you saw this by now if not its worth re...
7941    Big change of plans in the Senate Senator Reid...
7943    PVerveer Friday December AM From Please let me...
7944                                            See below
Name: ExtractedBodyText, Length: 6742, dtype: object


In [5]:
def remove_stopword():
    stopword = []
    with open('./resources/stopwords.txt', 'r', encoding='utf8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace('\n', '')
            stopword.append(line)
    return stopword

In [6]:
stop_word = remove_stopword()
texts = [[word for word in doc.lower().split() if word not in stop_word] for doc in doclist]
print(texts[0])  # 第一个文本现在的样子

['thursday', 'march', 'pm', 'latest', 'syria', 'aiding', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'march', 'hillary']


In [7]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[0])

[(0, 3), (1, 2), (2, 1), (3, 2), (4, 1), (5, 2), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 3), (12, 1)]


In [8]:
from pprint import pprint 
lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
print(lda.print_topic(10, topn=5))  # 第10个主题最关键的五个词
# pprint(lda.print_topics(num_topics=20, num_words=3))  # 所有的主题

0.006*"obama" + 0.005*"american" + 0.005*"president" + 0.005*"time" + 0.005*"government"


In [9]:
lda.save("lda.model")

In [10]:
lda = ldamodel.LdaModel.load("lda.model")
text = "I was greeted by this heartwarming display on the corner of my street today. Thank you to all of you who did this. Happy Thanksgiving. -H"
text = clean_email_text(text)

texts = [word for word in text.lower().split() if word not in stop_word]
bow = dictionary.doc2bow(texts)
print(lda.get_document_topics(bow))

[(2, 0.29656535), (4, 0.12008975), (10, 0.27462456), (13, 0.1942672)]
