In [1]:
import nltk
from nltk.collocations import * 
import pandas as ps
import math
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from gensim.models import doc2vec
from collections import namedtuple
import re



In [2]:
ps.options.display.max_columns = 999

In [3]:
data = ps.read_csv("Emails.csv")

In [4]:
print(data["ExtractedBodyText"].describe())

count     6742
unique    5916
top        Fyi
freq       174
Name: ExtractedBodyText, dtype: object


In [5]:
text = data["ExtractedBodyText"]
text.fillna("", inplace=True)
text = text[text!=""]

In [6]:
len(text)

6742

In [7]:
vect = CountVectorizer(ngram_range = (2, 2), min_df = 2)
X = vect.fit_transform(text)

Для предобработки мы убрали все уникальные биграммы.

In [8]:
bigram_freq = list(zip(vect.get_feature_names(), np.asarray(X.sum(axis=0)).ravel()))
bigram_freq.sort(key=lambda a: a[1], reverse=True)
bigram_freq[:20]

[('of the', 3360),
 ('in the', 2536),
 ('to the', 1466),
 ('on the', 1341),
 ('for the', 1059),
 ('and the', 1057),
 ('that the', 791),
 ('to be', 785),
 ('state gov', 761),
 ('with the', 754),
 ('at the', 716),
 ('of state', 596),
 ('it is', 573),
 ('will be', 569),
 ('by the', 523),
 ('from the', 510),
 ('the united', 462),
 ('secretary office', 460),
 ('state department', 455),
 ('united states', 449)]

Неудивительно, что самые частые биграммы состоят из союзов и слов, которые должно употреблять в служебной переписке высокопоставленному чиновнику США.

In [9]:
vect = CountVectorizer(min_df = 3)
X = vect.fit_transform(text)
words = vect.inverse_transform(X)
words_merge = [words[i][j] for i in range(len(words)) for j in range(len(words[i]))]

In [10]:
len(words_merge)

279722

In [11]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(
   words_merge)
finder.apply_freq_filter(4)
finder.nbest(bigram_measures.pmi, 20)

[('9598', '6421'),
 ('aires', 'buenos'),
 ('lte', '4g'),
 ('parvez', 'ashfaq'),
 ('scowcroft', 'brent'),
 ('flu', 'swine'),
 ('heel', 'achilles'),
 ('kayani', 'parvez'),
 ('quam', 'lois'),
 ('qureshi', 'mehmood'),
 ('samaria', 'judea'),
 ('6421', 'hongju'),
 ('dobb', '3959'),
 ('duck', 'lame'),
 ('kyi', 'suu'),
 ('ltstrategy', '1626'),
 ('spence', 'roy'),
 ('suu', 'aung'),
 ('wendy', 'sherman'),
 ('proxies', 'puppet')]

Такие биграммы получены по PMI

In [136]:
kmeans = KMeans(n_clusters=5, random_state=1).fit(X)

In [137]:
labels = kmeans.labels_

In [138]:
ps.Series(labels).value_counts()

4    6489
0     202
2      32
1      16
3       3
dtype: int64

Простая кластеризация методом Kmeans

In [19]:
from gensim.models import doc2vec

In [20]:
from collections import namedtuple

In [139]:
vect = CountVectorizer(min_df = 1)
X = vect.fit_transform(text)
text_list = vect.inverse_transform(X)
docs = []
analyzedDocument = namedtuple("SSZ", 'words tags')
for i, mail in enumerate(text_list):
    tags = [i]
    docs.append(analyzedDocument(list(mail), tags))

Предобработка текста для последующей обработки при помощи doc2vec

In [22]:
model = doc2vec.Doc2Vec(docs, size = 100, window = 300, min_count = 3, workers = 4)

In [100]:
kmeans = KMeans(n_clusters=6, random_state=0).fit(model.docvecs)
labels = kmeans.labels_

In [101]:
vect = CountVectorizer(min_df = 3)
X = vect.fit_transform(text)

0 - неформальные письма, не относящиеся к политике, пересланные сообщения и письма с прикреплениями  (Fyi), деловая переписка
1- ?
2 - обсуждение прессы, статей, интервью, стенограмм
3 - ? упоминается ливия, сирия, йемен ?
4- ?
5 - формальные письма, преимущественно о политике, много упоминаний военизированных и полицейских структур (military?)

In [119]:
ps.Series(labels).value_counts()

0    4644
5    1434
3     472
2     141
1      38
4      13
dtype: int64

In [117]:
c = 0
lab = []
for i in range(1000):
    if labels[i] == 0:
        print (text[i])

Thx
FYI
Fyi
_ .....
From Randolph, Lawrence M
Sent: Wednesday, September 12, 2012 04:33 PM
To: Mills, Cheryl D
Subject: RE: Not a dry eye in NEA
Including mine. Her remarks were really moving. Chriswas an amazing man. Such a huge loss. You know, I was in Libya
before coming here and in my almost ten years, I have never worked with such a calm, cool headed, funny
diplomat. Made it all seem really easy- even in one of the hardest places to work in the world.
Hope. See picture below Kamala sent.
Another photo.
This is nice.
H <hrod17@clintonernaii.com›
Wednesday, September 12, 2012 11:26 PM
esullivanjj@state.gov'
Fw: Fwd: more on libya
Libya 37 sept 12 12,docx
We should get this around asap.
Pis print.
H < hrod17@clintoriernail.corn>
Wednesday, September 12, 2012 11:28 PM
°Russont@state.gov°
Fw: Fwd: more on libya
Libya 37 sept 12 12.dacx
Pis print.
Follow Up Flag: Follow up
Flag Status: Flagged
More info,
B5
Sidney Blumenthal
Thursday, September 13, 2012 12:29 AM
H: 