In [1]:
import pandas as pd
import numpy as np

# Load the e-mail corpus from a CSV file located relative to the notebook's working directory.
ds = pd.read_csv("dataset/Emails.csv")
# Show the first rows as the cell output.
ds.head()

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...
3,4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\nU.S. Department of State\nCase N...
4,5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\nFriday, March 11,...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


In [2]:
import string

# Characters treated as word separators: transform_raw_texts replaces each one with a space.
PUNCTUATION = "!\"(),.:;?[]{}…“"
# Characters that disqualify a token: `filtered` rejects any word containing one of them (digits included).
# The backslash is spelled "\\" explicitly because "\^" is not a valid escape sequence; the resulting
# string value (backslash followed by caret) is the same.
BAD_SYMBOLS = "#$%&*+/<=>@\\^|~â©ñ¹_" + string.digits


def filtered(text):
    """Return `text` unchanged if it is an acceptable word, otherwise an empty string.

    A word is rejected when it is shorter than two characters or contains any
    character listed in BAD_SYMBOLS.
    """
    rejected = any(len(text) < 2 or symbol in text for symbol in BAD_SYMBOLS)
    return '' if rejected else text


def transform_raw_texts(raw_texts):
    """Normalise a sequence of raw text strings.

    For every input string: each PUNCTUATION character is replaced by a space,
    the string is split on whitespace, words rejected by `filtered` are dropped,
    and the remaining words are lower-cased. Every kept word is followed by a
    single space in the result (so a non-empty result ends with a space).

    Returns a list with one normalised string per input string.
    """
    cleaned_texts = []

    for raw in raw_texts:
        for mark in PUNCTUATION:
            raw = raw.replace(mark, " ")

        candidates = (filtered(token) for token in raw.strip('\n').split())
        cleaned_texts.append("".join(word.lower() + " " for word in candidates if word != ''))

    return cleaned_texts

In [3]:
# Normalise every RawText entry (punctuation replaced, rejected tokens dropped, lower-cased).
texts = transform_raw_texts(ds["RawText"].tolist())
# Number of processed documents.
print(len(texts))

7945


In [4]:
# Flatten all normalised texts into one list of whitespace-separated tokens.
tokens = [token for text in texts for token in text.split()]

In [5]:
import collections
from itertools import islice

# Count adjacent token pairs (bigrams) by zipping the token list with itself shifted by one position.
# `tokens` is a single flat list over all texts, so a pair formed by the last token of one text and
# the first token of the next text is counted as well.
frequencies = collections.Counter(zip(tokens, islice(tokens, 1, None)))

# Show the 20 most frequent bigrams with their counts.
print(frequencies.most_common(20))

[(('of', 'state'), 28129), (('department', 'of'), 27169), (('doc', 'no'), 26534), (('case', 'no'), 26527), (('state', 'case'), 26515), (('no', 'date'), 26510), (('unclassified', 'department'), 26509), (('no', 'doc'), 26508), (('of', 'the'), 14052), (('date', 'unclassified'), 12830), (('in', 'the'), 10284), (('release', 'in'), 7916), (('date', 'release'), 7501), (('original', 'message'), 7012), (('subject', 're'), 6728), (('to', 'the'), 6538), (('message', 'from'), 6115), (('pm', 'to'), 5613), (('on', 'the'), 5170), (('to', 'subject'), 4484)]


In [6]:
import nltk
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder

# Association-measure object used to score bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Collect bigram statistics over the flat token list and take the 20 pairs with the highest PMI score.
# NOTE(review): no frequency filter is applied before ranking, and the output recorded for this cell
# consists of pairs that look like one-off noisy tokens. Consider calling finder.apply_freq_filter(...)
# before nbest — TODO confirm the intent.
finder = BigramCollocationFinder.from_words(tokens)
finder.nbest(bigram_measures.pmi, 20)

[("''rn", 'rince'),
 ("''s", 'pone'),
 ("'--", 'twatch'),
 ("'alexandre", "carl'"),
 ("'amitabh", "desai'"),
 ("'baby", "steps'"),
 ("'badly", "trained'"),
 ("'baer", 'danie'),
 ("'barry", "keith'"),
 ("'beitler", "ady'"),
 ("'benderslw", "matias'"),
 ("'big", "bang'"),
 ("'brian'", "'brocking"),
 ("'brocking", "elisabeth'"),
 ("'buhl", "cindy'"),
 ("'chang", "benjamin'"),
 ("'cheer", "on'"),
 ("'civ", "cas'"),
 ("'confidence", "building'"),
 ("'damage", "limitation'")]

In [7]:
from math import log

def log_likelihood(a, b, c, d):
    """Return the log-likelihood ratio statistic (G^2) of a 2x2 contingency table.

    The table cells are `a`, `b`, `c`, `d`; the margins used are (a + b), (c + d),
    (a + c) and (b + d), and the grand total is a + b + c + d.

    Cells or margins equal to zero contribute nothing to the sum, following the
    convention 0 * log(0) = 0 (the limit of x * log(x) as x -> 0). A single empty
    cell therefore no longer forces the whole score to 0; an all-zero table yields 0.

    Raises ValueError (from `log`) if a cell or margin is negative.
    """
    def xlogx(x):
        # x * log(x) with the 0 * log(0) = 0 convention.
        return x * log(x) if x else 0

    return 2 * (xlogx(a) + xlogx(b) + xlogx(c) + xlogx(d) + xlogx(a + b + c + d) -
                xlogx(a + b) - xlogx(a + c) - xlogx(b + d) - xlogx(c + d))

In [8]:
def add_word_to_dict(current_dict, word):
    """Increment the counter stored for `word` in `current_dict` (in place).

    A word not yet present starts at 1. The empty string is ignored.
    """
    if word != '':
        seen = current_dict.get(word)
        current_dict[word] = 1 if seen is None else seen + 1


def add_text_to_dict(current_dict, text):
    """Add the whitespace-separated tokens of `text` to the counters in `current_dict` (in place)."""
    for token in text.split():
        seen = current_dict.get(token)
        current_dict[token] = 1 if seen is None else seen + 1


def reduce_lexicon_dimension(current_dict, stop_words_number):
    """Split a word-count dictionary into the lexicon and its most frequent words.

    Words are ranked by descending count (ties keep their original order). The
    first `stop_words_number` words are treated as stop words.

    Returns a tuple `(remaining, stop_words)` of two dictionaries mapping word to count.
    """
    ranked = sorted(current_dict.items(), key=lambda pair: pair[1], reverse=True)
    stop_words = dict(ranked[:stop_words_number])
    remaining = dict(ranked[stop_words_number:])
    return remaining, stop_words


def current_text_keywords(current_text_dict, normative_dict, total_number_of_words, stop_words_dict, ll_score_threshold):
    """Return the keywords of one text, ordered by descending log-likelihood score.

    Parameters:
        current_text_dict: word -> count for the text being analysed (not modified).
        normative_dict: word -> count for the reference lexicon; must contain every
            non-stop word of the text, otherwise KeyError is raised.
        total_number_of_words: total word count of the reference lexicon.
        stop_words_dict: words to ignore.
        ll_score_threshold: minimum log-likelihood score a word needs to be kept.

    Returns a list of words whose score is at least `ll_score_threshold`.
    """
    # Work on a filtered copy so that the caller's dictionary is left untouched.
    content_counts = {word: count for word, count in current_text_dict.items()
                      if word not in stop_words_dict}

    current_number_of_words = sum(content_counts.values())

    current_scores_dict = {}
    for word, count in content_counts.items():
        normative_count = normative_dict[word]
        ll_score = log_likelihood(count,
                                  normative_count,
                                  current_number_of_words - count,
                                  total_number_of_words - normative_count)

        if ll_score >= ll_score_threshold:
            current_scores_dict[word] = ll_score

    return [item[0] for item in sorted(current_scores_dict.items(), key=lambda x: -x[1])]


def build_normative_dict(texts):
    """Build a word -> count dictionary over all given texts."""
    corpus_counts = {}

    for document in texts:
        add_text_to_dict(corpus_counts, document)

    return corpus_counts


def get_keywords(texts, ll_score_threshold, normative_dict, stop_words_dict,
                 total_number_of_words, lexicon_dimension):
    """Select the global keyword set for a collection of texts.

    Keywords are extracted for every text with `current_text_keywords`; each
    keyword is then ranked by the number of texts in which it was extracted, and
    the top `lexicon_dimension` keywords are returned as a set.
    """
    texts_per_keyword = {}

    for document in texts:
        document_counts = {}
        add_text_to_dict(document_counts, document)

        for keyword in current_text_keywords(document_counts, normative_dict, total_number_of_words,
                                             stop_words_dict, ll_score_threshold):
            add_word_to_dict(texts_per_keyword, keyword)

    ranked = sorted(texts_per_keyword.items(), key=lambda pair: pair[1], reverse=True)
    return {keyword for keyword, _ in ranked[:lexicon_dimension]}


def extract_features(texts, keywords_vector_dimension, stop_words_num, lexicon_dimension, ll_score_threshold):
    """Describe every text by a space-separated string of its strongest keywords.

    Steps:
      1. Count all words, then set aside the `stop_words_num` most frequent ones as stop words.
      2. Select the global keyword set (at most `lexicon_dimension` words) via `get_keywords`.
      3. For each text, keep its own keywords that belong to the global set, in
         descending score order, truncated to `keywords_vector_dimension` items.

    Note that each text's keywords are computed twice: once inside `get_keywords`
    and once more here.

    Returns a list with one keyword string per text; every keyword is followed by a space.
    """
    full_lexicon = build_normative_dict(texts)
    normative_dict, stop_words_dict = reduce_lexicon_dimension(full_lexicon, stop_words_num)
    total_number_of_words = sum(normative_dict.values())

    keywords = get_keywords(texts, ll_score_threshold, normative_dict,
                            stop_words_dict, total_number_of_words, lexicon_dimension)

    features = []

    for document in texts:
        document_counts = {}
        add_text_to_dict(document_counts, document)

        ranked = current_text_keywords(document_counts, normative_dict, total_number_of_words,
                                       stop_words_dict, ll_score_threshold)

        selected = [keyword for keyword in ranked if keyword in keywords]
        features.append("".join(keyword + " " for keyword in selected[:keywords_vector_dimension]))

    return features

In [9]:
from sklearn.cluster import KMeans
# Import from the public package: the `sklearn.cluster.hierarchical` module path is not available in
# recent scikit-learn releases, while `sklearn.cluster` exports the class in old and new versions alike.
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer

def get_kmeans_clusters_map(texts, CLUSTERS_NUMBER):
    """Cluster texts with k-means on bag-of-words count vectors.

    Parameters:
        texts: iterable of strings; each is vectorised with a default CountVectorizer.
        CLUSTERS_NUMBER: number of clusters to form.

    Prints the shape of the document-term matrix and returns the cluster label of
    every text, in input order.
    """
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(texts)

    print("Vectorizer matrix shape: " + str(matrix.shape))

    # n_init is pinned to the library's long-standing default of 10 restarts so that results do not
    # change (and no FutureWarning is raised) on scikit-learn releases where the default differs.
    model = KMeans(n_clusters=CLUSTERS_NUMBER, random_state=1, n_init=10)
    return model.fit_predict(matrix.toarray())

def get_agglomerative_clusters_map(texts, CLUSTERS_NUMBER):
    """Cluster texts with Ward agglomerative clustering on bag-of-words count vectors.

    Parameters:
        texts: iterable of strings; each is vectorised with a default CountVectorizer.
        CLUSTERS_NUMBER: number of clusters to form.

    Prints the shape of the document-term matrix and returns the cluster label of
    every text, in input order.
    """
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(texts)

    print("Vectorizer matrix shape: " + str(matrix.shape))

    # Ward linkage works on Euclidean distances, which is also the estimator's default, so the
    # `affinity` keyword (not accepted by recent scikit-learn releases) is omitted.
    model = AgglomerativeClustering(n_clusters=CLUSTERS_NUMBER, linkage='ward')
    # The estimator needs a dense array, hence toarray().
    return model.fit_predict(matrix.toarray())

In [10]:
def clusters_as_texts(texts, clusters_map, CLUSTERS_NUMBER):
    """Concatenate the texts of every cluster into one string per cluster.

    `clusters_map[i]` is the cluster index of `texts[i]`. Each text is followed by
    a single space in its cluster's string; clusters without texts yield "".

    Returns a list of `CLUSTERS_NUMBER` strings.
    """
    pieces_per_cluster = [[] for _ in range(CLUSTERS_NUMBER)]

    for position, text in enumerate(texts):
        pieces_per_cluster[clusters_map[position]].append(text + " ")

    return ["".join(pieces) for pieces in pieces_per_cluster]

In [11]:
# Pipeline parameters.
# Maximum number of keywords kept per text by extract_features.
KEYWORDS_VECTOR_DIMENSION = 30
# Number of most frequent words set aside as stop words.
STOP_WORDS_NUM = 50
# Maximum size of the global keyword set.
LEXICON_DIMENSION = 10000
# Minimum log-likelihood score for a word to count as a keyword.
# Presumably the chi-square critical value for p < 0.001 with one degree of freedom — verify.
LL_SCORE_THRESHOLD = 10.83
# Number of clusters to build.
CLUSTERS_NUMBER = 20

# Build the keyword string of every document.
# NOTE(review): the second argument (keywords_vector_dimension) is LEXICON_DIMENSION here, not
# KEYWORDS_VECTOR_DIMENSION, so per-document keyword lists are effectively not truncated — confirm this is intended.
keywords_for_texts = extract_features(texts, LEXICON_DIMENSION, STOP_WORDS_NUM, LEXICON_DIMENSION, LL_SCORE_THRESHOLD)

In [12]:
# Cluster the per-document keyword strings with k-means; the result holds one cluster label per document.
clusters_map = get_kmeans_clusters_map(keywords_for_texts, CLUSTERS_NUMBER)

Vectorizer matrix shape: (7945, 9367)


In [13]:
# Concatenate the normalised texts of each cluster into one long string per cluster.
clusters = clusters_as_texts(texts, clusters_map, CLUSTERS_NUMBER)

In [14]:
# Treat every cluster as a single document and extract up to KEYWORDS_VECTOR_DIMENSION keywords for it.
keywords_for_clusters = extract_features(clusters, KEYWORDS_VECTOR_DIMENSION, 
                                         STOP_WORDS_NUM, LEXICON_DIMENSION, LL_SCORE_THRESHOLD)

# Print the keyword string of each cluster, each followed by an empty line.
for cluster_keywords in keywords_for_clusters:
    print(cluster_keywords + '\n')

susman shaun cameron boxed worked compelled perfectly beforehand lou call clarification reg made convenient empey sid considering timing february helpful felt seeking couple tactic hour half works afternoon event late 

baer lona dan muscatine lissa valmoro speech draft ornament refinement re-working dec agree ornaments 'daniel works getting topple huma tree sun daniel cc careful closer copying plane abedin raise anyone 

lauren jiloty call mashabane aug list update huma abedin cc thu tomorrow august mailto morning sat anytime can com do him ok dep i'm menendez armitage wexler fmr ill me 

room hrc dr hardy office sharon secretary's signs floor appointment guests photographer staff franklin pascual tauscher contact affidavit participants makes administers remarks escort ambassador seasons greet departs merten photos proceeds 

children moyes child trafficking haiti unicef save jillian kids disaster thousands earthquake adoption concern families westportnow atlanta need desperate agenci

In [15]:
def analyze_clusters(clusters_map, clusters_number):
    """Print the size and relative share of every cluster.

    Parameters:
        clusters_map: iterable with one cluster label per document (converted with int()).
        clusters_number: number of clusters; labels index a list of this length.

    One line is printed per cluster: "<cluster id> <document count> <share of all documents>".
    When `clusters_map` is empty every share is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    number_of_documents = 0
    clusters_consistency = [0] * clusters_number

    for cluster in clusters_map:
        clusters_consistency[int(cluster)] += 1
        number_of_documents += 1

    for cluster_id, size in enumerate(clusters_consistency):
        # Guard against an empty input: there is no meaningful share without documents.
        share = size / number_of_documents if number_of_documents else 0.0
        print(str(cluster_id) + ' ' + str(size) + ' ' + str(share))

In [16]:
# Print the size and relative share of every k-means cluster.
analyze_clusters(clusters_map, CLUSTERS_NUMBER)

0 4 0.0005034612964128383
1 4 0.0005034612964128383
2 206 0.02592825676526117
3 2 0.0002517306482064191
4 5 0.0006293266205160479
5 6 0.0007551919446192574
6 17 0.0021397105097545627
7 2 0.0002517306482064191
8 104 0.013089993706733794
9 5 0.0006293266205160479
10 155 0.019509125235997484
11 17 0.0021397105097545627
12 14 0.001762114537444934
13 21 0.0026431718061674008
14 5 0.0006293266205160479
15 31 0.0039018250471994967
16 3 0.0003775959723096287
17 7330 0.9225928256765261
18 12 0.0015103838892385148
19 2 0.0002517306482064191


Можно попытаться использовать какую-либо меру качества, не требующую наличия эталонной кластеризации.

In [17]:
from sklearn.metrics import silhouette_samples
from sklearn.utils import check_random_state

def silhouette_score_for_samples(features, labels, metric='euclidean', sample_size=None, random_state=None, **kwds):
    """Return per-sample silhouette values, optionally on a random subsample.

    Parameters:
        features: feature array, or a pairwise distance matrix when metric == "precomputed".
        labels: array of cluster labels, one per row of `features`.
        metric: metric name forwarded to `silhouette_samples`.
        sample_size: if given, only this many randomly permuted rows are scored.
        random_state: seed or generator passed to `check_random_state` for the subsample.
        **kwds: extra keyword arguments forwarded to `silhouette_samples`.

    Raises:
        ValueError: if the number of distinct labels is not between 2 and n_samples - 1.
    """
    distinct_labels = len(np.unique(labels))
    total_samples = features.shape[0]
    if distinct_labels <= 1 or distinct_labels >= total_samples:
        raise ValueError("Number of labels is %d. Valid values are 2 "
                         "to n_samples - 1 (inclusive)" % distinct_labels)

    if sample_size is not None:
        rng = check_random_state(random_state)
        chosen = rng.permutation(total_samples)[:sample_size]
        if metric == "precomputed":
            # A distance matrix must be subsampled along both axes.
            features = features[chosen].T[chosen].T
        else:
            features = features[chosen]
        labels = labels[chosen]

    return silhouette_samples(features, labels, metric=metric, **kwds)

In [None]:
# Silhouette values are computed from numeric feature vectors, so score the same bag-of-keywords
# representation that was clustered (CountVectorizer over keywords_for_texts) instead of the raw text strings.
features_matrix = CountVectorizer().fit_transform(keywords_for_texts)
samples_scores = silhouette_score_for_samples(features_matrix, np.array(clusters_map), metric='cosine')
# Mean and median silhouette value over all documents.
print(str(np.mean(samples_scores)) + " " + str(np.median(samples_scores)))

In [None]:
from matplotlib import pyplot as plt

def bar_chart_for_samples_scores(samples_scores):
    """Draw a 50-bin histogram of the given per-sample silhouette scores and show it."""
    sample_count = len(samples_scores)

    # Start from an empty figure and label it.
    plt.clf()
    plt.title("Distribution of silhouette scores for samples")
    plt.xlabel("Silhouette scores ")
    plt.ylabel("Samples number")

    # Fixed viewport: x from -1 to 1, y from 0 to the number of samples.
    plt.axis([-1, 1, 0, sample_count])

    plt.hist(samples_scores, bins=50)

    plt.show()

# Plot the distribution of the per-sample silhouette scores held in samples_scores.
bar_chart_for_samples_scores(samples_scores)

Для случайной выборки текстов из корпуса каждый асессор может определять долю текстов, не соответствующих автоматически составленному описанию кластера в виде ключевых слов с точки зрения того, как он склонен его интерпретировать. Итоговое значение ошибки необходимо подсчитывать для группы таких асессоров.