In [1]:
import os, re

# Folder containing all papers.
data_dir = 'visionscarto-backup-textes/'  # Set this path to the data on your machine.

# Get all document texts and their corresponding IDs.
docs = []
doc_ids = []
author2doc = dict()

i=0

# List of filenames + authors
for filen in os.listdir(data_dir):
    idx = re.search('\d+', filen)
    if idx:
        id = i
        i += 1
        doc_ids.append(id)
        idx = int(idx.group(0))
        with open(data_dir + filen, errors='ignore', encoding='utf-8') as fid:
            txt = fid.read()
            docs.append(txt)
            authors = re.search('\nAUTEUR:\s+\n((?:- (?:[^\n]*)\n)+)', txt)
            if authors:
                authors = re.findall('\n- ([^\n]*)', authors.group())
                for author in authors:
                    if not author2doc.get(author):
                        author2doc[author] = [id]
                    else:
                        author2doc[author].append(id)

print (len(docs), 'docs,', len(author2doc), 'authors')

330 docs, 87 authors


In [2]:
import spacy
nlp = spacy.load('en')

In [3]:
%%time
processed_docs = []    
for doc in nlp.pipe(docs, n_threads=4, batch_size=100):
    # Process document using Spacy NLP pipeline.
    
    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list.
    #doc = [token for token in doc if token not in STOPWORDS]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_docs.append(doc)
    
docs = processed_docs
del processed_docs

CPU times: user 36.1 s, sys: 2.63 s, total: 38.7 s
Wall time: 42.8 s


In [4]:
# Compute bigrams.
from gensim.models import Phrases
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [5]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.

from gensim.corpora import Dictionary
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.1
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

In [6]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [7]:
print('Number of authors: %d' % len(author2doc))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of authors: 87
Number of unique tokens: 545
Number of documents: 330


In [8]:
num_topics = 5
from gensim.models import AuthorTopicModel
%time model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

CPU times: user 102 ms, sys: 7.06 ms, total: 109 ms
Wall time: 127 ms


In [9]:
%%time
model_list = []
for i in range(5):
    model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                    author2doc=author2doc, chunksize=2000, passes=200, gamma_threshold=1e-10, \
                    eval_every=0, iterations=1, random_state=i)
    top_topics = model.top_topics(corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((model, tc))

CPU times: user 1min 7s, sys: 895 ms, total: 1min 8s
Wall time: 1min 13s


In [10]:
model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' %tc)

Topic coherence: -1.100e+03


In [11]:
# Save model.
model.save('/tmp/visionscarto.atmodel')

In [12]:
# Load model.
from gensim.models import AuthorTopicModel
model = AuthorTopicModel.load('/tmp/visionscarto.atmodel')

In [13]:
model.show_topic(0)

[('soviétique', 0.0093372902025256246),
 ('maritime', 0.0080782878343003389),
 ('film', 0.0078367748881924028),
 ('asie', 0.0070239092613826534),
 ('internet', 0.0069849015019235617),
 ('méditerranée', 0.0069346218497508315),
 ('code', 0.0061949415742276245),
 ('atlas', 0.0060618637359574146),
 ('réseau', 0.0060449879285346623),
 ('del', 0.005627781126895777)]

In [14]:
for topic in model.show_topics(num_topics=num_topics):
    words = ''
    for word, prob in model.show_topic(topic[0]):
        words += word + ' '
    print(':: ' + words)

:: soviétique maritime film asie internet méditerranée code atlas réseau del 
:: terres agricoles armes militaires loi communautés militaire conflit processus membres 
:: e del mur réfugiés la_police turquie cristina homme suisse cristina_del 
:: société îles dimension quartiers capitale la_société géographe relation paysage distance 
:: code port camp york new_york mon suis world je_suis retour 


In [15]:
from pprint import pprint

def show_author(name):
    print('\n%s' % name)
    print('Docs:', model.author2doc[name])
    print('Topics:')
    pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [16]:
show_author("Philippe Rivière")


Philippe Rivière
Docs: [15, 48, 64, 67, 69, 70, 81, 89, 95, 98, 99, 103, 104, 105, 125, 140, 154, 156, 172, 173, 189, 244, 256, 260, 261, 265, 266, 267, 276, 280, 281, 284]
Topics:


NameError: name 'topic_labels' is not defined

In [17]:
from gensim.models import atmodel
doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [18]:
# Compute the per-word bound.
# Number of words in corpus.
corpus_words = sum(cnt for document in model.corpus for _, cnt in document)

# Compute bound and divide by number of words.
perwordbound = model.bound(model.corpus, author2doc=model.author2doc, \
                           doc2author=model.doc2author) / corpus_words
print(perwordbound)

ZeroDivisionError: float division by zero

In [19]:
%time top_topics = model.top_topics(model.corpus)

CPU times: user 211 ms, sys: 2.33 ms, total: 214 ms
Wall time: 216 ms


In [28]:
%%time
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with documents less than this.
authors = [model.author2id[a] for a in model.author2id.keys() if len(model.author2doc[a]) >= smallest_author]
_ = tsne.fit_transform(model.state.gamma[authors, :])  # Result stored in tsne.embedding_

CPU times: user 343 ms, sys: 22.3 ms, total: 365 ms
Wall time: 347 ms


In [29]:
# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook
output_notebook()

In [30]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource
import math

x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
author_names = [model.id2author[a] for a in authors]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 3
author_sizes = [len(model.author2doc[a]) for a in author_names]
radii = [math.sqrt(size) * scale for size in author_sizes]

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=author_names,
            author_sizes=author_sizes,
            radii=radii,
        )
    )

# Add author names and sizes to mouse-over info.
hover = HoverTool(
        tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
        ]
    )

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [23]:
from gensim.similarities import MatrixSimilarity

# Generate a similarity object for the transformed corpus.
index = MatrixSimilarity(model[list(model.id2author.values())])

# Get similarities to some author.
author_name = 'Philippe Rivière'
sims = index[model[author_name]]

In [24]:
# Make a function that returns similarities based on the Hellinger distance.

from gensim import matutils
import pandas as pd

# Make a list of all the author-topic distributions.
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

def similarity(vec1, vec2):
    '''Get similarity between two vectors'''
    dist = matutils.hellinger(matutils.sparse2full(vec1, model.num_topics), \
                              matutils.sparse2full(vec2, model.num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec):
    '''Get similarity of vector to all authors.'''
    sims = [similarity(vec, vec2) for vec2 in author_vecs]
    return sims

def get_table(name, top_n=10, smallest_author=1):
    '''
    Get table with similarities, author names, and author sizes.
    Return `top_n` authors as a dataframe.
    
    '''
    
    # Get similarities.
    sims = get_sims(model.get_author_topics(name))

    # Arrange author names, similarities, and author sizes in a list of tuples.
    table = []
    for elem in enumerate(sims):
        author_name = model.id2author[elem[0]]
        sim = elem[1]
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))
            
    # Make dataframe and retrieve top authors.
    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    df = df.sort_values('Score', ascending=False)[:top_n]
    
    return df

In [31]:
get_table('Philippe Rivière')

Unnamed: 0,Author,Score,Size
68,Philippe Rivière,1.0,32
67,Philippe Rekacewicz,0.999776,183
56,Nicky Hager,0.874938,1
60,Oskar Rekacewicz,0.868715,4
26,Dominique Vidal,0.834706,1
14,Bruno Bergot,0.776194,1
28,Elizabeth Rajasingh,0.767883,2
55,Mycle Schneider,0.723519,1
40,Julie Hazemann,0.723519,1
24,Dan McCarey,0.7022,3


In [26]:
model.show_topics()

[(0,
  '0.009*"soviétique" + 0.008*"maritime" + 0.008*"film" + 0.007*"asie" + 0.007*"internet" + 0.007*"méditerranée" + 0.006*"code" + 0.006*"atlas" + 0.006*"réseau" + 0.006*"del"'),
 (1,
  '0.061*"terres" + 0.015*"agricoles" + 0.015*"armes" + 0.012*"militaires" + 0.011*"loi" + 0.010*"communautés" + 0.010*"militaire" + 0.009*"conflit" + 0.008*"processus" + 0.008*"membres"'),
 (2,
  '0.045*"e" + 0.037*"del" + 0.030*"mur" + 0.022*"réfugiés" + 0.018*"la_police" + 0.018*"turquie" + 0.014*"cristina" + 0.014*"homme" + 0.014*"suisse" + 0.014*"cristina_del"'),
 (3,
  '0.012*"société" + 0.011*"îles" + 0.010*"dimension" + 0.010*"quartiers" + 0.010*"capitale" + 0.009*"la_société" + 0.009*"géographe" + 0.009*"relation" + 0.008*"paysage" + 0.008*"distance"'),
 (4,
  '0.038*"code" + 0.032*"port" + 0.015*"camp" + 0.014*"york" + 0.014*"new_york" + 0.012*"mon" + 0.012*"suis" + 0.011*"world" + 0.011*"je_suis" + 0.010*"retour"')]

In [32]:
for topic in model.show_topics():
    print (', ' . join("{}".format(k) for k,_ in model.show_topic(topic[0])[:8]))

soviétique, maritime, film, asie, internet, méditerranée, code, atlas
terres, agricoles, armes, militaires, loi, communautés, militaire, conflit
e, del, mur, réfugiés, la_police, turquie, cristina, homme
société, îles, dimension, quartiers, capitale, la_société, géographe, relation
code, port, camp, york, new_york, mon, suis, world
