### Imports

In [2]:
import re
import numpy as np
import pandas as pd
import escritoras_latinas.data.load as load
import escritoras_latinas.data.process as process
"""
Natural Language Processing
"""
# NLTK - Sentence Tokenizer
from nltk.tokenize import sent_tokenize
# GSDMM - Topic Modeling
from gsdmm import MovieGroupProcess

### Load data

In [3]:
data_raw = load.data_raw
data_processed = load.data_processed
data_entries = load.data_entries

### Read data

In [4]:
# Read 'csv' file as dataframe
df = pd.read_csv(f'{data_entries}')

# Show shape of dataframe
df.shape

(1116, 1)

In [5]:
# Convert dataframe to list
entries = df.values.tolist()
# Convert to item on list
entries = [str(entry) for entry in entries]
# Clean list content
entries = [re.sub(r'(^.*?\))', '', entry) for entry in entries]

### Process data with NLP (Natural Language Processing)


In [6]:
# Initialize a MovieGroupProcess
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=30)


# Define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    top_words = []
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        top_words.append(sort_dicts)
        print("\nCluster %s: %s"%(cluster, sort_dicts))
    return top_words

In [7]:
docs_list = []
vocab_list = []
for i in range(len(entries)):
    # Tokenize text into sentences
    sents = sent_tokenize(entries[i])
    # Tokenize sentences into words
    data = list(process.sent_to_words(sents))
    # Remove stopwords in Spanish
    docs = process.remove_stopwords(data)
    docs_list.append(docs)
    # Filter unique words
    vocab = set(x for doc in docs for x in doc)
    vocab_list.append(vocab)

In [8]:
clusters = []
for i in range(len(entries)):
    # Fit the model
    y = mgp.fit(docs_list[i], len(vocab_list[i]))

    # Number of documents per topic
    doc_count = np.array(mgp.cluster_doc_count)
    # Topics sorted by the number of document they are allocated to
    top_index = doc_count.argsort()[-15:][::-1]

    # Get top words in topics
    topwords = top_words(mgp.cluster_word_distribution, top_index, 3)
    # Filter first element from tuples
    # for sublist in topwords:
    #     clusters.append(list(map(lambda x: x[0], sublist)))
    clusters.append(topwords)


In stage 0: transferred 17 clusters with 5 clusters populated
In stage 1: transferred 5 clusters with 5 clusters populated
In stage 2: transferred 7 clusters with 4 clusters populated
In stage 3: transferred 9 clusters with 5 clusters populated
In stage 4: transferred 8 clusters with 4 clusters populated
In stage 5: transferred 8 clusters with 5 clusters populated
In stage 6: transferred 8 clusters with 5 clusters populated
In stage 7: transferred 8 clusters with 5 clusters populated
In stage 8: transferred 6 clusters with 5 clusters populated
In stage 9: transferred 5 clusters with 5 clusters populated
In stage 10: transferred 4 clusters with 5 clusters populated
In stage 11: transferred 2 clusters with 5 clusters populated
In stage 12: transferred 6 clusters with 5 clusters populated
In stage 13: transferred 5 clusters with 5 clusters populated
In stage 14: transferred 4 clusters with 5 clusters populated
In stage 15: transferred 4 clusters with 5 clusters populated
In stage 16: tran

In [24]:
# Create empty list
clusters = []
topics = []

for i in range(5):
    # Tokenize text into sentences
    sents = sent_tokenize(entries[i])
    # Tokenize sentences into words
    data = list(process.sent_to_words(sents))
    # Remove stopwords in Spanish
    docs = process.remove_stopwords(data)
    # Filter unique words
    vocab = set(x for doc in docs for x in doc)

    # Fit the model
    y = mgp.fit(docs, len(vocab))

    # Number of documents per topic
    doc_count = np.array(mgp.cluster_doc_count)
    # Topics sorted by the number of document they are allocated to
    top_index = doc_count.argsort()[-15:][::-1]

    # Get top words in topics
    topwords = top_words(mgp.cluster_word_distribution, top_index, 5)

    # Filter first element from tuples
    for sublist in topwords:
        clusters.append(list(map(lambda x: x[0], sublist)))
    # Flatten list of lists
    flat_clusters = [item for sublist in clusters for item in sublist]

    topics.append(flat_clusters)
    clusters.clear()


In stage 0: transferred 15 clusters with 5 clusters populated
In stage 1: transferred 7 clusters with 5 clusters populated
In stage 2: transferred 6 clusters with 5 clusters populated
In stage 3: transferred 4 clusters with 5 clusters populated
In stage 4: transferred 4 clusters with 5 clusters populated
In stage 5: transferred 6 clusters with 5 clusters populated
In stage 6: transferred 6 clusters with 5 clusters populated
In stage 7: transferred 4 clusters with 5 clusters populated
In stage 8: transferred 1 clusters with 5 clusters populated
In stage 9: transferred 4 clusters with 5 clusters populated
In stage 10: transferred 5 clusters with 5 clusters populated
In stage 11: transferred 4 clusters with 5 clusters populated
In stage 12: transferred 6 clusters with 5 clusters populated
In stage 13: transferred 6 clusters with 5 clusters populated
In stage 14: transferred 5 clusters with 5 clusters populated
In stage 15: transferred 5 clusters with 5 clusters populated
In stage 16: tran

### Training Topic Modeling with `GSDMM` (Gibbs Sampling algorithm for the Dirichlet Multinomial Mixture)


In [95]:

text = re.sub(r'(^.*?\))', '', entries[0])

# Tokenize text into sentences
sents = sent_tokenize(text)
# Tokenize sentences into words
data = list(process.sent_to_words(sents))
# Remove stopwords in Spanish
docs = process.remove_stopwords(data)
# Filter unique words
vocab = set(x for doc in docs for x in doc)


In [96]:
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=30)
y = mgp.fit(docs, len(vocab))

In stage 0: transferred 14 clusters with 4 clusters populated
In stage 1: transferred 7 clusters with 5 clusters populated
In stage 2: transferred 3 clusters with 4 clusters populated
In stage 3: transferred 7 clusters with 4 clusters populated
In stage 4: transferred 7 clusters with 5 clusters populated
In stage 5: transferred 7 clusters with 5 clusters populated
In stage 6: transferred 8 clusters with 5 clusters populated
In stage 7: transferred 7 clusters with 5 clusters populated
In stage 8: transferred 8 clusters with 5 clusters populated
In stage 9: transferred 5 clusters with 5 clusters populated
In stage 10: transferred 8 clusters with 5 clusters populated
In stage 11: transferred 5 clusters with 5 clusters populated
In stage 12: transferred 5 clusters with 5 clusters populated
In stage 13: transferred 7 clusters with 5 clusters populated
In stage 14: transferred 6 clusters with 5 clusters populated
In stage 15: transferred 9 clusters with 4 clusters populated
In stage 16: tran

In [98]:
# Define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    top_words = []
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        top_words.append(sort_dicts)
        print("\nCluster %s: %s"%(cluster, sort_dicts))
    return top_words
# Number of documents per topic
doc_count = np.array(mgp.cluster_doc_count)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]

# Get top words in topics
top_words = top_words(mgp.cluster_word_distribution, top_index, 3)


Cluster 1: [('mujeres', 3), ('poetas', 3), ('país', 3)]

Cluster 4: [('poesía', 3), ('casa', 2), ('nacional', 2)]

Cluster 0: [('antología', 5), ('asociación', 4), ('escritores', 4)]

Cluster 2: [('internacional', 3), ('poesía', 3), ('feria', 2)]

Cluster 3: []


In [101]:
clusters = []
for sublist in top_words:
    clusters.append(list(map(lambda x: x[0], sublist)))
clusters = [item for sublist in clusters for item in sublist]
clusters

['mujeres',
 'poetas',
 'país',
 'poesía',
 'casa',
 'nacional',
 'antología',
 'asociación',
 'escritores',
 'internacional',
 'poesía',
 'feria']

### Bibliography:
- [GSDMM: Short text clustering](https://github.com/rwalk/gsdmm)