# Exhibitions

In [1]:
import os
from collections import Counter, defaultdict
from pprint import pprint
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
%%time

exhibitions_df = pd.read_csv('data/out/sp_cleaned_exhibitions.csv')
exhibitions_df.fillna('', inplace=True)

FileNotFoundError: File b'data/out/sp_cleaned_exhibitions.csv' does not exist

In [3]:
exhibitions_df.shape

(37420, 26)

In [4]:
exhibitions_df.head()

Unnamed: 0,ID,post_type,post_title,place_t,place_r,place_c,start_y,start_m,start_d,end_y,...,xplace_t,xplace_r,xplace_c,xstart_y,xstart_m,xstart_d,xend_y,xend_m,xend_d,xgender
0,67660,exposición,latifa echakhch ronda,Barcelona,Cataluña,España,2010,7.0,6.0,2011,...,España,España,España,2000.0,1.0,1.0,,,,Masculino
1,67660,exposición,latifa echakhch ronda,Barcelona,Cataluña,España,2010,7.0,6.0,2011,...,Marruecos,Marruecos,Marruecos,1974.0,1.0,1.0,,,,Femenino
2,67660,exposición,latifa echakhch ronda,Barcelona,Cataluña,España,2010,7.0,6.0,2011,...,Barcelona,Cataluña,España,,,,,,,NO APLICA
3,67660,exposición,latifa echakhch ronda,Barcelona,Cataluña,España,2010,7.0,6.0,2011,...,Barcelona,Cataluña,España,,,,,,,NO APLICA
4,68933,exposición,pep duran cadena acontecimientos,Barcelona,Cataluña,España,2011,3.0,3.0,2011,...,España,España,España,1955.0,1.0,1.0,,,,Masculino


## Topics

In [5]:
exhibits_df = exhibitions_df.drop_duplicates('ID')

In [6]:
exhibits_df.ID.nunique()

4461

In [7]:
exhibits_df.start_y.value_counts()

2015    1380
2016     969
2014     579
2013     439
2012     384
2011     370
2010     340
Name: start_y, dtype: int64

In [8]:
texts_sr = exhibits_df.groupby('start_y').post_title.sum()

In [9]:
for i, text in texts_sr.items():
    print(i, len(text.split()))

2010 1147
2011 1222
2012 1315
2013 1465
2014 1835
2015 3987
2016 2709


In [10]:
# texts_sr = texts_sr[texts_sr.apply(lambda x: len(x.split()) >= ??)] # Selecting all

In [11]:
texts_sr.index

Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016], dtype='int64', name='start_y')

In [12]:
documents = list(texts_sr)

In [13]:
len(documents)

7

In [14]:
texts = [[word for word in document.split()] for document in documents]

In [15]:
len(texts)

7

In [16]:
def load_ini_dictionary(texts):
    filename = 'data/dictionaries/dictionary.dict'
    if os.path.exists(filename):
        dictionary = corpora.Dictionary.load(filename)
    else:
        dictionary = corpora.Dictionary(texts)
        dictionary.save(filename)
    return dictionary

In [17]:
def load_param_dictionary(dictionary, MIN_DFs, MAX_DFs, MAX_FTs):
    filename = 'data/dictionaries/dictionary_{}_{}_{}.dict'.format(MIN_DFs, MAX_DFs, MAX_FTs)
    if os.path.exists(filename):
        dictionary = corpora.Dictionary.load(filename)
    else:
        dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF, keep_n=MAX_FT, keep_tokens=['picasso'])
        dictionary.save(filename)
    return dictionary

In [18]:
dictionary = load_ini_dictionary(texts)

In [19]:
len(dictionary)

7005

In [20]:
%%time

# lsi_topics_lists = []
lda_topics_lists = []
hdp_topics_lists = []

MIN_DFs = [1, 2, 3, 4]
MAX_DFs = [0.85]
MAX_FTs = [100, 1000, len(dictionary)]
N_TOPICSs = [2, 5, 10, len(texts), 50]
N_WORDS = 10

total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * (len(N_TOPICSs) + 1)
iteration = 1
for MIN_DF in MIN_DFs:
    for MAX_DF in MAX_DFs:
        for MAX_FT in MAX_FTs:
            dictionary = load_ini_dictionary(texts)
            dictionary = load_param_dictionary(dictionary, MIN_DF, MAX_DF, MAX_FT)
            corpus = [dictionary.doc2bow(text) for text in texts]
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            
            # HDP
            hdp = models.HdpModel(corpus, id2word=dictionary, random_state=0)
            corpus_hdp = hdp[corpus]
            ts = hdp.show_topics(-1, num_words=N_WORDS, formatted=False)
            result = (ts, (MIN_DF, MAX_DF, MAX_FT, -1), corpus_hdp)
            hdp_topics_lists.append(result)
                        
            topics_set = set(topic for doc in corpus_hdp for topic, weight in doc)
            hdp_n_topics = len(topics_set)
            
            n_topics_aux = set(N_TOPICSs).union([hdp_n_topics])
            for N_TOPICS in n_topics_aux:
                print('Iteration: {} / ~{}'.format(iteration, total_iterations), \
                      '- MIN_DF:', MIN_DF, '- MAX_DF:', MAX_DF, '- MAX_FT:', MAX_FT, '- N_TOPICS:', N_TOPICS)
                iteration += 1
                
                # LSI
#                 lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
#                 corpus_lsi = lsi[corpus_tfidf]
#                 ts = lsi.show_topics(lsi.num_topics, formatted=False)
#                 result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lsi)
#                 lsi_topics_lists.append(result)
            
                # LDA
                lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=0)
                corpus_lda = lda[corpus]
                ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lda)
                lda_topics_lists.append(result)
            
                

Iteration: 1 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 2
Iteration: 2 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 5
Iteration: 3 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 7
Iteration: 4 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 5 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 16
Iteration: 6 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 100 - N_TOPICS: 50
Iteration: 7 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 1000 - N_TOPICS: 2
Iteration: 8 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 9 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 1000 - N_TOPICS: 50
Iteration: 10 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 1000 - N_TOPICS: 5
Iteration: 11 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 1000 - N_TOPICS: 7
Iteration: 12 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 7005 - N_TOPICS: 2
Iteration: 13 / ~72 - MIN_DF: 1 - MAX_DF: 0.85 - MAX_FT: 7005 - N_TOPICS: 10
Iteration: 14 / ~72 -

In [21]:
# %%time

# # lsi_topics_lists = []
# lda_topics_lists = []
# hdp_topics_lists = []

# MIN_DFs = [1, 2]
# MAX_DFs = [0.9, 1.0]
# MAX_FTs = [100, 1000, len(dictionary)]
# N_TOPICSs = [2, 5, 10, len(texts), 50]
# N_WORDS = 10

# total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * (len(N_TOPICSs) + 1)
# iteration = 1

# dictionary = load_dictionary(texts)
# dictionary.filter_extremes(no_below=2, no_above=0.9, keep_n=8000)
# corpus = [dictionary.doc2bow(text) for text in texts]
# tfidf = models.TfidfModel(corpus)
# corpus_tfidf = tfidf[corpus]

# # HDP
# hdp = models.HdpModel(corpus, id2word=dictionary, random_state=0)
# corpus_hdp = hdp[corpus]
# ts = hdp.show_topics(-1, num_words=N_WORDS, formatted=False)
# result = (ts, (0, 0, 0, -1), corpus_hdp)
# hdp_topics_lists.append(result)


# n_topics_aux = N_TOPICSs# + [hdp_n_topics]
# for N_TOPICS in n_topics_aux:
#     print('Iteration: {} / {}'.format(iteration, total_iterations), \
#           '- MIN_DF:', 0, '- MAX_DF:', 0, '- MAX_FT:', 0, '- N_TOPICS:', N_TOPICS)
#     iteration += 1


#     # LDA
#     lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=0)
#     corpus_lda = lda[corpus]
#     ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
#     result = (ts, (0, 0, 0, N_TOPICS), corpus_lda)
#     lda_topics_lists.append(result)

In [22]:
def extract_vocabulary(ts):
    vocab = set()
    for t in ts:
        words = [word for word, weight in t]
        vocab = vocab.union(words)
    vocab = sorted(vocab)
    r = {w: i for i, w in enumerate(vocab)}
    return r

def get_weights(t):
    return dict(t)

def create_vector(vocab, weights):
    r = [0] * len(vocab)
    for wo in weights:
        r[vocab[wo]] = weights[wo]
    return r

def vectorize(ts):
    r = []
    ts = [t for i, j, t in ts]
    vocab = extract_vocabulary(ts)
    for t in ts:
        weights = get_weights(t)
        vector = create_vector(vocab, weights)
        r.append(vector)
    return r

In [23]:
def two_topics_distance(t1, t2):
    r = spatial.distance.cosine(t1, t2)
    return r

def multiple_topics_distance(ts):
    ds = []
    for i, t1 in enumerate(ts[:-1]):
        for j in range(i + 1, len(ts)):
            t2 = ts[j]
            d = two_topics_distance(t1, t2)
            ds.append(d)
    return np.mean(ds)

def transform(ts_list): # formerly top_n_used_topics
    total_weights = defaultdict(float)
    corpus = ts_list[2]
    for doc in corpus:
        for topic, weight in doc:
            total_weights[topic] += abs(weight)
    total_weights = dict(total_weights)
    counter = Counter(total_weights)
    topics_dct = dict(ts_list[0])
    r = ([(weight, topic_id, topics_dct[topic_id])
             for topic_id, weight in counter.most_common()], ts_list[1], ts_list[2])
    return r

def best_topics(ts_lists):
    r = []
    for ts_list in ts_lists:
        top_n_ts_list = transform(ts_list)
        ts_vector = vectorize(top_n_ts_list[0])
        d = multiple_topics_distance(ts_vector)
        r.append((d, top_n_ts_list))
    r = sorted(r, reverse=True)
    return r[0]

In [24]:
%%time

# print('LSI')
# lsi_best_topics_list = best_topics(lsi_topics_lists)
print('LDA')
lda_best_topics_list = best_topics(lda_topics_lists)
print('HDP')
hdp_best_topics_list = best_topics(hdp_topics_lists)

LDA
HDP
CPU times: user 8.24 s, sys: 7.71 s, total: 15.9 s
Wall time: 2.41 s


## Results

### LSI

In [25]:
# for i, doc in enumerate(lsi_best_topics_list[1][2]):
#     print(texts_sr.index[i], doc)

In [26]:
# lsi_best_topics_list[1][0]

### LDA

In [27]:
for i, doc in enumerate(lda_best_topics_list[1][2]):
    print(texts_sr.index[i], doc)

2010 [(40, 0.9947594)]
2011 [(33, 0.9948959)]
2012 [(23, 0.7842242), (32, 0.21182513)]
2013 [(1, 0.9964748)]
2014 [(32, 0.9967114)]
2015 [(32, 0.99840915)]
2016 [(32, 0.84612715), (40, 0.15140496)]


In [28]:
lda_best_topics_list[1][1]

(4, 0.85, 7005, 50)

In [29]:
lda_best_topics_list[1][0]

[(3.0530900359153748,
  32,
  [('picasso', 0.042111713),
   ('mujeres', 0.019770708),
   ('vida', 0.015750268),
   ('casa', 0.0110806795),
   ('teresa', 0.010485754),
   ('jesús', 0.009898915),
   ('certamen', 0.0092140315),
   ('siglos', 0.0091078915),
   ('visiones', 0.008784279),
   ('imagen', 0.007793367)]),
 (1.146168440580368,
  40,
  [('picasso', 0.064249195),
   ('carlos', 0.01334744),
   ('nuevo', 0.01216151),
   ('nacional', 0.010964673),
   ('línea', 0.010742326),
   ('retrato', 0.009919678),
   ('renacimiento', 0.009820684),
   ('real', 0.00979706),
   ('lleida', 0.00967091),
   ('xxi', 0.008766832)]),
 (0.9964748024940491,
  1,
  [('picasso', 0.04758247),
   ('vida', 0.020572735),
   ('moda', 0.015070789),
   ('retorno', 0.014542604),
   ('blanco', 0.014354866),
   ('eterno', 0.01361901),
   ('contemporáneas', 0.010970152),
   ('casa', 0.009521288),
   ('objetos', 0.009255764),
   ('visiones', 0.009074654)]),
 (0.994895875453949,
  33,
  [('picasso', 0.03883271),
   ('mart

### HDP

In [30]:
for i, doc in enumerate(hdp_best_topics_list[1][2]):
    print(texts_sr.index[i], doc)

2010 [(0, 0.26917055298939785), (3, 0.3981342024210884), (7, 0.149032337077542), (85, 0.12071013724633103), (98, 0.0574830586916904)]
2011 [(1, 0.07083517311593109), (7, 0.9190842025434731)]
2012 [(0, 0.20985017394290115), (1, 0.5247006088917318), (88, 0.12511310112843976), (125, 0.08811896996589506), (149, 0.048733999805026025)]
2013 [(0, 0.2107894995525091), (5, 0.784059976564012)]
2014 [(0, 0.2473898858108832), (4, 0.7479843167620706)]
2015 [(0, 0.950380969233341), (127, 0.04723218544778237)]
2016 [(2, 0.7582413105016008), (61, 0.05197206011844201), (66, 0.10090265701580343), (79, 0.08120569678845073)]


In [31]:
hdp_best_topics_list[1][1]

(4, 0.85, 100, -1)

In [33]:
len(hdp_best_topics_list[1][0])

16

In [32]:
hdp_best_topics_list[1][0]

[(1.8875810815290324,
  0,
  [('picasso', 0.037173181513935935),
   ('vida', 0.03256538606233612),
   ('jesús', 0.0314940393552541),
   ('poder', 0.02393581979946046),
   ('lugar', 0.021827229613317567),
   ('mujeres', 0.02168088108026971),
   ('hoy', 0.021252625814563206),
   ('xxi', 0.020447885782844046),
   ('nuevas', 0.019032313205696323),
   ('imagen', 0.01891285392963105)]),
 (1.068116539621015,
  7,
  [('picasso', 0.044217519452148545),
   ('frente', 0.03856239878182732),
   ('col', 0.031586817210945736),
   ('europa', 0.024859892592155697),
   ('sombra', 0.023521600253759854),
   ('catalunya', 0.02348389338308434),
   ('fernández', 0.02337268637139683),
   ('martín', 0.023182949440235124),
   ('nuevas', 0.022791608216824466),
   ('agua', 0.022463085622335368)]),
 (0.784059976564012,
  5,
  [('universo', 0.04869509542085452),
   ('diálogos', 0.04524232305941078),
   ('picasso', 0.04173491908754539),
   ('frente', 0.03195686655783368),
   ('certamen', 0.026327952142943034),
   ('