In [90]:
import nltk
# Fetch the NLTK "stopwords" corpus so nltk.corpus.stopwords can be read later;
# the call reports (and returns True) when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/ddx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
import numpy as np
import pandas as pd

import json
import glob


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser

import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

import warnings
# Silence DeprecationWarning messages for the whole notebook session.
warnings.filterwarnings('ignore', category=DeprecationWarning)



In [92]:
def load_data(file):
    """Return the object decoded from the UTF-8 JSON document at ``file``."""
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into the UTF-8 file ``file``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

In [93]:
# Import the corpus reader under its own alias before rebinding `stopwords`
# to the plain word list. Without the alias, the assignment below overwrites
# the reader and a second run of this cell fails with AttributeError on
# `.words`; with it, the cell can be re-run safely.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words("english")
# stopwords

In [94]:
# Read the 'Description' column of data.csv and remove the literal
# 'About the Creative City:' boilerplate (plain-text match, not a regex).
data = (
    pd.read_csv('data.csv')['Description']
    .str.replace('About the Creative City:', '', regex=False)
)

data.head()

0    Situated in a picturesque valley at the foothi...
1     The city of Gangneung is developing an inclus...
2    Casablanca is a major centre for media arts an...
3     Valencia emphasizes the essential role of des...
4     Veliky Novgorod Creative City has made signif...
Name: Description, dtype: object

In [95]:
# Inspect the first 90 characters of entry 5 to see what the raw text looks like
# (the recorded output shows a leading non-breaking space, '\xa0').
data[5][0:90]

'\xa0Ouarzazate is considered to have a special place in the minds of film lovers. As one of t'

In [96]:
# Scratch check: measure a sample sentence that starts with a non-breaking
# space ('\xa0'); the escape is a single character and is counted by len().
text = "\xa0Da Lat's economic growth is driven by its creative sector."
length = len(text)
print(length)

59


In [97]:
# !python3 -m spacy download en_core_web_sm
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB"]):
    """Reduce each text to the space-joined lemmas of its allowed-POS tokens.

    Loads the spaCy ``en_core_web_sm`` pipeline (with the ``parser`` and
    ``ner`` components disabled) once per call, runs it on every item of
    ``texts`` and keeps only tokens whose ``pos_`` is in ``allowed_postags``.

    Parameters:
        texts: iterable of strings to process.
        allowed_postags: part-of-speech tags of the tokens to keep.

    Returns:
        A list with one string per input text, in input order.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def _kept_lemmas(raw_text):
        # Lemmas of the tokens that survive the part-of-speech filter.
        return " ".join(
            token.lemma_ for token in nlp(raw_text) if token.pos_ in allowed_postags
        )

    return [_kept_lemmas(raw_text) for raw_text in texts]


# Lemmatize every description, then preview the start of the first result.
lemmatized_texts = lemmatization(data)
print(lemmatized_texts[0][:90])

situate picturesque valley foothill meeting point ancient civilization unique location inf


In [98]:
def gen_words(texts):
    """Tokenize each text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize the lemmatized texts and preview the first 20 tokens of document 0.
data_words = gen_words(lemmatized_texts)

print(data_words[0][:20])

['situate', 'picturesque', 'valley', 'foothill', 'meeting', 'point', 'ancient', 'civilization', 'unique', 'location', 'influence', 'city', 'design', 'offer', 'harmonious', 'blend', 'modern', 'technology', 'architectural', 'tradition']


In [99]:
# Map every token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
id2word = corpora.Dictionary(data_words)

corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in data_words]

print(corpus[0][:20])

# Look up the token stored under id 0 as a sanity check of the mapping.
word = id2word[0]
print(word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 4)]
active


In [100]:
# Fit a 10-topic LDA model on the bag-of-words corpus; random_state is fixed
# so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [101]:
# Render the fitted topic model inline with pyLDAvis.
pyLDAvis.enable_notebook()

vis = gensimvis.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=10,
    n_jobs=1,
)
vis


In [108]:
# BIGRAMS AND TRIGRAMS
# The bigram model is trained on the raw token lists; the trigram model is
# trained on those same lists after the bigram model has been applied.
bigrams_phrases = Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = Phrases(bigrams_phrases[data_words], threshold=50)

# Creating phrasers for efficiency
bigram = Phraser(bigrams_phrases)
trigram = Phraser(trigram_phrases)

# Functions to create bigrams and trigrams
def make_bigrams(texts):
    """Apply the bigram phraser to every token list in ``texts``."""
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every token list.

    ``texts`` must hold the raw token lists: the bigram step happens here, so
    callers should not pass documents that already went through make_bigrams().
    """
    return [trigram[bigram[doc]] for doc in texts]

# Applying the bigram and trigram models
data_bigrams = make_bigrams(data_words)
# make_trigrams() performs the bigram pass itself, so it is fed the raw tokens;
# passing data_bigrams would run the bigram phraser a second time.
data_bigrams_trigrams = make_trigrams(data_words)

# Printing the first document's bigrams/trigrams
print(data_bigrams_trigrams[0])


['situate', 'picturesque', 'valley', 'foothill', 'meeting_point', 'ancient', 'civilization', 'unique', 'location', 'influence', 'city', 'design', 'offer', 'harmonious', 'blend', 'modern', 'technology', 'architectural', 'tradition', 'city', 'centre', 'design', 'renowned', 'innovative', 'project', 'field', 'design', 'public_private', 'sector', 'professional', 'designer', 'play', 'active', 'role', 'today', 'design', 'industry', 'encompass', 'enterprise', 'employ', 'skilled', 'individual', 'showcase', 'commitment', 'urban', 'design', 'host', 'exhibition', 'gather', 'lead', 'local', 'national', 'international', 'designer', 'follower', 'project', 'implement', 'strengthen', 'tie', 'graduate', 'student', 'host', 'various', 'competition', 'design', 'combine', 'traditional', 'contemporary', 'trend', 'project', 'form', 'creative', 'union', 'professional', 'young', 'aspire', 'designer', 'connect', 'municipality', 'community', 'field', 'design', 'dedicate', 'provide', 'training', 'opportunity', 'as

In [109]:
# Show the phrase-merged token lists of the first 50 documents.
print(data_bigrams_trigrams[:50])

[['situate', 'picturesque', 'valley', 'foothill', 'meeting_point', 'ancient', 'civilization', 'unique', 'location', 'influence', 'city', 'design', 'offer', 'harmonious', 'blend', 'modern', 'technology', 'architectural', 'tradition', 'city', 'centre', 'design', 'renowned', 'innovative', 'project', 'field', 'design', 'public_private', 'sector', 'professional', 'designer', 'play', 'active', 'role', 'today', 'design', 'industry', 'encompass', 'enterprise', 'employ', 'skilled', 'individual', 'showcase', 'commitment', 'urban', 'design', 'host', 'exhibition', 'gather', 'lead', 'local', 'national', 'international', 'designer', 'follower', 'project', 'implement', 'strengthen', 'tie', 'graduate', 'student', 'host', 'various', 'competition', 'design', 'combine', 'traditional', 'contemporary', 'trend', 'project', 'form', 'creative', 'union', 'professional', 'young', 'aspire', 'designer', 'connect', 'municipality', 'community', 'field', 'design', 'dedicate', 'provide', 'training', 'opportunity', 'a

In [110]:
#TF-IDF REMOVAL
# Rebuild the dictionary and corpus from the phrase-merged documents, then
# drop from every document the terms whose TF-IDF weight is below `low_value`
# and the terms that the TF-IDF model leaves out of its result.
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
# print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []                    # every dropped term, collected across documents
words_missing_in_tfidf = []   # after the loop: missing ids of the last document
for i, bow in enumerate(corpus):
    # Transform the document once and reuse the result below.
    doc_tfidf = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}

    low_value_words = [term_id for term_id, value in doc_tfidf if value < low_value]
    # The words with tf-idf score 0 will be missing from the transformed document.
    # This is computed before `drops` so that `words` records the terms missing
    # from THIS document rather than from the previous one.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set membership keeps the filter linear in the document length.
    drop_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in drop_ids]
    corpus[i] = new_bow

In [127]:
# Fit a 7-topic LDA model on the current `corpus` / `id2word`; random_state is
# fixed so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=7,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [128]:
# Render the current topic model inline with pyLDAvis.
pyLDAvis.enable_notebook()

vis = gensimvis.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=10,
    n_jobs=1,
)
vis
