![Image](https://raw.githubusercontent.com/D2I-Melbourne/MOP/master/images/mop-black-100px.png)

# MELBOURNE CITY OPEN DATA PLAYGROUND
---
## Open Data Metadata Topic Visualisation using NLP techniques
---
| Date | Author/Contributor | Change |
| :- | :- | :- |
| 30-Sep-2021 | Oscar Wu | T2 2021 Final Version |
| 9-Dec-2021 | Steven Tuften | Format Notebook for new GitHub Repo |

### ATTRIBUTIONS

### Package/Library Imports

<div class="alert alert-block alert-warning">
    Ensure you first setup Conda environment using conda configuration instructions in this Repository!
</div>

In [None]:
!pip install pyLDAvis==3.2.2
!pip install nltk
!pip install spacy
!pip install gensim
!pip install importlib_metadata

In [44]:
import os
import re
import numpy as np
import pandas as pd

# Open Data Specific libraries
from sodapy import Socrata

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting libraries
import matplotlib.pyplot as plt# NLTK Stop words
import seaborn as sns 

# NLP Libraries
import spacy
import spacy.cli

import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

import pyLDAvis
import pyLDAvis.sklearn

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Connect to Melbourne Open Data

In [19]:
apptoken = os.environ.get("SODAPY_APPTOKEN") # Anonymous app token
domain = "data.melbourne.vic.gov.au"
client = Socrata(domain, apptoken) # Open Dataset connection



## Step 1 : Retrieve Datasets

Retrieve the following information on each dataset:
  - dataset name 
  - id 
  - metadata

In [20]:
content = []
for a in client.datasets():

  content.append([a['resource']['name'],

  a['resource']['columns_name'],
  a['resource']['description'],
  a['classification']['categories'],
  a['classification']['domain_category'],
  a['classification']['tags'],
  a['classification']['domain_tags']])



# remove '[]' and join string
content_lst_inital = []
for each_element in content:
  each_element_list = []
  for element in each_element:
    if type(element) == list:
      element = ' '.join(element)
    else:
      pass
    each_element_list.append(element)
  cont= (' ').join(each_element_list)
  content_lst_inital.append(cont)

In [21]:
content = []
for a in client.datasets():

  content.append([a['resource']['name'],
  a['resource']['columns_name'],
  a['resource']['description'],
  a['classification']['categories'],
  a['classification']['domain_category'],
  a['classification']['tags'],
  a['classification']['domain_tags']])

In [22]:
total_list = []
for each_sent in content:
  test = []
  for i in each_sent:
    if type(i) == list:
      i = " ".join(i)
      test.append(i)
    else:
      test.append(i)
  line_list = " ".join(test)
  total_list.append(line_list)

In [23]:
re.sub(r'[\(\)]',"", '(counts per ) (. ) ( ) (')

'counts per  .    '

In [24]:
# remove emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in total_list]

# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]

# html tags
data = [re.sub(r"<.*?> ", "", sent) for sent in data]

#
data = [re.sub(r'[^a-zA-Z]+', " ", sent) for sent in data]

In [25]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['pedestrian', 'counting', 'system', 'monthly', 'counts', 'per', 'hour', 'id', 'sensor', 'name', 'sensor', 'id', 'mdate', 'hourly', 'counts', 'month', 'year', 'date', 'time', 'time', 'day', 'this', 'dataset', 'contains', 'hourly', 'pedestrian', 'counts', 'since', 'from', 'pedestrian', 'sensor', 'devices', 'located', 'across', 'the', 'city', 'the', 'data', 'is', 'updated', 'on', 'monthly', 'basis', 'and', 'can', 'be', 'used', 'to', 'determine', 'variations', 'in', 'pedestrian', 'activity', 'throughout', 'the', 'day', 'dataset', 'which', 'details', 'the', 'location', 'status', 'and', 'directional', 'readings', 'of', 'sensors', 'any', 'changes', 'to', 'sensor', 'locations', 'are', 'important', 'to', 'consider', 'when', 'analysing', 'and', 'interpreting', 'pedestrian', 'counts', 'over', 'time', 'dataset', 'helps', 'to', 'understand', 'how', 'people', 'use', 'different', 'city', 'locations', 'at', 'different', 'times', 'of', 'day', 'to', 'better', 'inform', 'decision', 'making', 'and', 'pl

In [27]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['pedestrian', 'counting', 'system', 'monthly', 'counts', 'per', 'hour', 'id', 'sensor', 'name', 'sensor', 'id', 'mdate', 'hourly', 'counts', 'month', 'year', 'date', 'time', 'time', 'day', 'this', 'dataset', 'contains', 'hourly', 'pedestrian', 'counts', 'since', 'from', 'pedestrian', 'sensor', 'devices', 'located', 'across', 'the', 'city', 'the', 'data', 'is', 'updated', 'on', 'monthly', 'basis', 'and', 'can', 'be', 'used', 'to', 'determine', 'variations', 'in', 'pedestrian', 'activity', 'throughout', 'the', 'day', 'dataset', 'which', 'details', 'the', 'location', 'status', 'and', 'directional', 'readings', 'of', 'sensors', 'any', 'changes', 'to', 'sensor', 'locations', 'are', 'important', 'to', 'consider', 'when', 'analysing', 'and', 'interpreting', 'pedestrian', 'counts', 'over', 'time', 'dataset', 'helps', 'to', 'understand', 'how', 'people', 'use', 'different', 'city', 'locations', 'at', 'different', 'times', 'of', 'day', 'to', 'better', 'inform', 'decision', 'making', 'and', 'pla

In [28]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [37]:
# Download English language model
spacy.cli.download("en_core_web_sm")

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['pedestrian', 'counting', 'system', 'monthly', 'count', 'hour', 'd', 'sensor', 'name', 'sensor', 'd', 'mdate', 'hourly', 'count', 'month', 'year', 'date', 'time', 'time', 'day', 'dataset', 'contain', 'hourly', 'pedestrian', 'count', 'pedestrian', 'sensor', 'device', 'locate', 'city', 'datum', 'update', 'monthly', 'basis', 'use', 'determine', 'variation', 'pedestrian', 'activity', 'day', 'dataset', 'detail', 'location', 'status', 'directional', 'reading', 'sensor', 'change', 'sensor', 'location', 'important', 'consider', 'analyse', 'interpret', 'pedestrian', 'count', 'time', 'dataset', 'help', 'understand', 'people', 'different', 'city', 'location', 'different', 'time', 'day', 'well', 'inform', 'decision', 'make', 'plan', 'future', 'representation', 'pedestrian', 'volume', 'compare', 'location', 'give', 'day', 'time', 'find', 'finance', 'transport', 'accessibility', 'covid', 'foot', 'traffic', 'pedestrian', 'count', 'pedestrian', 'safemobility', 'sensor', 'traffic', 'flow']]


In [38]:
data_lemmatized[0:8]

[['pedestrian',
  'counting',
  'system',
  'monthly',
  'count',
  'hour',
  'd',
  'sensor',
  'name',
  'sensor',
  'd',
  'mdate',
  'hourly',
  'count',
  'month',
  'year',
  'date',
  'time',
  'time',
  'day',
  'dataset',
  'contain',
  'hourly',
  'pedestrian',
  'count',
  'pedestrian',
  'sensor',
  'device',
  'locate',
  'city',
  'datum',
  'update',
  'monthly',
  'basis',
  'use',
  'determine',
  'variation',
  'pedestrian',
  'activity',
  'day',
  'dataset',
  'detail',
  'location',
  'status',
  'directional',
  'reading',
  'sensor',
  'change',
  'sensor',
  'location',
  'important',
  'consider',
  'analyse',
  'interpret',
  'pedestrian',
  'count',
  'time',
  'dataset',
  'help',
  'understand',
  'people',
  'different',
  'city',
  'location',
  'different',
  'time',
  'day',
  'well',
  'inform',
  'decision',
  'make',
  'plan',
  'future',
  'representation',
  'pedestrian',
  'volume',
  'compare',
  'location',
  'give',
  'day',
  'time',
  'find',

In [39]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 5), (10, 1), (11, 1), (12, 2), (13, 3), (14, 1), (15, 1), (16, 4), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 4), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 8), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 6), (49, 1), (50, 1), (51, 5), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)]]


In [40]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('accessibility', 1),
  ('activity', 1),
  ('analyse', 1),
  ('basis', 1),
  ('change', 1),
  ('city', 2),
  ('compare', 1),
  ('consider', 1),
  ('contain', 1),
  ('count', 5),
  ('counting', 1),
  ('covid', 1),
  ('d', 2),
  ('dataset', 3),
  ('date', 1),
  ('datum', 1),
  ('day', 4),
  ('decision', 1),
  ('detail', 1),
  ('determine', 1),
  ('device', 1),
  ('different', 2),
  ('directional', 1),
  ('finance', 1),
  ('find', 1),
  ('flow', 1),
  ('foot', 1),
  ('future', 1),
  ('give', 1),
  ('help', 1),
  ('hour', 1),
  ('hourly', 2),
  ('important', 1),
  ('inform', 1),
  ('interpret', 1),
  ('locate', 1),
  ('location', 4),
  ('make', 1),
  ('mdate', 1),
  ('month', 1),
  ('monthly', 2),
  ('name', 1),
  ('pedestrian', 8),
  ('people', 1),
  ('plan', 1),
  ('reading', 1),
  ('representation', 1),
  ('safemobility', 1),
  ('sensor', 6),
  ('status', 1),
  ('system', 1),
  ('time', 5),
  ('traffic', 2),
  ('transport', 1),
  ('understand', 1),
  ('update', 1),
  ('use', 1),
  ('v

## Meta Data cleaning Done

In [41]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=14, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [42]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.063*"building" + 0.058*"energy" + 0.055*"datum" + 0.044*"dataset" + '
  '0.027*"scope" + 0.025*"council" + 0.024*"model" + 0.023*"build" + '
  '0.023*"attribute" + 0.023*"environment"'),
 (1,
  '0.048*"city" + 0.046*"melbourne" + 0.024*"baseline" + 0.024*"single" + '
  '0.021*"group" + 0.020*"location" + 0.020*"bin" + 0.017*"process" + '
  '0.017*"specie" + 0.014*"online"'),
 (2,
  '0.059*"parking" + 0.040*"datum" + 0.031*"time" + 0.030*"restriction" + '
  '0.026*"sensor" + 0.023*"record" + 0.023*"street" + 0.021*"value" + '
  '0.018*"bay" + 0.017*"transport"'),
 (3,
  '0.058*"weight" + 0.055*"collect" + 0.032*"commingle" + 0.032*"waste" + '
  '0.031*"record" + 0.031*"recycling_facility" + 0.022*"bale" + 0.022*"day" + '
  '0.021*"estimate" + 0.021*"financial"'),
 (4,
  '0.068*"city" + 0.053*"melbourne" + 0.030*"business" + 0.027*"environment" + '
  '0.024*"property" + 0.018*"tree" + 0.018*"http" + 0.017*"public" + '
  '0.017*"residential" + 0.016*"open"'),
 (5,
  '0.075*"soil

# NOTE : Workbook cells beyond this point require testing and fixes!
## Table

In [45]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

AttributeError: module 'pyLDAvis' has no attribute 'gensim'

In [49]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\61412\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\61412\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [50]:
each_char = 'trees'

lemmatizer.lemmatize(each_char, get_wordnet_pos(each_char))

'tree'

In [None]:
nlp_content = []
for each_sentence in words:
  list_1 = []
  for each_char in nltk.word_tokenize(each_sentence):
    tk=lemmatizer.lemmatize(each_char, get_wordnet_pos(each_char))
    list_1.append(tk)
  nlp_content.append(list_1)

In [None]:
nlp_content[0]

In [None]:
## check sparsicity 

# make it to sentence 
# nlp_content_string= sentence 
# nlp_content = nlp_content_string.split(' ')
nlp_content_string = [' '.join(i) for i in nlp_content]


#remove built-in english stopwords, convert all words to lowercase, 
#and a word can contain numbers and alphabets of at least length 3 in order to be qualified as a word.
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )


In [None]:
data_vectorized = vectorizer.fit_transform(nlp_content_string)
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

In [None]:
data_vectorized.shape

In [None]:
search_params = {'n_components': list(range(1,223)), 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

In [None]:
test_model_14 =LatentDirichletAllocation(batch_size=6, n_components = 14,
                        random_state=0,
                        learning_decay=0.7,
                        max_iter= 10,
                        max_doc_update_iter=100,
                        learning_method='batch')
test_model_14.fit(data_vectorized)

In [None]:
test_model_6 =LatentDirichletAllocation(batch_size=6, n_components = 6,
                        random_state=0,
                        learning_decay=0.7,
                        max_iter= 10,
                        max_doc_update_iter=100,
                        learning_method='batch')
test_model_6.fit(data_vectorized)

In [None]:


#data is data_vectorized
def return_topic_visual_pro_topic(model, data=data_vectorized):

  # Create Document - Topic Matrix
  lda_output = model.transform(data)
  # column names
  topicnames = ["Topic" + str(i) for i in range(model.n_components)]
  # index names
  docnames = ["dataset" + str(i) for i in range(data.shape[0])]

  # Make the pandas dataframe
  df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

  # Get dominant topic for each document
  dominant_topic = np.argmax(df_document_topic.values, axis=1)
  df_document_topic['dominant_topic'] = dominant_topic

  # Styling
  def color_green(val):
      color = 'green' if val > .1 else 'black'
      return 'color: {col}'.format(col=color)

  def make_bold(val):
      weight = 700 if val > .1 else 400
      return 'font-weight: {weight}'.format(weight=weight)


  # Apply Style
  df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
  

  return  df_document_topics, dominant_topic


In [None]:
# replace model to see difference 
df_document_topics, dominant_topic  = return_topic_visual_pro_topic(test_model_14, data_vectorized)
df_document_topics


In [None]:
#topic_n = df_document_topic['dominant_topic']
#df.loc[:,'topic_n'] = topic_n.values
df['content'] = nlp_content_string
df['topic_n'] = dominant_topic

In [None]:
df[df.loc[:,'topic_n'] == 13]

In [None]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(test_model_14, data_vectorized, vectorizer, mds='tsne')
panel