In [1]:
import wikipedia
import nltk

import re
from nltk.stem import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim

import pandas as pd
import numpy as np

from gensim.models import LsiModel

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EricHan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\EricHan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Topic Modeling with LDA - Latent Dirichlet Allocation

In [3]:
autonomous_vehicle = wikipedia.page("Autonomous Vehicle")
artificial_intelligence = wikipedia.page("Artificial Intelligence")
satellite = wikipedia.page("Satellite")
eiffel_tower = wikipedia.page("Eiffel Tower")

corpus = [autonomous_vehicle.content, artificial_intelligence.content, satellite.content, eiffel_tower.content]

In [4]:
stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens

In [5]:
#iterate through the corpus list that contains the four Wikipedia articles and preprocess text
processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

In [6]:
#use this list to create a dictionary and corresponding bag of words corpus
gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]

In [7]:
#save our dictionary as well as the bag of words corpus using pickle
pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

In [8]:
#create LDA model in Gensim, use the LdaModel class, pass the bag of words corpus to the LdaModel constructor
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=10, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')

In [9]:
#It is important to mention here that LDA is an unsupervised learning algorithm and 
#in real-world problems, you will not know about the topics in the dataset beforehand. 
#You will simply be given a corpus, the topics will be created using LDA 
#and then the names of the topics are up to you.

topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.093*"satellite" + 0.009*"launch" + 0.009*"united" + 0.009*"communication" + 0.009*"launched"')
(1, '0.000*"satellite" + 0.000*"intelligence" + 0.000*"artificial" + 0.000*"learning" + 0.000*"problem"')
(2, '0.016*"intelligence" + 0.015*"machine" + 0.014*"learning" + 0.014*"problem" + 0.012*"artificial"')
(3, '0.000*"satellite" + 0.000*"cruise" + 0.000*"vehicle" + 0.000*"eiffel" + 0.000*"artificial"')
(4, '0.000*"cruise" + 0.000*"problem" + 0.000*"satellite" + 0.000*"learning" + 0.000*"machine"')
(5, '0.000*"satellite" + 0.000*"intelligence" + 0.000*"learning" + 0.000*"artificial" + 0.000*"machine"')
(6, '0.069*"cruise" + 0.030*"vehicle" + 0.014*"francisco" + 0.012*"driving" + 0.011*"autonomous"')
(7, '0.032*"eiffel" + 0.010*"second" + 0.009*"french" + 0.007*"structure" + 0.007*"exposition"')
(8, '0.000*"satellite" + 0.000*"eiffel" + 0.000*"cruise" + 0.000*"machine" + 0.000*"vehicle"')
(9, '0.000*"cruise" + 0.000*"eiffel" + 0.000*"vehicle" + 0.000*"satellite" + 0.000*"machine"')


# Topic Modeling via LSI - Latent Semantic Indexing

In [10]:
lsi_model = LsiModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary)
topics = lsi_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.317*"intelligence" + 0.291*"machine" + 0.281*"learning" + 0.272*"problem" + 0.246*"artificial" + 0.183*"network" + 0.145*"system" + 0.143*"knowledge" + 0.138*"search" + 0.137*"program"')
(1, '-0.746*"cruise" + -0.332*"vehicle" + -0.155*"francisco" + -0.128*"driving" + -0.115*"announced" + -0.115*"service" + -0.114*"company" + -0.114*"autonomous" + -0.101*"satellite" + -0.099*"pedestrian"')
(2, '-0.915*"satellite" + 0.124*"cruise" + -0.089*"launch" + -0.083*"communication" + -0.082*"launched" + -0.082*"united" + -0.074*"observation" + -0.062*"rocket" + -0.055*"sputnik" + -0.049*"scientific"')
(3, '-0.651*"eiffel" + -0.206*"second" + -0.184*"french" + -0.148*"structure" + -0.142*"exposition" + -0.134*"tallest" + -0.116*"engineer" + -0.109*"restaurant" + -0.109*"construction" + -0.108*"france"')


# Rule-Based Matching - spacy

In [50]:
# !python -m spacy download en_core_web_trf
# !pip install spacy-transformers

import spacy

import spacy_transformers

nlp = spacy.load("en_core_web_sm")

import en_core_web_sm
nlp = en_core_web_sm.load()

from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

m_tool = Matcher(nlp.vocab)

In [51]:
p1 = [{'LOWER': 'quickbrownfox'}]
p2 = [{'LOWER': 'quick'}, {'IS_PUNCT': True}, {'LOWER': 'brown'}, {'IS_PUNCT': True}, {'LOWER': 'fox'}]
p3 = [{'LOWER': 'quick'}, {'LOWER': 'brown'}, {'LOWER': 'fox'}]
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]

m_tool.add('QBF', [p1,p2,p3,p4])

sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

phrase_matches = m_tool(sentence)
print(phrase_matches )

for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]  
    span = sentence[start:end]                   
    print(match_id, string_id, start, end, span.text)

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31)]
12825528024649263697 QBF 1 6 quick-brown-fox
12825528024649263697 QBF 13 16 quick brown fox
12825528024649263697 QBF 21 22 quickbrownfox
12825528024649263697 QBF 29 31 quick brownfox


# Phrase-Based Matching

In [31]:
import bs4 as bs  
import urllib.request  
import re  
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')  
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:  
    article_text += p.text
    
    
processed_article = article_text.lower()  
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
processed_article = re.sub(r'\s+', ' ', processed_article)

In [32]:
#Create Phrase Matcher Object
phrase_matcher = PhraseMatcher(nlp.vocab)

phrases = ['machine learning', 'robots', 'intelligent agents']

patterns = [nlp(text) for text in phrases]

phrase_matcher.add('AI', None, *patterns)

sentence = nlp(processed_article)

matched_phrases = phrase_matcher(sentence)

for match_id, start, end in matched_phrases:
    string_id = nlp.vocab.strings[match_id]  
    span = sentence[start:end]                   
    print(match_id, string_id, start, end, span.text)

5530044837203964789 AI 1086 1088 machine learning
5530044837203964789 AI 1119 1121 machine learning
5530044837203964789 AI 1231 1233 machine learning
5530044837203964789 AI 2444 2446 machine learning
5530044837203964789 AI 2968 2970 machine learning
5530044837203964789 AI 2987 2989 machine learning
5530044837203964789 AI 3462 3464 machine learning
5530044837203964789 AI 3501 3503 machine learning
5530044837203964789 AI 3973 3975 machine learning
5530044837203964789 AI 4046 4048 machine learning
5530044837203964789 AI 4383 4385 machine learning
5530044837203964789 AI 4421 4423 machine learning
5530044837203964789 AI 4454 4456 machine learning
5530044837203964789 AI 4736 4738 machine learning
5530044837203964789 AI 4794 4796 machine learning
5530044837203964789 AI 5088 5089 robots
5530044837203964789 AI 5185 5187 machine learning
5530044837203964789 AI 5255 5257 machine learning
5530044837203964789 AI 5387 5388 robots
5530044837203964789 AI 7216 7218 machine learning
5530044837203964789 

In [40]:
# !conda install -c conda-forge pattern

# Downloading Built-In Gensim Models and Datasets

In [41]:
import gensim.downloader as api
from gensim.models import KeyedVectors

In [42]:
# model_list = [
#     'fasttext-wiki-news-subwords-300', 'glove-twitter-100',
#     'glove-twitter-200', 'glove-twitter-25', 'glove-twitter-50',
#     'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200',
#     'glove-wiki-gigaword-300', 'glove-wiki-gigaword-50',
#     'conceptnet-numberbatch-17-06-300'
# ]

# w2v_embedding = api.load('fasttext-wiki-news-subwords-300')

# w2v_embedding.save(
#     r'/mnt/c/Users/handy019/OneDrive - University of South Australia/Python/3 webscrapping/4 do it again/gensim pre-trained model/fasttext-wiki-news-subwords-300.d2v'
# )

In [43]:
w2v_embedding = KeyedVectors.load(
    r'F:\All data and docs\Python\3 webscrapping\4 do it again\gensim pre-trained model\glove-wiki-gigaword-300.d2v'
)

In [44]:
w2v_embedding.most_similar("photovoltaic",topn=5)

[('photovoltaics', 0.6885057091712952),
 ('solar', 0.6298309564590454),
 ('thin-film', 0.5802013278007507),
 ('pv', 0.5194124579429626),
 ('geothermal', 0.49324947595596313)]

In [45]:
list1 = [x[0] for x in w2v_embedding.most_similar("decision-making",topn=50)]

In [47]:
list1

['policy-making',
 'decisionmaking',
 'problem-solving',
 'decision-makers',
 'organizational',
 'processes',
 'participatory',
 'organisational',
 'participative',
 'lawmaking',
 'policymaking',
 'budgeting',
 'top-down',
 'normative',
 'governance',
 'deliberative',
 'evidence-based',
 'decisions',
 'decentralized',
 'bottom-up',
 'centralized',
 'socialization',
 'reasoning',
 'conceptualization',
 'facilitates',
 'methodology',
 'competence',
 'methodologies',
 'outcomes',
 'hierarchical',
 'contexts',
 'heuristics',
 'rational',
 'delegated',
 'stakeholders',
 'rationality',
 'day-to-day',
 'collaborative',
 'subjective',
 'consensus-based',
 'discourse',
 'competences',
 'optimal',
 'know-how',
 'cognition',
 'competencies',
 'workflows',
 'interpersonal',
 'decentralization',
 'co-ordinate']