In [1]:
import wikipedia
import nltk

import re
from nltk.stem import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim

import pandas as pd
import numpy as np

from gensim.models import LsiModel

In [2]:
# Fetch the NLTK resources needed for preprocessing and build the English
# stop-word set. `en_stop` is read by preprocess_text(); leaving these lines
# commented out makes a fresh "Restart & Run All" fail with NameError.
# quiet=True keeps the cell output clean when re-running the notebook.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
en_stop = set(nltk.corpus.stopwords.words('english'))

# Topic Modeling with LDA - Latent Dirichlet Allocation

In [None]:
# Download the four Wikipedia articles that make up the toy corpus.
autonomous_vehicle, artificial_intelligence, satellite, eiffel_tower = (
    wikipedia.page(title)
    for title in ("Autonomous Vehicle", "Artificial Intelligence", "Satellite", "Eiffel Tower")
)

# One raw-text document per article, in the same order as the titles above.
corpus = [
    page.content
    for page in (autonomous_vehicle, artificial_intelligence, satellite, eiffel_tower)
]

In [None]:
# NOTE: despite its name this is a lemmatizer, not a stemmer; the name is kept
# so that any other cell referring to `stemmer` keeps working.
stemmer = WordNetLemmatizer()


def preprocess_text(document):
    """Clean a raw document and return its list of content tokens.

    Steps, in order: replace non-word characters with spaces, drop isolated
    single letters, collapse whitespace, strip a leading ``b`` marker,
    lowercase, lemmatize each token, remove stop words and keep only tokens
    longer than five characters.

    Parameters
    ----------
    document : object
        The text to clean; it is converted with ``str()`` first.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens that are not in ``en_stop`` and have
        more than five characters.

    Notes
    -----
    Relies on the module-level names ``stemmer`` and ``en_stop``.
    """
    # Replace every non-word character (punctuation, symbols) with a space
    document = re.sub(r'\W', ' ', str(document))

    # Remove single letters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text.
    # The pattern used to be r'\^[a-zA-Z]\s+': the escaped caret matched a
    # literal '^', which cannot exist after the \W substitution above, so the
    # step never did anything. An unescaped '^' anchors at the start instead.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into one space (case flags are irrelevant here)
    document = re.sub(r'\s+', ' ', document)

    # Strip a leading 'b' left over from the repr of a bytes object
    document = re.sub(r'^b\s+', '', document)

    # Normalise case before lemmatizing and stop-word lookup
    document = document.lower()

    # Lemmatize, then filter out stop words and short tokens
    tokens = [stemmer.lemmatize(word) for word in document.split()]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 5]

    return tokens

In [None]:
# Run each article of the corpus through preprocess_text, producing one token
# list per document (same order as `corpus`).
processed_data = [preprocess_text(article) for article in corpus]

In [None]:
# Build the token -> id dictionary from the tokenised documents.
gensim_dictionary = corpora.Dictionary(processed_data)

# Encode every document as a bag of words (list of (token_id, count) pairs).
# The dictionary already knows every token, so doc2bow is used read-only:
# passing allow_update=True here would register each document a second time
# and inflate the dictionary's statistics (num_docs, dfs, cfs) before it is saved.
gensim_corpus = [gensim_dictionary.doc2bow(doc_tokens) for doc_tokens in processed_data]

In [None]:
# Persist the bag-of-words corpus (pickle) and the dictionary (gensim format).
# The context manager guarantees the pickle file is flushed and closed even if
# dump() raises; the bare open() used before left the handle open.
with open('gensim_corpus_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(gensim_corpus, corpus_file)

gensim_dictionary.save('gensim_dictionary.gensim')

In [None]:
# Train an LDA topic model on the bag-of-words corpus and save it to disk.
# LDA training is stochastic; fixing random_state makes the topics
# reproducible across "Restart & Run All" (the value itself is arbitrary).
lda_model = gensim.models.ldamodel.LdaModel(
    gensim_corpus,
    num_topics=10,
    id2word=gensim_dictionary,
    passes=20,
    random_state=42,
)
lda_model.save('gensim_model.gensim')

In [None]:
# LDA is unsupervised: in a real problem the topics are not known in advance.
# The model only groups words into topics; choosing a human-readable name for
# each topic is left to the analyst.

# Show the five highest-weighted words of every topic, one topic per line.
topics = lda_model.print_topics(num_words=5)
for topic_id, top_words in topics:
    print((topic_id, top_words))

# Topic Modeling via LSI - Latent Semantic Indexing

In [None]:
# Fit a 4-topic LSI model on the same bag-of-words corpus and list the ten
# highest-weighted words of each topic.
lsi_model = LsiModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary)
topics = lsi_model.print_topics(num_words=10)
for topic_id, top_words in topics:
    print((topic_id, top_words))

# Rule-Based Matching - spaCy

In [None]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Load the English transformer pipeline (the model package must be installed).
nlp = spacy.load('en_core_web_trf')

# Token-pattern matcher that shares the pipeline's vocabulary.
m_tool = Matcher(nlp.vocab)

In [None]:
# Four token patterns for the different spellings of "quick brown fox";
# LOWER compares against the lowercased token text.
p1 = [{'LOWER': 'quickbrownfox'}]                      # single fused token
p2 = [                                                  # words joined by punctuation
    {'LOWER': 'quick'},
    {'IS_PUNCT': True},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True},
    {'LOWER': 'fox'},
]
p3 = [{'LOWER': 'quick'}, {'LOWER': 'brown'}, {'LOWER': 'fox'}]   # three separate words
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]                  # last two words fused

# Register all four patterns under one match label.
m_tool.add('QBF', [p1, p2, p3, p4])

sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

# Each match is a (match_id, start_token, end_token) triple.
phrase_matches = m_tool(sentence)
print(phrase_matches )

# Resolve the hashed match id back to its label and show the matched text.
for match_id, start, end in phrase_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, sentence[start:end].text)

# Phrase-Based Matching

In [None]:
import bs4 as bs
import urllib.request
import re
import nltk

# Download the raw HTML of the article. The context manager closes the HTTP
# response (and its socket) even if read() fails; previously it was never closed.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

# Parse the page and keep only the text found inside <p> elements.
parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')

# join() builds the text in one pass instead of repeated string concatenation.
article_text = "".join(p.text for p in paragraphs)

# Normalise: lowercase, replace every non-letter with a space, collapse whitespace.
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub(r'\s+', ' ', processed_article)

In [None]:
# Create the phrase matcher on the pipeline's vocabulary.
phrase_matcher = PhraseMatcher(nlp.vocab)

phrases = ['machine learning', 'robots', 'intelligent agents']

# Each phrase has to be a Doc object before it can be registered.
patterns = [nlp(text) for text in phrases]

# spaCy v3 signature: add(key, list_of_docs). The former v2-style call
# add('AI', None, *patterns) raises in v3, which the rest of this notebook
# targets (Matcher.add with a list of patterns, en_core_web_trf model).
phrase_matcher.add('AI', patterns)

sentence = nlp(processed_article)

# Each match is a (match_id, start_token, end_token) triple.
matched_phrases = phrase_matcher(sentence)

for match_id, start, end in matched_phrases:
    string_id = nlp.vocab.strings[match_id]   # hashed id -> 'AI'
    span = sentence[start:end]                # matched tokens
    print(match_id, string_id, start, end, span.text)

In [None]:
!conda install -c conda-forge pattern

# Downloading Built-In Gensim Models and Datasets

In [2]:
import gensim.downloader as api
from gensim.models import KeyedVectors

In [4]:
# model_list = [
#     'fasttext-wiki-news-subwords-300', 'glove-twitter-100',
#     'glove-twitter-200', 'glove-twitter-25', 'glove-twitter-50',
#     'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200',
#     'glove-wiki-gigaword-300', 'glove-wiki-gigaword-50',
#     'conceptnet-numberbatch-17-06-300'
# ]

# w2v_embedding = api.load('fasttext-wiki-news-subwords-300')

# w2v_embedding.save(
#     r'/mnt/c/Users/handy019/OneDrive - University of South Australia/Python/3 webscrapping/4 do it again/gensim pre-trained model/fasttext-wiki-news-subwords-300.d2v'
# )

In [3]:
# Load the GloVe word vectors stored on disk in gensim's own format.
# NOTE(review): absolute, machine-specific Windows path — other cells use a
# /mnt/c/... path, so confirm which environment this notebook is meant for.
glove_vectors_path = r'C:\Users\duchi\OneDrive - University of South Australia\Python\3 webscrapping\4 do it again\gensim pre-trained model\glove-wiki-gigaword-300.d2v'
w2v_embedding = KeyedVectors.load(glove_vectors_path)

In [29]:
# Display the keyword list.
# NOTE(review): `keywords` is not assigned in any cell visible here; on a fresh
# kernel this raises NameError unless it is defined elsewhere — confirm.
keywords

['OSHA',
 'Occupational Safety and Health Administration',
 'injury rate',
 'safety record',
 'injuries',
 'death',
 'occupational safety',
 'worker safety',
 'safety training',
 'safety performance',
 'safety climate',
 'safety program',
 'work environment safety',
 'fatalities',
 'TRIR',
 'total recordable incident rate',
 'incident rate',
 'injury',
 'injure',
 'workplace safety',
 'illness',
 'employee safety',
 'safety procedure',
 'safety hazard',
 'work safety',
 'safety condition',
 'fatality',
 'employees',
 'workers',
 'employee',
 'worker',
 'staff',
 'healthcare',
 'medical',
 'hospitals',
 'patients',
 'disease',
 'diseases',
 'injured',
 'illnesses',
 'safe',
 'safety',
 'hazards',
 'hazard',
 'accident',
 'accidents',
 'suffered',
 'suffering',
 'casualties',
 'bruises',
 'wounds',
 'hospitalized',
 'wounded',
 'wound',
 'trauma',
 'injuring',
 'hurt',
 'fatal',
 'incidents',
 'deaths',
 'pain',
 'laborer',
 'working',
 'practice',
 'skill',
 'trained',
 'trainees',
 'cr

In [27]:
# Words only (similarity scores dropped) of the 50 nearest neighbours of "toxic".
toxic_neighbours = w2v_embedding.most_similar("toxic", topn=50)
list1 = [word for word, _similarity in toxic_neighbours]

In [167]:
# Seed terms whose nearest neighbours in the embedding are looked up below.
# The name `list_` is also used later to select the word-only columns of the
# resulting DataFrame.
list_ = [
    'neuroplasticity', 'mechanical', 'engineering', 'space', 'aerospace',
    'science', 'earth', 'laser', 'lasers', 'photonics', 'photonic',
    'renewable', 'artificial', 'intelligence', 'management', 'advanced',
    'environmental', 'robotics', 'robotic', 'botany', 'planetarium',
    'astrophysics', 'satellite', 'technologies', 'technology', 'electronic',
    'autonomous', 'driverless', 'software', 'program', 'embryologists',
    'mobility', 'services', 'systems', 'smart', 'energy', 'data', 'e-learning',
    'open-source', 'web-based', 'digital', '3-d', '3d', 'predictive',
    'analytics', 'decision-making', 'consumer-driven', 'app', 'apps',
    'developed', 'smartphone', 'innovations', 'innovation', 'thermal',
    'machine', 'modelling', 'simulation', 'imagery', 'biosecurity', 'tech',
    'internet', 'geolocation', 'communication', 'radio', 'frequency',
    'halogen', 'efficiency', 'efficiencies', 'quartz', 'filament', 'tungsten',
    'power', 'recycled', 'solar', 'photovoltaic', 'battery', 'aerial',
    'mapping', 'developer', 'online', 'platform', 'telehealth', 'start-ups',
    'electricity', 'data-driven', 'solutions', 'crowd-funding', 'virtual',
    'drone', 'cloud-based', 'wearable', 'gps', 'x-ray', 'portable', 'system',
    'pre-order', 'sensors', 'sensor', 'interactive'
]

In [168]:
# For every seed term collect its 50 nearest neighbours: the words go under the
# term itself, the similarity scores under '<term>_score'.
similar = {}

for key in list_:
    try:
        neighbours = w2v_embedding.most_similar(key, topn=50)
    except KeyError:
        # The term is not in the embedding's vocabulary: report it and move on.
        # Only KeyError is caught, so real failures (and Ctrl-C) are no longer
        # silently swallowed as they were with the bare `except:`.
        print(key)
        continue

    similar[key] = [word for word, _score in neighbours]
    similar['{}_score'.format(key)] = [score for _word, score in neighbours]

In [169]:
# w2v_embedding.save(
#     r'/mnt/c/Users/handy019/OneDrive - University of South Australia/Python/3 webscrapping/4 do it again/gensim pre-trained model/{}.d2v'
#     .format(name))

In [170]:
# Spot-check: the 50 nearest neighbours of "geolocation" as (word, similarity) pairs.
w2v_embedding.most_similar("geolocation",topn=50)

[('geocoding', 0.4380856454372406),
 ('location-based', 0.41686856746673584),
 ('websense', 0.40829652547836304),
 ('gnupg', 0.40550944209098816),
 ('cad/cam', 0.39738544821739197),
 ('fpga', 0.3946461081504822),
 ('e-card', 0.39462560415267944),
 ('fluoropolymers', 0.3938526213169098),
 ('fasttrack', 0.39105573296546936),
 ('anti-censorship', 0.39045822620391846),
 ('industry-standard', 0.38938531279563904),
 ('multiseat', 0.38789454102516174),
 ('dropbox', 0.3870222270488739),
 ('levenshtein', 0.3851688802242279),
 ('relatedness', 0.3848205804824829),
 ('high-precision', 0.3845316767692566),
 ('localisation', 0.3833083510398865),
 ('inkscape', 0.3824479281902313),
 ('ipsec', 0.3806017339229584),
 ('vpn', 0.3793243169784546),
 ('anti-spam', 0.378400593996048),
 ('modularization', 0.378022700548172),
 ('microarray', 0.37561485171318054),
 ('benkert', 0.3743286728858948),
 ('webgl', 0.374021053314209),
 ('user-generated', 0.37397557497024536),
 ('utilising', 0.3734482228755951),
 ('ipbe

In [171]:
# Tabulate the neighbour lists: one column per key of `similar`
# (a term column followed by its '<term>_score' column).
similar_frame = pd.DataFrame(similar)

In [173]:
# Write the neighbour table to disk twice: once with the word columns only,
# once with the score columns included. The row index is not written.
# NOTE(review): absolute, machine-specific output paths — confirm before re-running.
words_only_csv = r'/mnt/c/Users/handy019/OneDrive - University of South Australia/Python/3 webscrapping/3 summary/similar_frame.csv'
words_and_scores_csv = r'/mnt/c/Users/handy019/OneDrive - University of South Australia/Python/3 webscrapping/3 summary/similar_frame_with_score.csv'
csv_options = dict(encoding='utf-8', index=None)

words_only_frame = similar_frame[list_]
words_only_frame.to_csv(words_only_csv, **csv_options)
similar_frame.to_csv(words_and_scores_csv, **csv_options)

In [174]:
# Preview the first five rows of the neighbour table.
similar_frame.head()

Unnamed: 0,neuroplasticity,neuroplasticity_score,mechanical,mechanical_score,engineering,engineering_score,space,space_score,aerospace,aerospace_score,...,system,system_score,pre-order,pre-order_score,sensors,sensors_score,sensor,sensor_score,interactive,interactive_score
0,neurogenesis,0.527151,electrical,0.639377,engineers,0.611102,nasa,0.632399,automotive,0.56318,...,systems,0.794524,pre-ordered,0.554411,sensor,0.787914,sensors,0.787914,multimedia,0.666878
1,plasticity,0.481572,hydraulic,0.585176,technology,0.602139,spacecraft,0.625613,aviation,0.553433,...,which,0.513731,pre-orders,0.548903,infrared,0.635932,infrared,0.598667,online,0.574534
2,diamagnetism,0.460442,engineering,0.566597,sciences,0.592431,spaces,0.590746,aeronautics,0.551577,...,mechanism,0.511397,itunes,0.533538,detect,0.619463,device,0.542785,entertainment,0.552217
3,gliosis,0.455925,mechanics,0.487156,engineer,0.583249,shuttle,0.571557,engineering,0.520104,...,control,0.505229,preorder,0.508901,detectors,0.611098,sensing,0.53837,video,0.534405
4,neovascularization,0.455304,vibration,0.474555,science,0.575667,astronauts,0.563398,lockheed,0.519512,...,computerized,0.504438,download-only,0.494121,devices,0.586671,detector,0.52093,digital,0.521121


In [175]:
# Number of seed terms in `list_`.
len(list_)

99