In [1]:
import os
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
import matplotlib.pyplot as plt
import sklearn
import keras
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# Locate the sample corpus inside the installed gensim package's test data.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
print(test_data_dir)
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
print(lee_train_file)
# Read the whole file into one string. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open(lee_train_file) as corpus_file:
    text = corpus_file.read()

C:\Users\Devmallya Karar\anaconda3\lib\site-packages\gensim\test\test_data
C:\Users\Devmallya Karar\anaconda3\lib\site-packages\gensim\test\test_data\lee_background.cor


In [24]:
# Load the spaCy pipeline before its first use. The parse of the corpus was
# previously placed above the load, so a fresh-kernel "Run All" failed with
# NameError on `nlp`.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over the full corpus string read in the previous cell.
# NOTE(review): custom stop-word flags are set on `nlp.vocab` in a later cell;
# they are expected to be visible on this doc's tokens because the vocabulary
# is shared - confirm against the spaCy version in use.
doc = nlp(text)
# Display the loaded pipeline object as the cell output.
nlp

<spacy.lang.en.English at 0x20993cf3280>

In [4]:
# Extra stop words to flag on top of the pipeline's own list.
# The second entry is written as a raw string: it is the same two characters
# (backslash + "s") as before, but no longer relies on an invalid escape
# sequence, which newer Python versions warn about.
# NOTE(review): this entry may have been meant to be the possessive "'s" -
# confirm the intended token.
my_stop_words = ['say', r'\s', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be']
for stopword in my_stop_words:
    # Look the word up in the pipeline vocabulary and mark it as a stop word.
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True
# Prints only the lexeme of the last word processed by the loop.
print(lexeme)

<spacy.lexeme.Lexeme object at 0x0000020993DD3080>


In [5]:
# Short example sentence, run through the pipeline to produce `sent`.
example_sentence = 'Last Thursday, Manchester United defeated AC Milan at San Siro.'
sent = nlp(example_sentence)

In [6]:
# Echo the parsed sentence as the cell output.
sent

Last Thursday, Manchester United defeated AC Milan at San Siro.

In [11]:
# One line per token: its text, its `pos_` value and its `tag_` value.
for token in sent:
    print(token.text, token.pos_, token.tag_, sep=' - ')

Last - ADJ - JJ
Thursday - PROPN - NNP
, - PUNCT - ,
Manchester - PROPN - NNP
United - PROPN - NNP
defeated - VERB - VBD
AC - PROPN - NNP
Milan - PROPN - NNP
at - ADP - IN
San - PROPN - NNP
Siro - PROPN - NNP
. - PUNCT - .


In [12]:
# One line per token: its text and its `ent_type_` (empty when the token is
# not part of an entity, as the recorded output shows).
for token in sent:
    print(token.text, token.ent_type_, sep=' - ')

Last - DATE
Thursday - DATE
, - 
Manchester - ORG
United - ORG
defeated - 
AC - ORG
Milan - ORG
at - 
San - GPE
Siro - GPE
. - 


In [13]:
# One line per entity span in the sentence: the span text and its label.
for ent in sent.ents:
    print(ent.text, ent.label_, sep=' - ')

Last Thursday - DATE
Manchester United - ORG
AC Milan - ORG
San Siro - GPE


In [14]:
# Render the sentence with displaCy's 'ent' style, in notebook mode.
displacy.render(sent, style='ent', jupyter=True)

In [16]:
# One line per noun chunk: chunk text, the chunk's root token, that root's
# dependency label, and the text of the root's head token.
for chunk in sent.noun_chunks:
    chunk_root = chunk.root
    print(chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text, sep=' - ')

Manchester United - United - nsubj - defeated
AC Milan - Milan - dobj - defeated
San Siro - Siro - pobj - at


In [18]:
# One line per token: text, dependency label, head token text, head token
# `pos_`, and the list of the token's children.
for token in sent:
    head_token = token.head
    children = list(token.children)
    print(token.text, token.dep_, head_token.text, head_token.pos_, children, sep=' - ')

Last - amod - Thursday - PROPN - []
Thursday - npadvmod - defeated - VERB - [Last]
, - punct - defeated - VERB - []
Manchester - compound - United - PROPN - []
United - nsubj - defeated - VERB - [Manchester]
defeated - ROOT - defeated - VERB - [Thursday, ,, United, Milan, at, .]
AC - compound - Milan - PROPN - []
Milan - dobj - defeated - VERB - [AC]
at - prep - defeated - VERB - [Siro]
San - compound - Siro - PROPN - []
Siro - pobj - at - ADP - [San]
. - punct - defeated - VERB - []


In [19]:
# Render the sentence with displaCy's 'dep' style, in notebook mode, passing
# a 'distance' option of 90.
displacy.render(sent, style='dep', jupyter=True, options={'distance':90})

In [25]:
# Split the parsed corpus into one list of lemmas per article.
# `texts` collects the finished articles; `article` accumulates the lemmas of
# the article currently being read.
texts, article = [], []

for word in doc:
    if word.text == '\n':
        # A newline token closes the current article and starts a new one.
        texts.append(article)
        article = []
    elif not (word.is_stop or word.is_punct or word.like_num or word.text == 'I'):
        # Keep the lemma of every token that is not a stop word, punctuation,
        # number-like, or the literal 'I'.
        article.append(word.lemma_)

# Keep a final article that is not followed by a newline token; without this
# flush it would be silently dropped. No effect when the text ends in '\n'.
if article:
    texts.append(article)
    article = []

In [26]:
# Show the lemma list built for the first article.
print(texts[0])

['hundred', 'people', 'force', 'vacate', 'home', 'Southern', 'Highlands', 'New', 'South', 'Wales', 'strong', 'wind', 'push', 'huge', 'bushfire', 'town', 'Hill', 'new', 'blaze', 'near', 'Goulburn', 'south', 'west', 'Sydney', 'force', 'closure', 'Hume', 'Highway', '4:00pm', 'AEDT', 'marked', 'deterioration', 'weather', 'storm', 'cell', 'move', 'east', 'Blue', 'Mountains', 'force', 'authority', 'decision', 'evacuate', 'people', 'home', 'outlying', 'street', 'Hill', 'New', 'South', 'Wales', 'southern', 'highland', 'estimated', 'resident', 'leave', 'home', 'nearby', 'Mittagong', 'New', 'South', 'Wales', 'Rural', 'Fire', 'Service', 'weather', 'condition', 'cause', 'fire', 'burn', 'finger', 'formation', 'ease', 'fire', 'unit', 'Hill', 'optimistic', 'defend', 'property', 'blaze', 'burn', 'New', 'Year', 'Eve', 'New', 'South', 'Wales', 'fire', 'crew', 'call', 'new', 'fire', 'Gunning', 'south', 'Goulburn', 'detail', 'available', 'stage', 'fire', 'authority', 'close', 'Hume', 'Highway', 'direction

In [27]:
# Train a phrase model on the tokenised articles.
bigram = gensim.models.phrases.Phrases(texts)
# Apply the trained model to every article, then apply it a second time to
# the result, exactly as the cell did before.
# NOTE(review): confirm the second application is intentional and not a
# duplicated line.
first_pass = [bigram[line] for line in texts]
texts = [bigram[line] for line in first_pass]

In [28]:
# Show the first article again, after the phrase model has been applied.
print(texts[0])

['hundred', 'people', 'force', 'vacate', 'home', 'Southern', 'Highlands', 'New_South', 'Wales', 'strong', 'wind', 'push', 'huge', 'bushfire', 'town', 'Hill', 'new', 'blaze', 'near', 'Goulburn', 'south_west', 'Sydney', 'force', 'closure', 'Hume', 'Highway', '4:00pm', 'AEDT', 'marked', 'deterioration', 'weather', 'storm', 'cell', 'move', 'east', 'Blue_Mountains', 'force', 'authority', 'decision', 'evacuate', 'people', 'home', 'outlying', 'street', 'Hill', 'New_South', 'Wales', 'southern', 'highland', 'estimated', 'resident', 'leave', 'home', 'nearby', 'Mittagong', 'New_South', 'Wales', 'Rural_Fire', 'Service', 'weather_condition', 'cause', 'fire_burn', 'finger', 'formation', 'ease', 'fire', 'unit', 'Hill', 'optimistic', 'defend', 'property', 'blaze', 'burn', 'New', 'Year', 'Eve', 'New_South', 'Wales', 'fire', 'crew', 'call', 'new', 'fire', 'Gunning', 'south', 'Goulburn', 'detail', 'available', 'stage', 'fire', 'authority', 'close', 'Hume', 'Highway', 'direction', 'new', 'fire', 'Sydney',

In [29]:
# Build the token dictionary from all articles, then convert each article
# with `doc2bow`; the recorded output shows (id, count) pairs.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

In [30]:
# Show the `doc2bow` result for the second article.
print(corpus[1])

[(71, 1), (83, 1), (91, 1), (93, 1), (94, 1), (108, 1), (109, 1), (110, 1), (111, 4), (112, 1), (113, 1), (114, 1), (115, 1), (116, 2), (117, 1), (118, 1), (119, 3), (120, 1), (121, 1), (122, 1), (123, 2), (124, 3), (125, 1), (126, 2), (127, 2), (128, 1), (129, 1), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 3), (139, 1), (140, 1), (141, 1), (142, 2), (143, 1), (144, 1), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1), (150, 3), (151, 3), (152, 1), (153, 1), (154, 2), (155, 1), (156, 1), (157, 2), (158, 1), (159, 1), (160, 1), (161, 1), (162, 1), (163, 1), (164, 1), (165, 1), (166, 1), (167, 1), (168, 1), (169, 1), (170, 2), (171, 1), (172, 1), (173, 1), (174, 1), (175, 1), (176, 1)]


In [31]:
# Fit an LSI model with 10 topics on the bag-of-words corpus.
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)
# Display 5 of the fitted topics as the cell output.
lsi_model.show_topics(num_topics=5)

[(0,
  '-0.235*"israeli" + -0.214*"Arafat" + -0.202*"palestinian" + -0.176*"force" + -0.159*"kill" + -0.159*"official" + -0.151*"attack" + -0.141*"people" + -0.120*"day" + -0.117*"Israel"'),
 (1,
  '0.310*"israeli" + 0.301*"Arafat" + 0.278*"palestinian" + -0.169*"Afghanistan" + 0.162*"Sharon" + -0.159*"Australia" + 0.154*"Israel" + 0.128*"Hamas" + 0.122*"West_Bank" + -0.119*"force"'),
 (2,
  '-0.257*"Afghanistan" + -0.217*"force" + 0.181*"fire" + -0.180*"Al_Qaeda" + -0.170*"bin_Laden" + -0.147*"Pakistan" + 0.144*"Sydney" + -0.139*"Taliban" + -0.129*"fighter" + 0.129*"Australia"'),
 (3,
  '0.383*"fire" + 0.275*"area" + 0.207*"Sydney" + -0.206*"Australia" + 0.176*"firefighter" + 0.162*"north" + 0.153*"wind" + 0.138*"New_South" + 0.138*"Wales" + 0.133*"south"'),
 (4,
  '-0.269*"company" + -0.172*"union" + -0.167*"Qantas" + 0.142*"match" + 0.141*"South_Africa" + -0.138*"worker" + 0.136*"wicket" + 0.131*"win" + 0.131*"Test" + 0.121*"day"')]

In [32]:
# Fit an HDP model on the bag-of-words corpus.
# A fixed `random_state` makes the fitted topics reproducible across kernel
# restarts; without it every run can produce different topics.
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)
# Display the fitted topics as the cell output.
hdp_model.show_topics()

[(0,
  '0.003*Government + 0.003*israeli + 0.003*group + 0.003*palestinian + 0.003*match + 0.003*kill + 0.003*Sharon + 0.002*attack + 0.002*Hamas + 0.002*ask + 0.002*Gaza_Strip + 0.002*militant + 0.002*Australia + 0.002*play + 0.002*win + 0.002*Palestinian_Authority + 0.002*choose + 0.002*arrest + 0.002*target + 0.002*launch'),
 (1,
  '0.004*airport + 0.003*Taliban + 0.002*kill + 0.002*Kandahar + 0.002*Launceston + 0.002*Virgin + 0.002*force + 0.002*night + 0.002*opposition + 0.002*tell + 0.002*leave + 0.002*city + 0.002*near + 0.002*fighter + 0.001*flight + 0.001*Laden + 0.001*terminal + 0.001*road + 0.001*Agha + 0.001*Sherrard'),
 (2,
  '0.006*storm + 0.003*hit + 0.003*Sydney + 0.003*damage + 0.003*tree + 0.003*report + 0.003*area + 0.002*work + 0.002*north + 0.002*volunteer + 0.002*company + 0.002*SES + 0.002*home + 0.002*bad + 0.002*continue + 0.002*Emergency + 0.002*state + 0.002*Martin + 0.002*New_South + 0.002*Wales'),
 (3,
  '0.002*union + 0.002*Indonesia + 0.002*Howard + 0.002

In [33]:
# Fit an LDA model with 10 topics on the bag-of-words corpus.
# A fixed `random_state` makes the fitted topics reproducible across kernel
# restarts; without it every run can produce different topics.
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)
# Display the fitted topics as the cell output.
lda_model.show_topics()

[(0,
  '0.006*"force" + 0.004*"day" + 0.003*"Sydney" + 0.003*"come" + 0.003*"people" + 0.003*"tell" + 0.003*"Government" + 0.003*"Melbourne" + 0.003*"official" + 0.003*"area"'),
 (1,
  '0.008*"people" + 0.006*"fire" + 0.005*"kill" + 0.004*"year" + 0.003*"tell" + 0.003*"think" + 0.002*"service" + 0.002*"Sydney" + 0.002*"Australia" + 0.002*"call"'),
 (2,
  '0.006*"israeli" + 0.006*"man" + 0.005*"palestinian" + 0.005*"day" + 0.004*"Australia" + 0.004*"official" + 0.004*"United_States" + 0.004*"force" + 0.004*"report" + 0.004*"people"'),
 (3,
  '0.005*"child" + 0.005*"Qantas" + 0.004*"people" + 0.004*"day" + 0.004*"think" + 0.004*"company" + 0.003*"call" + 0.003*"union" + 0.003*"come" + 0.003*"Australian"'),
 (4,
  '0.005*"company" + 0.005*"day" + 0.004*"Labor" + 0.004*"union" + 0.003*"start" + 0.003*"family" + 0.003*"win" + 0.003*"know" + 0.003*"expect" + 0.003*"come"'),
 (5,
  '0.008*"Australia" + 0.006*"Government" + 0.005*"new" + 0.004*"year" + 0.004*"people" + 0.004*"Afghanistan" + 0.

In [38]:
# Interactive visualisation of the LDA model.
import pyLDAvis
# Newer pyLDAvis releases expose the gensim adapter as `gensim_models`; older
# ones call it `gensim`. Try the new name first and fall back to the old one
# so the cell runs on either.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
# The prepared visualisation is the cell output.
gensimvis.prepare(lda_model, corpus, dictionary)