In [29]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

In [214]:
documents = ['Wage conflict in retail business grows',
			 'Higher wage for cafeteria employees',
			 'Retailing Wage Dispute Expands',
			 'Train Crash Near Petershausen',
			 'Five Deaths in Crash of Police Helicopter']

In [215]:
docs = [list(map(lemmatizer.lemmatize, filter(lambda x: x not in stops, word_tokenize(doc.lower())))) for doc in documents]


In [216]:
word2ind = {}
ind2word = []

for sent in docs:
    for word in sent:
        ind2word.append(word)
    
ind2word = set(ind2word)

for ind, word in enumerate(ind2word):
    word2ind[word] = ind
    
print(word2ind)

{'retail': 0, 'conflict': 1, 'business': 2, 'death': 3, 'higher': 4, 'crash': 5, 'dispute': 6, 'grows': 7, 'train': 8, 'near': 9, 'retailing': 10, 'helicopter': 11, 'wage': 12, 'expands': 13, 'police': 14, 'cafeteria': 15, 'five': 16, 'employee': 17, 'petershausen': 18}


In [217]:
import numpy as np

mat = np.zeros((len(documents), len(word2ind)))
for ind, doc in enumerate(docs):
    for word in doc:
        mat[ind, word2ind[word]] += 1
        
print(mat, mat.shape)

[[1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]] (5, 19)


In [223]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(lowercase=True, stop_words=stops, tokenizer=word_tokenize)
X = vectorizer.fit_transform(documents)
counts = X.toarray()  # Get the doc-term count matrix
dt = counts > 0       # Convert to a binary matrix
doc_term_mat = dt * 1 # If you prefer, represent as 1s and 0s



Matrices have same absolute value, but are different since word indexes are not the same.

In [224]:
query_vec = (vectorizer.transform(['retail']).toarray() + vectorizer.transform(['wage']).toarray()).T

print('Document product', doc_term_mat.dot(query_vec))

normalize = (doc_term_mat.T / np.array([len(d.split()) for d in documents])).T

print('Normalize product', normalize.dot(query_vec))

Document product [[2]
 [1]
 [1]
 [0]
 [0]]
Normalize product [[0.33333333]
 [0.2       ]
 [0.25      ]
 [0.        ]
 [0.        ]]


Normalizing breaks the tie between the two documents, but the most 'important' document still remains at the first place.

In [228]:
tf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=stops, tokenizer=word_tokenize)
tf_X = tf_vectorizer.fit_transform(documents).toarray()

retail_vec = tf_vectorizer.transform(['retail']).toarray()
wage_vec = tf_vectorizer.transform(['wage']).toarray()

In [229]:
idf = ((tf_X.dot((retail_vec + wage_vec).T)))

In [252]:
import itertools
doc_count = len(documents)
doc_list = [i for i in range(doc_count)]
doc_pairs = list(itertools.combinations(doc_list, 2))

for aa, bb in doc_pairs:
    a,b = doc_term_mat[aa], doc_term_mat[bb]
    sim = (np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    if sim > 0:
        print(documents[aa], ' ----- ', documents[bb])
        print(' Are similar with cos of ', sim)

Wage conflict in retail business grows  -----  Higher wage for cafeteria employees
 Are similar with cos of  0.22360679774997896
Wage conflict in retail business grows  -----  Retailing Wage Dispute Expands
 Are similar with cos of  0.22360679774997896
Higher wage for cafeteria employees  -----  Retailing Wage Dispute Expands
 Are similar with cos of  0.25
Train Crash Near Petershausen  -----  Five Deaths in Crash of Police Helicopter
 Are similar with cos of  0.22360679774997896


In [271]:
new_docs = [
    'Plane crash in Baden-Wuerttemberg',          # Doc 3a
	'The weather'                             # Doc 3b
]

vec = np.zeros(tf_vectorizer.transform(['ewr']).toarray().shape)
for doc in new_docs:
    for word in doc.split():
        vec += tf_vectorizer.transform([word]).toarray()
    similarity = tf_X.dot(vec.T)
    print(similarity)
    vec = np.zeros(tf_vectorizer.transform(['ewr']).toarray().shape)

[[0.        ]
 [0.        ]
 [0.        ]
 [0.42224214]
 [0.37410477]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [272]:
def prepare_dataset(filename):
    articles = []
    text = open(filename,'r').read().split()
    index_start = list(np.where(np.array(text)=="<DOC")[0])
    for i in range(len(index_start)-1):
        start_art = index_start[i]+2
        end_art = index_start[i+1]
        article = text[start_art:end_art]
        articles.append(article)
    return articles


In [275]:
articles = prepare_dataset('de-news.txt')

In [276]:
from gensim.models import LdaModel
from gensim import corpora

common_dictionary = corpora.Dictionary(articles)
# Transform each doc into a bag of words
common_corpus = [common_dictionary.doc2bow(a) for a in articles]
# This line is the actual training part and might take a few minutes
n_topics = 2
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=n_topics, passes=100)
# After training is done, we can check the top words of each topic
for k in range(n_topics):
	top_words = lda.show_topic(k, topn=5)


ModuleNotFoundError: No module named 'gensim'