In [1]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [2]:
import logging

# Configure root logging for the notebook. Switch the level to logging.INFO
# or logging.DEBUG for more verbose output.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

In [3]:
import nltk

# Make sure the NLTK stopword corpus is available locally, then keep the
# English stopwords in a set for the membership tests done in preprocess().
nltk.download('stopwords')
english_stopword_list = nltk.corpus.stopwords.words("english")
stopwords = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /Users/dford/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Support functions for pre-processing and calculation
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb

def preprocess(doc):
    """Clean a raw document string and return its non-stopword tokens.

    ``<img>`` tags are replaced by ``image_token``, any other HTML-style tag
    and ``[img_assist ...]`` markers are blanked out, and http(s) URLs are
    replaced by ``url_token``. The cleaned text is tokenized with gensim's
    ``simple_preprocess`` (no minimum or maximum token length), and tokens
    present in the module-level ``stopwords`` set are dropped.

    Args:
        doc: The document text as a string.

    Returns:
        A list of token strings.
    """
    # Applied in order: <img> tags must be rewritten before the generic tag
    # rule would otherwise strip them.
    substitutions = (
        (r'<img[^<>]+(>|$)', " image_token "),
        (r'<[^<>]+(>|$)', " "),
        (r'\[img_assist[^]]*?\]', " "),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
    )
    cleaned = doc
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    tokens = simple_preprocess(cleaned, min_len=0, max_len=float("inf"))
    return [token for token in tokens if token not in stopwords]

In [5]:
# Load data
#
# Every sighting is read from MongoDB in ONE pass so that titles, documents
# and ojbectIDs are guaranteed to describe the same record at the same list
# position. Running three separate find() cursors reads the collection three
# times, and without an explicit sort MongoDB does not guarantee that the
# cursors return documents in the same order.
import pymongo
client = pymongo.MongoClient()
db = client["ufo_project"]
sightings = db["ufo_sightings"]

# Use e.g. {"city": "Naperville"} to restrict the run to a subset of sightings.
sighting_filter = {}

# Projection: fetch only the fields used below ("_id" is included by default).
sighting_rows = list(sightings.find(sighting_filter, {"summary": 1, "text": 1}))

titles = [row["summary"] for row in sighting_rows]
documents = [row["text"] for row in sighting_rows]
ojbectIDs = [row["_id"] for row in sighting_rows]  # (sic) name is relied on by later cells

len(ojbectIDs)

70531

In [20]:
# Alternative query strings (commented out); swap one in to search for a different shape:
# query_string = "white capsule shaped shaped flying object with no visible propulsion and a smooth surface. It was moving erratically but would also stop suddenly and hover completely still. It was a long white ellipse or oval. It looked like a pill"
# query_string = "black cube encased by a translucent sphere. shaped like a black box performing gravity defying movement. Dark square shape moving irregularly"
# query_string = "white capsule shaped. It was moving erratically but would also stop suddenly and hover completely still. It was a long white ellipse. It looked like a white pill."
query_string = "flying black cube. Shaped like a black box. Dark square shape moving irregularly. black rectangular UFO"

# Run the query string and every document through the same preprocess() pipeline.
query = preprocess(query_string)
corpus = list(map(preprocess, documents))

In [21]:
%%time
# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# Download and/or load the GloVe word vector embeddings

if 'glove' not in locals():  # only load if not already in memory
    glove = api.load("glove-wiki-gigaword-50")
    
similarity_index = WordEmbeddingSimilarityIndex(glove)

CPU times: user 14 µs, sys: 1e+03 ns, total: 15 µs
Wall time: 18.4 µs


In [22]:
%%time

# Build the term dictionary, TF-idf model
# The search query must be in the dictionary as well, in case the terms do not overlap with the documents (we still want similarity)
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix. 
# The nonzero_limit enforces sparsity by limiting the number of non-zero terms in each column. 
# For my application, I got best results by removing the default value of 100
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)  # , nonzero_limit=None)

CPU times: user 34min 19s, sys: 1min 1s, total: 35min 21s
Wall time: 6min 37s


In [23]:
# Compute Soft Cosine Measure between the query and the documents.

# TF-IDF weighted bag-of-words for the query ...
query_bow = dictionary.doc2bow(query)
query_tf = tfidf[query_bow]

# ... and for every document in the corpus.
corpus_bows = [dictionary.doc2bow(document) for document in corpus]
index = SoftCosineSimilarity(tfidf[corpus_bows], similarity_matrix)

# Query the index; the result is indexed by document position in `corpus`.
doc_similarity_scores = index[query_tf]

  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))


In [25]:
# Output the similarity scores for top 15 documents
# np.argsort sorts ascending, so reverse it to list the highest scores first.
ascending_order = np.argsort(doc_similarity_scores)
sorted_indexes = ascending_order[::-1]
top_hits = sorted_indexes[:15]
for idx in top_hits:
    print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]}')

51870 	 0.709 	 I had just stepped outside the house after having a late dinner which was at about 08:45 p.m. and as I looked up to th
9350 	 0.705 	 Huge Dark/Black rectangular object in the sky.
14150 	 0.702 	 Saw a greyish black stretched cube with rounded corners with bright white windows on it
5883 	 0.701 	 two interrelated objects larger rectangle and rotating cube on one end changes shape, speed direction gas filled
34207 	 0.687 	 Rectangular UFO Lehigh Valley
26541 	 0.686 	 Rectangle boxes in form of a train, varrying colors. Red and white hovering dots above and on the sides. Suddenly dissapears in 1hr.
66834 	 0.680 	 Brilliant white light, morphed to clearly defined rectangular black shape, emitted misty tail, all vanished w/ no trace.
15929 	 0.674 	 White/Blue Rectangular object over Orange County CA changed shape, searchlight pointed at object.
1665 	 0.672 	 We saw a rectangular object with a row of smaller tic tac shaped lights, not sure if it was one craft or many.

In [26]:
from bson.objectid import ObjectId

In [27]:
# Spot-check: similarity score of the first document in the corpus.
doc_similarity_scores[0]

0.44143572

In [28]:
# Spot-check: the MongoDB _id stored at list position 31094.
ojbectIDs[31094]

ObjectId('6018ee0d44298851bc755a42')

In [29]:
# Sanity check: number of similarity scores; should equal len(ojbectIDs).
len(doc_similarity_scores)

70531

In [30]:
# for i in range(len(ojbectIDs)):
#     sightings.update_one({"_id":ojbectIDs[i]},
#                     {'$set': {'tictac_similarity': float(doc_similarity_scores[i])}})

In [31]:
# Persist each sighting's similarity score to MongoDB under the
# 'cubeShip_similarity' field.
#
# The updates are sent with bulk_write() so they travel in batches instead of
# one update_one() network round trip per document.

# The ids and scores are paired by position, so they must line up exactly.
assert len(ojbectIDs) == len(doc_similarity_scores), \
    "ojbectIDs and doc_similarity_scores are misaligned"

score_updates = [
    pymongo.UpdateOne(
        {"_id": object_id},
        # float(): convert the numpy scalar to a plain Python float for storage.
        {'$set': {'cubeShip_similarity': float(score)}},
    )
    for object_id, score in zip(ojbectIDs, doc_similarity_scores)
]

if score_updates:  # bulk_write() rejects an empty request list
    # ordered=False: the updates are independent of each other, so the server
    # does not need to apply them serially.
    sightings.bulk_write(score_updates, ordered=False)