In [1]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch

# Pretrained sentence-embedding model used for both the corpus and the queries.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Corpus with example sentences
corpus = ['I feel creepy.',
          'I feel scaried.',
          'I feel chilling.',
          'I feel terrifying.',
          'I feel frightening.'
          ]
# Encode the whole corpus once, up front, so each query only needs one extra encode call.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A cheetah chases prey on across a field.', 'A horrible curse befell my girlfriend and now she can only eat human meat.', "Apple tress can grow as tall as 20 feet."]


# Find the closest sentences of the corpus for each query sentence based on cosine similarity.
# torch.topk fails when k is larger than the number of scores, so never ask for
# more hits than the corpus contains.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # Row 0 of the similarity matrix holds this query's score against every corpus sentence.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns the highest scores together with their corpus positions.
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # The header is derived from top_k so it stays truthful for small corpora.
    print("\nTop %d most similar sentences in corpus:" % top_k)

    for score, idx in zip(top_results.values, top_results.indices):
        print(corpus[idx], "(Score: %.4f)" % (score))





Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
I feel creepy. (Score: 0.0888)
I feel terrifying. (Score: 0.0292)
I feel frightening. (Score: 0.0197)
I feel chilling. (Score: -0.0419)
I feel scaried. (Score: -0.0801)




Query: A horrible curse befell my girlfriend and now she can only eat human meat.

Top 5 most similar sentences in corpus:
I feel terrifying. (Score: 0.2653)
I feel scaried. (Score: 0.1889)
I feel creepy. (Score: 0.1667)
I feel frightening. (Score: 0.1488)
I feel chilling. (Score: 0.0610)




Query: Apple tress can grow as tall as 20 feet.

Top 5 most similar sentences in corpus:
I feel frightening. (Score: -0.0347)
I feel terrifying. (Score: -0.0469)
I feel scaried. (Score: -0.0677)
I feel chilling. (Score: -0.0813)
I feel creepy. (Score: -0.1176)


In [2]:
# All imports for the cells below, kept together ahead of any executable statement.
import tensorflow as tf
import pandas as pd
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from tensorflow import keras

# Sentence-embedding model (same checkpoint name as the semantic-search cell above).
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [None]:
def clean_data(csv_in):
    """Load a submissions CSV and return a cleaned copy of its text columns.

    Processing steps, in order:

    1. Keep only the ``id``, ``title``, ``selftext`` and ``score`` columns.
    2. Drop rows whose ``selftext`` is ``'[removed]'``, ``'[deleted]'`` or missing.
    3. Strip URLs from ``selftext``.
    4. Unescape HTML entities (e.g. ``&amp;``) and strip markup from ``selftext``.

    Parameters
    ----------
    csv_in : str or path-like
        Anything ``pandas.read_csv`` accepts. The file must contain the four
        columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four columns. Surviving rows keep their original
        index labels (the index is not reset).
    """
    # Imported once per call instead of once per row inside the helpers.
    import re
    from html import unescape

    from bs4 import BeautifulSoup

    csv_read = pd.read_csv(csv_in)

    # One boolean mask replaces the three successive filters; the explicit
    # .copy() makes the result an independent frame, so the column assignment
    # below does not write into a slice of csv_read (SettingWithCopyWarning).
    has_text = csv_read.selftext.notna() & ~csv_read.selftext.isin(['[removed]', '[deleted]'])
    csv = csv_read.loc[has_text, ['id', 'title', 'selftext', 'score']].copy()

    # URL-matching regex written by kerim, taken from:
    # https://stackoverflow.com/questions/14081050/remove-all-forms-of-urls-from-a-given-string-in-python
    # Compiled once here rather than being looked up again for every row.
    url_pattern = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', flags=re.MULTILINE)

    def remove_urls(row):
        """Return ``row`` with every URL match deleted."""
        return url_pattern.sub('', row)

    def clean_html(row):
        """Unescape HTML entities in ``row`` and return only the text content."""
        return BeautifulSoup(unescape(row), 'lxml').text

    # URLs are removed first, then entities/markup, matching the original order.
    csv['selftext'] = csv['selftext'].apply(remove_urls).apply(clean_html)

    return csv

# Run the cleaning pipeline on the 2020 nosleep CSV export. The path is relative,
# so it resolves against the notebook's working directory.
nosleep2020 = clean_data('./Creepy Data/NoSleep/NoSleep/RS_2020_nosleep.csv')

In [88]:
# Spot-check a single cleaned row. iloc is positional: this is the 46th surviving
# row, whose index label differs because clean_data does not reset the index.
nosleep2020.iloc[45,:]

id                                                     gxk0ml
title                                  Half Rock cave part 2.
selftext    [Part 1]()\n\n*"Jaaaaacckkk…"*\n\n"Demetri?" I...
score                                                       1
Name: 61, dtype: object

In [89]:
import re
# Scratch check: apply the same URL-stripping pattern that clean_data's remove_urls
# uses to one post and display the result.
text = nosleep2020.iloc[45,:].selftext
re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', text, flags=re.MULTILINE)



In [90]:
# Write the cleaned frame to test.xlsx in the working directory
# (presumably for manual inspection — confirm this file is still needed).
nosleep2020.to_excel('test.xlsx')

In [7]:
from bs4 import BeautifulSoup
from html import unescape

# NOTE(review): str() of a Series is its printed summary (index labels, truncated
# values and the Name/Length/dtype footer), so this unescapes and prints that
# summary rather than the full post texts — see the recorded output. To clean
# every post, apply the unescape/parse step per row, as clean_data does.
soup = BeautifulSoup(unescape(str(nosleep2020.selftext)), 'lxml')
print(soup.text)

0        This is the only rule of our household. If you...
1        This is the only rule of our household. If you...
3        It is hard for me to talk about my old friend ...
5        They say the devil is in the details.  Well th...
6        “Any sign of ‘em yet?” \n\nI continued staring...
                               ...                        
21218    *There is no cure for trauma. Once it enters t...
21219    I knew Persephone would need time to adjust, b...
21221    This isnt much, but this is surely the first u...
21223    Okay. for context, this story started about a ...
21225    ​\n\nI  was never able to find love...
Name: selftext, Length: 15487, dtype: object


In [6]:
# Load a spaCy English pipeline and parse one post; a later cell iterates over
# doc.sents to embed the post sentence by sentence.
# NOTE(review): the recorded run failed with E050 because the 'en_core_web_lg'
# model package was not installed in that environment.
# NOTE(review): `creepy` is not defined in any cell visible here — presumably the
# cleaned frame (`nosleep2020`) was intended; confirm before re-running.
nlp = spacy.load('en_core_web_lg')
doc = nlp(creepy.selftext[1])

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [31]:
# Embed every sentence of the parsed post in one batched call. Passing a list to
# encode() yields a 2-D array with one row per sentence, which replaces growing
# `x` with np.append (a full copy of the array on every iteration).
# NOTE(review): batching may change embedding values at floating-point-noise
# level compared with encoding sentences one at a time — confirm acceptable.
sentences = [sent.text for sent in doc.sents]
x = embedder.encode(sentences)

In [3]:
# Display the sentence-embedding matrix built by the embedding cell.
# NOTE(review): the recorded NameError presumably comes from running this cell
# before the one that assigns `x` (execution counts are out of order) — confirm.
x

NameError: name 'x' is not defined

In [None]:
# Unfinished model stub: a Sequential model containing a single LSTM layer.
# NOTE(review): LSTM() is called with no arguments; Keras' LSTM layer presumably
# needs at least the number of units, so this cell is expected to fail as
# written — confirm and fill in the layer size before running.
model = keras.models.Sequential([
    keras.layers.LSTM()
])

In [34]:
# Sample a random float32 tensor of shape (32, 10, 8).
# NOTE(review): presumably a stand-in (batch, timesteps, features) input for the
# LSTM experiment — confirm. No seed is set, so values differ on every run.
tf.random.normal([32, 10, 8])

<tf.Tensor: shape=(32, 10, 8), dtype=float32, numpy=
array([[[-0.8919805 , -1.4307563 ,  0.15166256, ..., -1.7649478 ,
         -1.021099  , -0.02627978],
        [-1.4783181 ,  0.08943315,  0.17854673, ..., -0.19954081,
          0.83018124, -0.5244327 ],
        [ 0.4220114 ,  0.46339417, -0.79892176, ..., -0.19948697,
          1.6006763 ,  1.775066  ],
        ...,
        [ 0.06492849, -1.7639278 , -0.27689138, ..., -0.0975803 ,
         -0.55004656, -0.36803925],
        [ 0.21676254,  0.46872047, -0.4581078 , ...,  0.57250595,
          0.94552565,  0.69256175],
        [-0.21706009,  1.0176384 , -0.29759172, ...,  0.10234821,
         -0.7302468 , -0.10730033]],

       [[ 0.70369136, -1.492371  , -0.00349201, ...,  0.27868918,
          0.00752653, -1.2992821 ],
        [-0.6542289 ,  1.655241  ,  0.5115025 , ...,  0.9593869 ,
          0.02697794, -0.33769584],
        [-0.6231138 ,  2.0751064 , -0.18052201, ..., -0.32436767,
         -1.58487   ,  0.23079748],
        ...,
 

In [55]:
import tensorflow_datasets as tfds

# Fetch the IMDB reviews dataset through TensorFlow Datasets. The call returns
# the dataset splits together with the dataset metadata object.
datasets, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,  # the training loop below unpacks each element as a pair
    with_info=True,      # needed for the second return value (`info`)
)

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: imdb_reviews/plain_text/1.0.0
INFO:absl:Load dataset info from /var/folders/xg/vnbr1nbs5pv86wd58cwl_mxm0000gn/T/tmpnja58or5tfds
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Generating dataset imdb_reviews (/Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)


[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(HTML(value='Dl Completed...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='…

HBox(children=(HTML(value='Dl Size...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'…

INFO:absl:URL http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz already downloaded: reusing /Users/anthony/tensorflow_datasets/downloads/ai.stanfor.edu_amaas_sentime_aclImdb_v1PaujRp-TxjBWz59jHXsMDm5WiexbxzaFQkEnXc3Tvo8.tar.gz.
INFO:absl:Generating split train








HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Shuffling and writing examples to /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-train.tfrecord


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))

INFO:absl:Done writing /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-train.tfrecord. Shard lengths: [25000]
INFO:absl:Generating split test


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Shuffling and writing examples to /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-test.tfrecord


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))

INFO:absl:Done writing /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-test.tfrecord. Shard lengths: [25000]
INFO:absl:Generating split unsupervised


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Shuffling and writing examples to /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-unsupervised.tfrecord


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

INFO:absl:Done writing /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSM226Q/imdb_reviews-unsupervised.tfrecord. Shard lengths: [50000]
INFO:absl:Skipping computing stats for mode ComputeStatsMode.SKIP.
INFO:absl:Constructing tf.data.Dataset for split None, from /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


[1mDataset imdb_reviews downloaded and prepared to /Users/anthony/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [58]:
# Number of examples in the 'train' split, read from the dataset metadata.
train_size = info.splits['train'].num_examples

In [60]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Each review is cut down with ``tf.strings.substr(pos=0, len=300)``, ``<br>``
    tags and every character other than ASCII letters and apostrophes are
    replaced by spaces, and the text is split into tokens. The ragged result is
    converted to a dense tensor, padding short rows with ``b"<pad>"``.

    Parameters
    ----------
    X_batch : string tensor holding the review texts.
    y_batch : labels for the batch; returned unchanged.

    Returns
    -------
    tuple
        ``(padded_tokens, y_batch)``.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Line-break tags go first, before the letters-only filter would mangle them.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [61]:
from collections import Counter
# Count how often each token occurs across the training split, in batches of 32.
# preprocess pads every row with b"<pad>", so the padding token is counted too.
vocabulary = Counter()
train_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in train_batches:
    for review in X_batch:
        # Counter.update consumes the row's tokens directly from the NumPy array.
        vocabulary.update(review.numpy())

In [68]:
# Keep only the `vocab_size` most frequent tokens, most common first.
# most_common(n) returns the same leading entries as most_common()[:n] without
# sorting the entire vocabulary; the count is not needed, hence `_`.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]