## Sentence-transformers test

In [1]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch

# Load the sentence-embedding model by name; it encodes both corpus and queries.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Corpus with example sentences
corpus = ['I feel creepy.',
          'I feel scaried.',
          'I feel chilling.',
          'I feel terrifying.',
          'I feel frightening.'
          ]
# Encode the corpus once up front; every query below is scored against it.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A cheetah chases prey on across a field.', 'A horrible curse befell my girlfriend and now she can only eat human meat.', "Apple tress can grow as tall as 20 feet."]


# Find the closest sentences of the corpus for each query sentence based on cosine similarity.
# At most 5 are reported; k is clamped to the corpus size because torch.topk
# raises if k is larger than the number of scores it is given.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # pytorch_cos_sim returns a 2-D score matrix; row 0 belongs to the single query.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk gives the top_k highest scores together with their corpus indices
    top_scores, top_indices = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Header follows top_k so it stays truthful when the corpus has fewer than 5 sentences.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    for score, idx in zip(top_scores, top_indices):
        print(corpus[idx], "(Score: %.4f)" % (score))





Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
I feel creepy. (Score: 0.0888)
I feel terrifying. (Score: 0.0292)
I feel frightening. (Score: 0.0197)
I feel chilling. (Score: -0.0419)
I feel scaried. (Score: -0.0801)




Query: A horrible curse befell my girlfriend and now she can only eat human meat.

Top 5 most similar sentences in corpus:
I feel terrifying. (Score: 0.2653)
I feel scaried. (Score: 0.1889)
I feel creepy. (Score: 0.1667)
I feel frightening. (Score: 0.1488)
I feel chilling. (Score: 0.0610)




Query: Apple tress can grow as tall as 20 feet.

Top 5 most similar sentences in corpus:
I feel frightening. (Score: -0.0347)
I feel terrifying. (Score: -0.0469)
I feel scaried. (Score: -0.0677)
I feel chilling. (Score: -0.0813)
I feel creepy. (Score: -0.1176)


In [1]:
from creepy_module import clean_data, clean_comments

## Iterate over all directories

In [2]:
'''
    (creepyvenv) Creepy Data> tree
    Folder PATH listing
    Volume serial number is 7250-E7CF
    C:.
    ├───Confession
    │   └───Confession
    ├───Confessions
    │   ├───Confessions
    │   └───__MACOSX
    │       └───Confessions
    ├───CreepyPasta
    │   ├───CreepyPasta
    │   └───__MACOSX
    │       └───CreepyPasta
    ├───NoSleep
    │   ├───NoSleep
    │   └───__MACOSX
    │       └───NoSleep
    ├───Self
    │   ├───Self
    │   └───__MACOSX
    │       └───Self
    ├───ShortScaryStories
    │   ├───ShortScaryStories
    │   └───__MACOSX
    │       └───ShortScaryStories
    ├───StoriesAboutKevin
    │   └───StoriesAboutKevin
    └───TIFU
        ├───TIFU
        └───__MACOSX
            └───TIFU
'''

'\n    (creepyvenv) Creepy Data> tree\n    Folder PATH listing\n    Volume serial number is 7250-E7CF\n    C:.\n    ├───Confession\n    │   └───Confession\n    ├───Confessions\n    │   ├───Confessions\n    │   └───__MACOSX\n    │       └───Confessions\n    ├───CreepyPasta\n    │   ├───CreepyPasta\n    │   └───__MACOSX\n    │       └───CreepyPasta\n    ├───NoSleep\n    │   ├───NoSleep\n    │   └───__MACOSX\n    │       └───NoSleep\n    ├───Self\n    │   ├───Self\n    │   └───__MACOSX\n    │       └───Self\n    ├───ShortScaryStories\n    │   ├───ShortScaryStories\n    │   └───__MACOSX\n    │       └───ShortScaryStories\n    ├───StoriesAboutKevin\n    │   └───StoriesAboutKevin\n    └───TIFU\n        ├───TIFU\n        └───__MACOSX\n            └───TIFU\n'

I renamed several files to make the code simpler, and I also deleted the unrelated folders. Now the directory tree looks like this:

In [3]:
'''
    (creepyvenv) Creepy Data> tree
    Folder PATH listing
    Volume serial number is 7250-E7CF
    C:.
    ├───Confession
    │   └───Confession
    ├───Confessions
    │   └───Confessions
    ├───CreepyPasta
    │   └───CreepyPasta
    ├───NoSleep
    │   └───NoSleep
    ├───Self
    │   └───Self
    ├───ShortScaryStories
    │   └───ShortScaryStories
    │       └───.ipynb_checkpoints
    ├───StoriesAboutKevin
    │   └───StoriesAboutKevin
    └───TIFU
        └───TIFU
'''

'\n    (creepyvenv) Creepy Data> tree\n    Folder PATH listing\n    Volume serial number is 7250-E7CF\n    C:.\n    ├───Confession\n    │   └───Confession\n    ├───Confessions\n    │   └───Confessions\n    ├───CreepyPasta\n    │   └───CreepyPasta\n    ├───NoSleep\n    │   └───NoSleep\n    ├───Self\n    │   └───Self\n    ├───ShortScaryStories\n    │   └───ShortScaryStories\n    │       └───.ipynb_checkpoints\n    ├───StoriesAboutKevin\n    │   └───StoriesAboutKevin\n    └───TIFU\n        └───TIFU\n'

In [4]:
import os

# Root folder of the raw per-subreddit CSV exports, and the folder that
# receives the cleaned pickles (one sub-folder per subreddit).
PATH = '../Creepy Data/'
OUTPUT_PATH = '../pickles/'

# os.listdir returns entries in arbitrary order; sort so every run processes
# the subreddits in the same sequence.
for subreddit in sorted(os.listdir(PATH)):
    # I have done nosleep, delete it in the future.
    if subreddit == 'NoSleep':
        continue

    # Only real subreddit folders contain the expected CSVs. Skip stray files
    # and hidden folders (e.g. .ipynb_checkpoints) instead of failing on a
    # CSV path that does not exist.
    if subreddit.startswith('.') or not os.path.isdir(os.path.join(PATH, subreddit)):
        continue

    subreddit_lower = subreddit.lower()
    source_dir = f"{PATH + subreddit}/{subreddit}"   # data sits in a same-named nested folder
    target_dir = f"{OUTPUT_PATH + subreddit}"

    # exist_ok=True makes re-runs safe without a try/except, while still
    # raising if the target path exists but is not a directory.
    os.makedirs(target_dir, exist_ok=True)

    print(f'''
    -----------
    {subreddit}
    -----------
    ''')

    # Data: clean the submissions CSV and persist the result.
    # NOTE(review): clean_data/clean_comments come from creepy_module; their
    # results are presumably pandas DataFrames (they expose .to_pickle) - confirm.
    subreddit_csv = clean_data(f"{source_dir}/RS_2020_{subreddit_lower}.csv")
    subreddit_csv.to_pickle(f"{target_dir}/RS_2020_{subreddit_lower}.pickle")

    # Comments: same treatment for the comments CSV.
    subreddit_comments_csv = clean_comments(f"{source_dir}/{subreddit_lower}_comments.csv")
    subreddit_comments_csv.to_pickle(f"{target_dir}/{subreddit_lower}_comments.pickle")


    -----------
    Confession
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=208730.0), HTML(value=''))…




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=208730.0), HTML(value=''))…


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=1433364.0), HTML(value='…



    -----------
    Confessions
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=23476.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=23476.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=23476.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=795962.0), HTML(value=''…



    -----------
    CreepyPasta
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=2481.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=2481.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=2481.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=72358.0), HTML(value='')…



    -----------
    Self
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=18055.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=18055.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=18055.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=883557.0), HTML(value=''…



    -----------
    ShortScaryStories
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=55176.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=55176.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=218171.0), HTML(value=''…



    -----------
    StoriesAboutKevin
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=3421.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=3421.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=3421.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=22796.0), HTML(value='')…



    -----------
    TIFU
    -----------
    
Reading data...
Done
Removing nonexisting rows...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning titles'), FloatProgress(value=0.0, max=17189.0), HTML(value='')))




HBox(children=(HTML(value='Cleaning posts'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting title language'), FloatProgress(value=0.0, max=17189.0), HTML(value='')))




HBox(children=(HTML(value='Detecting posts language'), FloatProgress(value=0.0, max=17189.0), HTML(value='')))


Reading data...
Done
Removing nonexisting rows...
Done
Filtering indirect comments...
Done
Removing links...
Done


HBox(children=(HTML(value='Cleaning comments'), FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(HTML(value='Detecting comment language'), FloatProgress(value=0.0, max=3022035.0), HTML(value='…




Now the pickles directory looks like this:

In [7]:
'''
(creepyvenv) PS pickles> tree /f
Folder PATH listing
Volume serial number is 7250-E7CF
C:.
├───.ipynb_checkpoints
├───Confession
│       confession_comments.pickle
│       RS_2020_confession.pickle
│
├───Confessions
│       confessions_comments.pickle
│       RS_2020_confessions.pickle
│
├───CreepyPasta
│       creepypasta_comments.pickle
│       RS_2020_creepypasta.pickle
│
├───NoSleep
│   │   nosleep_comments.pickle
│   │   RS_2020_nosleep.pickle
│   │
│   └───.ipynb_checkpoints
├───Self
│       RS_2020_self.pickle
│       self_comments.pickle
│
├───ShortScaryStories
│       RS_2020_shortscarystories.pickle
│       shortscarystories_comments.pickle
│
├───StoriesAboutKevin
│       RS_2020_storiesaboutkevin.pickle
│       storiesaboutkevin_comments.pickle
│
└───TIFU
        RS_2020_tifu.pickle
        tifu_comments.pickle
'''

'\n(creepyvenv) PS pickles> tree /f\nFolder PATH listing\nVolume serial number is 7250-E7CF\nC:.\n├───.ipynb_checkpoints\n├───Confession\n│       confession_comments.pickle\n│       RS_2020_confession.pickle\n│\n├───Confessions\n│       confessions_comments.pickle\n│       RS_2020_confessions.pickle\n│\n├───CreepyPasta\n│       creepypasta_comments.pickle\n│       RS_2020_creepypasta.pickle\n│\n├───NoSleep\n│   │   nosleep_comments.pickle\n│   │   RS_2020_nosleep.pickle\n│   │\n│   └───.ipynb_checkpoints\n├───Self\n│       RS_2020_self.pickle\n│       self_comments.pickle\n│\n├───ShortScaryStories\n│       RS_2020_shortscarystories.pickle\n│       shortscarystories_comments.pickle\n│\n├───StoriesAboutKevin\n│       RS_2020_storiesaboutkevin.pickle\n│       storiesaboutkevin_comments.pickle\n│\n└───TIFU\n        RS_2020_tifu.pickle\n        tifu_comments.pickle\n'