# Text preprocessing with Spacy
The following notebook preprocesses the text from the dataset using the [spaCy](https://spacy.io/) library.

First, import spaCy, pandas (to load the dataset), and some utility functions.

In [2]:
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import pandas as pd
import spacy
from utils import get_counter
import json
import itertools

Set up spaCy to use the GPU (if possible), then load spaCy's fast, optimized English pipeline [en_core_web_sm](https://spacy.io/models/en#en_core_web_sm).

In [3]:
# Ask spaCy to use the GPU when one is available. This is called before
# spacy.load so that the pipeline loaded below can pick the setting up.
spacy.prefer_gpu()
# Load the small English pipeline; `nlp` is shared by the later cells
# (stop-word lookup and the comment/parent pipes).
nlp = spacy.load("en_core_web_sm")

In [4]:
# All the removable punctuation characters.
# Does NOT contain ! (exclamation mark), ? (question mark) and . (full stop).
# The backslash is escaped explicitly: a bare "\]" is an invalid escape sequence
# that only yields a backslash by accident and triggers a warning on recent Pythons.
punctuation = "#$%&'()*+,-/:;<=>@[\\]^_`{|}~" + '"'

# Preserve the excluded punctuation, collapsing repetitions into a single mark.
# Special case for the full stop: only runs of two or more are preserved, normalized to '...'
def multi_punct(word: str) -> str:
    """Normalize a token made only of '!', '?' or '.' characters.

    Args:
        word: the token text to normalize.

    Returns:
        '!' when ``word`` consists solely of exclamation marks,
        '?' when it consists solely of question marks,
        '...' when it consists of two or more full stops,
        '' when it is a single full stop,
        and ``word`` unchanged in every other case (including the empty string
        and mixed runs such as '!?').
    """
    # Compare the set of distinct characters for exact equality. A substring
    # test such as `chars in '!'` is also true for the empty string, which
    # would turn an empty token into '!'.
    distinct = set(word)
    if distinct == {'!'}:
        return '!'
    if distinct == {'?'}:
        return '?'
    if distinct == {'.'}:
        # A lone full stop is dropped; an ellipsis of any length becomes '...'.
        return '...' if len(word) >= 2 else ''
    return word

In [5]:
# Preprocess steps executed after Spacy's pipeline execution.
def spacy_process(doc):
    """Turn one processed document into a list of cleaned, lowercased tokens.

    The steps are chained lazily as generators and only materialized by the
    final list, so each document is traversed once.

    Args:
        doc: an iterable of tokens exposing a ``lemma_`` attribute (the items
            produced by ``nlp.pipe``).

    Returns:
        A list of lowercase strings: the lemmas of ``doc`` without stop words,
        removable punctuation, empty strings and digit-only tokens.
    """
    # Tokenization and lemmatization are done by the spaCy nlp pipeline.
    lemmas = (token.lemma_ for token in doc)

    # Filter out stop words, looked up in the vocabulary of the global pipeline.
    content_words = (word for word in lemmas if not nlp.vocab[word].is_stop)

    # Remove punctuation: a word is dropped when its distinct characters, in
    # order of first appearance, form a substring of `punctuation`. Surviving
    # words go through multi_punct, which normalizes runs of '!', '?' and '.'.
    # dict.fromkeys is a linear-time ordered de-duplication.
    normalized = (
        multi_punct(word)
        for word in content_words
        if ''.join(dict.fromkeys(word)) not in punctuation
    )

    # Remove empty strings and digit-only tokens. str.isdigit() is true for
    # everything str.isdecimal() accepts, plus digit characters such as
    # superscripts, so one test covers both.
    # NOTE(review): values with a separator such as '3.14' are neither digit
    # nor decimal strings and are therefore kept - confirm this is intended.
    kept = (item for item in normalized if item and not item.isdigit())

    # Lowercase everything
    return [item.lower() for item in kept]


Load the train dataset with pandas.

In [6]:
# Tab-separated training set. `names` is given without `header`, so pandas
# treats the first line of the file as data and uses these labels as columns.
train = pd.read_csv(
    '../../data/train-balanced.tsv',
    names=['isSarcastic', 'comment', 'user', 'subreddit', 'date', 'parent'],
    delimiter='\t',
)

Set up the pipelines for the comment text and the parent text.
The [DependencyParser](https://spacy.io/api/dependencyparser) and [EntityRecognizer](https://spacy.io/api/entityrecognizer) are left disabled.

In [7]:
# One pipe per text column, both created the same way: values are cast to str
# and the 'parser' and 'ner' components are disabled.
# Nothing is processed here; the pipes are consumed in the following cells.
comments_nlp, parents_nlp = (
    nlp.pipe(train[column].astype(str).values, disable=['parser', 'ner'])
    for column in ('comment', 'parent')
)

The pipelines are executed: they are implemented as a chain of iterators, which is consumed with the *list()* function.

In [8]:
# Consume the comment pipe: one list of cleaned tokens per comment.
comments = [spacy_process(doc) for doc in comments_nlp]

In [9]:
# Consume the parent pipe: one list of cleaned tokens per parent comment.
parents = [spacy_process(doc) for doc in parents_nlp]

The preprocessed data is exported as CSV. The **user** and **date** attributes are ignored.

In [10]:
# Join every token list back into a single space-separated string per row and
# put it next to the untouched subreddit and label columns.
export = pd.DataFrame(
    {
        'comment': [' '.join(tokens) for tokens in comments],
        'parent': [' '.join(tokens) for tokens in parents],
        'subreddit': train.subreddit,
        'isSarcastic': train.isSarcastic,
    }
)
# Create the output directory when it is missing, so the export does not fail
# on a fresh checkout.
os.makedirs('../../data/spacy_preprocessed', exist_ok=True)
# The DataFrame index is written as the first (unnamed) column, as is the
# default for to_csv.
export.to_csv('../../data/spacy_preprocessed/spacy_preprocessed_data.csv')

The dictionaries for the following corpora are extracted:
- comments
- parents
- comments + parents

In [14]:
# Word -> count mappings, ordered from the most to the least frequent word,
# written as one JSON file per corpus.
# get_counter (from utils) must return an object providing most_common().
# The combined corpus is processed first and the parents last, so `dictionary`
# holds the parents' dictionary once the cell has run.
os.makedirs('../../data/spacy_preprocessed', exist_ok=True)
for dictionary_path, corpus in (
    ('../../data/spacy_preprocessed/spacy_dictionary_cp.json', itertools.chain(comments, parents)),
    ('../../data/spacy_preprocessed/spacy_dictionary_comments.json', comments),
    ('../../data/spacy_preprocessed/spacy_dictionary_parents.json', parents),
):
    dictionary = dict(get_counter(corpus).most_common())
    with open(dictionary_path, 'w') as file:
        json.dump(dictionary, file)

In [13]:
# with open('../data/spacy_pre.txt', 'w') as file:
#     for sentence in itertools.chain(export.comment, export.parent):
#         file.write(str(sentence) + '\n')