<a href="https://colab.research.google.com/github/dk-wei/NLP-progress/blob/master/Text_Data_Cleaning_Medium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Dependencies

In [2]:
import re
import sklearn
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from __future__ import print_function
from tqdm.notebook import tqdm

import spacy
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load Data

In [3]:
# Fetch the 20 Newsgroups corpus through scikit-learn. The 'train' and 'test'
# subsets are kept in separate objects; the first call downloads the dataset.
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
# Inspect one raw training document (index 20) as an escaped string repr.
newsgroups_train.data[20]

'From: keith@cco.caltech.edu (Keith Allan Schneider)\nSubject: Re: <<Pompous ass\nOrganization: California Institute of Technology, Pasadena\nLines: 16\nNNTP-Posting-Host: punisher.caltech.edu\n\nlivesey@solntze.wpd.sgi.com (Jon Livesey) writes:\n\n[...]\n>>The "`little\' things" above were in reference to Germany, clearly.  People\n>>said that there were similar things in Germany, but no one could name any.\n>That\'s not true.  I gave you two examples.  One was the rather\n>pevasive anti-semitism in German Christianity well before Hitler\n>arrived.  The other was the system of social ranks that were used\n>in Imperail Germany and Austria to distinguish Jews from the rest \n>of the population.\n\nThese don\'t seem like "little things" to me.  At least, they are orders\nworse than the motto.  Do you think that the motto is a "little thing"\nthat will lead to worse things?\n\nkeith\n'

In [5]:
# Keep the raw text of document 20 so it can be compared with its cleaned version later.
compared_text = newsgroups_train.data[20]

In [6]:
# Print the same document so newlines and quotes are rendered instead of escaped.
print(newsgroups_train.data[20])

From: keith@cco.caltech.edu (Keith Allan Schneider)
Subject: Re: <<Pompous ass
Organization: California Institute of Technology, Pasadena
Lines: 16
NNTP-Posting-Host: punisher.caltech.edu

livesey@solntze.wpd.sgi.com (Jon Livesey) writes:

[...]
>>The "`little' things" above were in reference to Germany, clearly.  People
>>said that there were similar things in Germany, but no one could name any.
>That's not true.  I gave you two examples.  One was the rather
>pevasive anti-semitism in German Christianity well before Hitler
>arrived.  The other was the system of social ranks that were used
>in Imperail Germany and Austria to distinguish Jews from the rest 
>of the population.

These don't seem like "little things" to me.  At least, they are orders
worse than the motto.  Do you think that the motto is a "little thing"
that will lead to worse things?

keith



# Build Stopwords Dictionary

In [7]:
# Project-specific adjustments to the NLTK English stop-word list.
extra_stopwords_black = {'n/a', 'unspecified', "subject"}    # extra tokens to treat as stop words
extra_stopwords_white = {'build'}                            # tokens that must never be treated as stop words

# NLTK's English stop words, extended in place with every ASCII punctuation character.
punctuation_string = list(string.punctuation)
base_stop_words = stopwords.words("english")
base_stop_words += punctuation_string

# Final lookup set: (base words + punctuation + blacklist) minus the whitelist.
stopwords_eng = (set(base_stop_words) | extra_stopwords_black) - extra_stopwords_white


In [8]:
#stopwords_eng

# Customize the `spaCy` tokenizer to keep hyphenated words intact

Source of code: [Wiktor Stribiżew](https://stackoverflow.com/questions/55241927/spacy-intra-word-hyphens-how-to-treat-them-one-word)

In [9]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# Load spaCy's "en_core_web_sm" pipeline; its tokenizer is replaced further down in this cell.
nlp = spacy.load("en_core_web_sm")

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` that does not split on intra-word hyphens.

    The pipeline's default infix patterns are copied and adjusted:

    * the generic "operator between digits" rule is removed and replaced by
      two narrower rules, so a single hyphen between digits no longer splits;
    * any pattern containing the hyphen/dash alternation used between
      letters is dropped.

    Prefix search, suffix search and ``token_match`` are taken from the
    pipeline's current tokenizer; the exception rules come from
    ``nlp.Defaults.tokenizer_exceptions``.

    Args:
        nlp: A loaded spaCy language pipeline.

    Returns:
        A new ``Tokenizer`` bound to ``nlp.vocab``.

    Raises:
        ValueError: If the generic digit-operator infix pattern is not among
            ``nlp.Defaults.infixes`` (``list.remove`` fails).
    """
    patterns = list(nlp.Defaults.infixes)

    # Drop the generic operator rule (between numbers, or a number and a '-').
    patterns.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")

    # Re-add it without the digit-hyphen-digit case: '+', '*', '^' still split,
    # and '-' only splits when it is directly followed by another '-'.
    patterns.extend([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])

    # Remove the rule that splits on hyphens/dashes between letters.
    patterns = [pattern for pattern in patterns if '-|–|—|--|---|——|~' not in pattern]

    infix_re = compile_infix_regex(patterns)
    current = nlp.tokenizer

    return Tokenizer(
        nlp.vocab,
        prefix_search=current.prefix_search,
        suffix_search=current.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=current.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Replace the pipeline's tokenizer with the one returned by custom_tokenizer.
nlp.tokenizer = custom_tokenizer(nlp)

After customization, `spaCy` no longer splits tokens on intra-word hyphens.

In [10]:
# Tokenize a sentence containing hyphenated words and a hyphenated score.
doc = nlp('The Indo-European Caucus won the all-male election 58-32.')

# Show the surface form of every token the pipeline produced.
print([tok.text for tok in doc])

['The', 'Indo-European', 'Caucus', 'won', 'the', 'all-male', 'election', '58-32', '.']


# Data Cleaning

In [11]:
def text_process(text_string, stop_words=None):
  """Clean raw text into a single string of lower-cased, space-separated tokens.

  Steps, in order:
    1. Replace '/', ',', '?', '!' and ':' with spaces.
    2. Split on whitespace and lower-case every token.
    3. Drop tokens that contain no alphanumeric character (e.g. '[...]').
    4. Strip wrapping punctuation from both ends of each token.
    5. Drop stop words.

  Args:
    text_string: Raw text to clean.
    stop_words: Optional collection of tokens to remove. When omitted, the
      notebook-level ``stopwords_eng`` set is used.

  Returns:
    The remaining tokens joined by single spaces ('' if nothing remains).
  """
  if stop_words is None:
    stop_words = stopwords_eng

  for separator in '/,?!:':
    text_string = text_string.replace(separator, ' ')

  # str.split() with no argument splits on any run of whitespace and never
  # yields empty strings, so no regex (or empty-token filter) is needed.
  tokens = [token.lower() for token in text_string.split()]

  # Keep only tokens that carry at least one letter or digit.
  tokens = [token for token in tokens if any(ch.isalnum() for ch in token)]

  # Strip each side in a single pass so the result does not depend on the
  # order in which punctuation is nested (e.g. '"stop."' -> 'stop').
  # ',' and ':' cannot occur here because they were replaced by spaces above.
  leading_chars = '(<>"\'-'
  trailing_chars = '.)>"\'-'
  tokens = [token.lstrip(leading_chars).rstrip(trailing_chars) for token in tokens]

  # Optional lemmatization (not applied): run the joined string through the
  # spaCy pipeline and join token.lemma_ for tokens not in stop_words.
  return ' '.join(token for token in tokens if token not in stop_words)

In [12]:
# Clean the sample document (index 20) and display the resulting string.
text_process(newsgroups_train.data[20])

"keith@cco.caltech.edu keith allan schneider pompous ass organization california institute technology pasadena lines 16 nntp-posting-host punisher.caltech.edu livesey@solntze.wpd.sgi.com jon livesey writes `little things reference germany clearly people said similar things germany one could name that's true gave two examples one rather pevasive anti-semitism german christianity well hitler arrived system social ranks used imperail germany austria distinguish jews rest population seem like little things least orders worse motto think motto little thing lead worse things keith"

In [13]:
# Display the raw text saved earlier, for comparison with the cleaned string.
compared_text

'From: keith@cco.caltech.edu (Keith Allan Schneider)\nSubject: Re: <<Pompous ass\nOrganization: California Institute of Technology, Pasadena\nLines: 16\nNNTP-Posting-Host: punisher.caltech.edu\n\nlivesey@solntze.wpd.sgi.com (Jon Livesey) writes:\n\n[...]\n>>The "`little\' things" above were in reference to Germany, clearly.  People\n>>said that there were similar things in Germany, but no one could name any.\n>That\'s not true.  I gave you two examples.  One was the rather\n>pevasive anti-semitism in German Christianity well before Hitler\n>arrived.  The other was the system of social ranks that were used\n>in Imperail Germany and Austria to distinguish Jews from the rest \n>of the population.\n\nThese don\'t seem like "little things" to me.  At least, they are orders\nworse than the motto.  Do you think that the motto is a "little thing"\nthat will lead to worse things?\n\nkeith\n'