In [13]:
import textacy
from textacy import preprocessing

Preprocessing

In [2]:
# Sample text for the preprocessing demos below: it contains punctuation,
# "$" currency symbols and a URL, which the following cells strip or replace.
raw_text = """ The best programs, are the ones written when the programmer is supposed to be working on something else.Mike bought the book for $50 although in Paris it will cost $30 dollars.
Don’t document the problem, fix it.This is from https://twitter.com/codewisdom?lang=en. """

In [14]:
# Strip punctuation from the sample text. In the saved output each removed mark
# becomes a space, while "$" and "=" are left in place.
preprocessing.remove.punctuation(raw_text)

' The best programs  are the ones written when the programmer is supposed to be working on something else Mike bought the book for $50 although in Paris it will cost $30 dollars \nDon t document the problem  fix it This is from https   twitter com codewisdom lang=en  '

In [15]:
# Swap every URL in the sample text for the placeholder "TWITTER".
preprocessing.replace.urls(raw_text, repl="TWITTER")

' The best programs, are the ones written when the programmer is supposed to be working on something else.Mike bought the book for $50 although in Paris it will cost $30 dollars.\nDon’t document the problem, fix it.This is from TWITTER '

In [16]:
# Swap each currency symbol for the placeholder "USD"; the digits that follow
# are kept (the saved output shows "$50" becoming "USD50").
preprocessing.replace.currency_symbols(raw_text, repl="USD")

' The best programs, are the ones written when the programmer is supposed to be working on something else.Mike bought the book for USD50 although in Paris it will cost USD30 dollars.\nDon’t document the problem, fix it.This is from https://twitter.com/codewisdom?lang=en. '

In [17]:
# Small HTML snippet (doctype, an empty <style> element and one bold sentence)
# used as input for the tag-stripping demo in the next cell.
html = """
<!DOCTYPE html>
<html>
<head>
<style>

</style>
<b>The brow fox jumps</b>
"""

In [18]:
# Drop all markup from the snippet; per the saved output only the sentence
# inside <b>...</b> remains.
preprocessing.remove.html_tags(html)

'The brow fox jumps'

In [20]:
# Chain three replacement steps into a single callable; calling it on a string
# returns the cleaned string.
preproc = preprocessing.make_pipeline(
    preprocessing.replace.hashtags,
    preprocessing.replace.user_handles,
    preprocessing.replace.emojis,
)

# Sample tweet with two user handles and one emoji (no hashtags).
sample_tweet = "@spacy_io is OSS for industrial-strength NLP in Python developed by @explosion_ai 💥"
preproc(sample_tweet)

'_USER_ is OSS for industrial-strength NLP in Python developed by _USER_ _EMOJI_'

Similarity

In [21]:
# Hamming similarity of two 8-character strings that differ only in the last
# character (saved result: 0.875, i.e. 7 of 8 positions match).
textacy.similarity.hamming("String 1", "String 2")

0.875

In [22]:
# Levenshtein similarity of two strings of different length (saved result: 0.5).
textacy.similarity.levenshtein("Str 1", "String 2")

0.5

In [23]:
# Jaccard similarity. With plain strings as input the saved score (4/9) matches
# a comparison of the two strings' character sets: 4 shared out of 9 distinct.
textacy.similarity.jaccard("Str 1", "String 2")

0.4444444444444444

In [24]:
# Cosine similarity. With plain strings as input the saved score (~0.632)
# matches 4 shared characters over sqrt(5 * 8) distinct characters per string.
textacy.similarity.cosine("Str 1", "String 2")

0.6324555320336759

Stats

In [28]:
from textacy import text_stats 
from textacy import datasets

In [30]:
# Take the first text from the Capitol Words dataset and parse it with the
# small English spaCy pipeline.
text = next(datasets.CapitolWords().texts(limit=1))
doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")

# NOTE(review): the saved output of this cell contains a `utils.deprecated(`
# warning, presumably raised by TextStats -- confirm against the installed
# textacy version and consider migrating to its replacement API.
ts = text_stats.TextStats(doc)
ts.n_words

  utils.deprecated(


137

In [31]:
# Count of unique words in the parsed doc (81 in the saved output, out of 137 words).
ts.n_unique_words

81

In [32]:
# Character count of every word, in document order.
# NOTE(review): this displays one value per word (137 here); consider slicing
# the result, e.g. [:10], to keep the notebook output short.
ts.n_chars_per_word

(3,
 7,
 7,
 7,
 9,
 3,
 7,
 7,
 3,
 1,
 4,
 2,
 11,
 9,
 7,
 7,
 9,
 3,
 3,
 7,
 3,
 4,
 4,
 2,
 4,
 9,
 3,
 2,
 5,
 7,
 4,
 9,
 2,
 3,
 8,
 2,
 4,
 3,
 9,
 11,
 2,
 4,
 3,
 7,
 4,
 2,
 9,
 2,
 5,
 7,
 2,
 7,
 2,
 5,
 3,
 4,
 2,
 9,
 11,
 4,
 8,
 3,
 3,
 10,
 10,
 4,
 3,
 8,
 2,
 4,
 3,
 5,
 3,
 3,
 8,
 6,
 7,
 5,
 4,
 4,
 5,
 10,
 1,
 4,
 8,
 6,
 4,
 3,
 9,
 4,
 7,
 3,
 8,
 3,
 3,
 10,
 10,
 4,
 4,
 7,
 4,
 3,
 3,
 6,
 6,
 3,
 4,
 1,
 10,
 10,
 3,
 3,
 2,
 6,
 10,
 3,
 7,
 4,
 2,
 4,
 3,
 8,
 6,
 4,
 4,
 2,
 4,
 4,
 4,
 3,
 4,
 2,
 4,
 4,
 4,
 4,
 2)

Data Augmentation

In [35]:
from textacy import augmentation

In [36]:
# Transforms the augmenter may apply, in this order.
tfs = [
    augmentation.transforms.delete_words,
    augmentation.transforms.swap_chars,
    augmentation.transforms.delete_chars,
]

In [38]:
# Build the augmenter from the transform list.
# NOTE(review): `num` has one float per entry in `tfs`; presumably each is the
# probability of applying the transform at the same position -- confirm
# against the Augmenter documentation.
augmenter = augmentation.Augmenter(tfs, num=[0.4, 0.8, 0.6])

In [39]:
import random

# Augmentation is stochastic: which transforms run and which words/characters
# they touch are chosen at random. textacy's augmentation code draws from the
# standard-library `random` module, so seeding it here makes this cell produce
# the same augmented text on every Restart & Run All.
random.seed(42)

# Returns a new, augmented doc; the input `doc` is left unchanged.
augmenter.apply_transforms(doc, lang="en_core_web_sm")

Mr. Speaker, 480,000 Federal employees are working without apy, a form of involuntay servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, cihldren to fed, and financial obligations to meet.
Mr. Speaker, what is happenign to these workers is immoral, is wrong, an must be rectified immediately. Nwet Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they psh their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Sentaor Dole and the entire Senate and pass a continuing resolution now, now to reopn Government.
Mr. Speaker, that is what the American people awnt, that is what they ned, and that is what this body must do.

In [40]:
# Show the original doc for comparison with the augmented text above; its saved
# output has none of the typos introduced by the augmenter.
doc

Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.

Information Extraction

In [41]:
from textacy import extract

In [46]:
# Print each bigram (n=2) found in the doc. Judging by the saved output,
# n-grams that contain stop words or punctuation are skipped by default.
for bigram in extract.ngrams(doc, n=2):
    print(bigram)

Mr. Speaker
480,000 Federal
Federal employees
involuntary servitude
280,000 Federal
Federal employees
financial obligations
Mr. Speaker
rectified immediately
Newt Gingrich
Republican leadership
American people
people hostage
disastrous 7
year balanced
balanced budget
budget plan
Mr. Gingrich
Republican leadership
join Senator
Senator Dole
entire Senate
continuing resolution
reopen Government
Mr. Speaker
American people
people want


In [48]:
# Extract terms using only the named-entity extractor and print each one.
for entity in extract.terms(doc, ents=extract.entities):
    print(entity)

Speaker
480,000
280,000
Speaker
Newt Gingrich
Republican
House
American
7-year
Georgia
Gingrich
Republican
Dole
Senate
Speaker
American
