In [8]:
!pip install textacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [54]:
import textacy
from pprint import pprint

In [3]:
text1 = '''Dagny, there’s nothing of any importance in life—except how well you do your work. Nothing. Only that. 
Whatever else you are, will come from that. It’s the only measure of human value. All the codes of ethics they’ll 
try to ram down your throat are just so much paper money put out by swindlers to fleece people of their virtues. 
The code of competence is the only system of morality that’s on a gold standard. When you grow up, you’ll know what 
I mean.'''

text2 = '''In the department of -- but it is better not to mention the department. There is nothing more irritable 
than departments, regiments, courts of justice, and, in a word, every branch of public service. Each individual 
attached to them nowadays thinks all society insulted in his person. Quite recently, a complaint was received from 
a justice of the peace, in which he plainly demonstrated that all the imperial institutions were going to the dogs, 
and that the Czar’s sacred name was being taken in vain; and all because a wretched private had dared to appear before 
him in his dressing-gown, and to address him in that state.'''

text3 = '''But I have no leisure for them at all; and the reason, my friend, is this: I am not yet able, as the 
Delphic inscription has it, to know myself; so it seems to me ridiculous, when I do not yet know that, to investigate 
irrelevant things. And so I dismiss these matters and accepting the customary belief about them, as I was saying just 
now, I investigate not these things, but myself, to know whether I am a monster more complicated and more furious than 
Typhon or a gentler and simpler creature, to whom a divine and quiet lot is given by nature.'''

text4 = '''On the morning of August 27, 1908, in a room over a general store beside the Pedernales River in the Texas 
Hill Country, a baby boy was born to a forty-year-old woman who had prayed for a son. And who, when her prayer was 
answered, would tell her son that he was destined for greatness. Who would instill in him an ambition that was 
overwhelming in its intensity, that would consume his life and that he, in turn, would transmit to his daughters. 
The ambition was to be fulfilled. The boy born that morning would reach heights of power that no one born in the Hill 
Country – no one born in Texas – had ever reached. And that power would be used in ways that would change the lives 
of millions of people and the very nature of the nation itself.'''

In [4]:
texts = [text1, text2, text3, text4]

In [41]:
corpus = textacy.Corpus("en_core_web_sm", texts)
type(corpus)

textacy.corpus.Corpus

In [42]:
doc1, doc2, doc3, doc4 = corpus.docs
print(type(doc1))

<class 'spacy.tokens.doc.Doc'>


In [24]:
doc4._.preview

'Doc(166 tokens: "On the morning of August 27, 1908, in a room ov...")'

In [26]:
list(textacy.extract.entities(doc4))

[morning of August 27, 1908,
 Pedernales River,
 Texas,
 Hill Country,
 forty-year-old,
 morning,
 Hill 
 Country,
 Texas,
 millions]

In [29]:
list(textacy.extract.subject_verb_object_triples(doc4))

[SVOTriple(subject=[who], verb=[would, tell], object=[son]),
 SVOTriple(subject=[Who], verb=[would, instill], object=[ambition]),
 SVOTriple(subject=[that], verb=[would, consume], object=[life]),
 SVOTriple(subject=[he], verb=[would, transmit], object=[life]),
 SVOTriple(subject=[The, boy, born, that, morning], verb=[would, reach], object=[heights]),
 SVOTriple(subject=[one], verb=[had, reached], object=[that]),
 SVOTriple(subject=[that], verb=[would, change], object=[lives, nature])]

In [35]:
from itertools import combinations
list(combinations(range(4), 2))

[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

In [44]:
for d1, d2 in combinations(docs, 2):
    print(f'Doc 1: {d1._.preview}')
    print(f'Doc 2: {d2._.preview}')
    print(f'Cosine similarity: {textacy.similarity.cosine(d1.text, d2.text)}')
    print(f'Levenshtein similarity: {textacy.similarity.levenshtein(d1.text, d2.text)}')

Doc 1: Doc(106 tokens: "Dagny, there’s nothing of any importance in lif...")
Doc 2: Doc(129 tokens: "In the department of -- but it is better not to...")
Cosine similarity: 0.8219949365267865
Levenshtein similarity: 0.26349206349206344
Doc 1: Doc(106 tokens: "Dagny, there’s nothing of any importance in lif...")
Doc 2: Doc(124 tokens: "But I have no leisure for them at all; and the ...")
Cosine similarity: 0.8614435222839291
Levenshtein similarity: 0.2554347826086957
Doc 1: Doc(106 tokens: "Dagny, there’s nothing of any importance in lif...")
Doc 2: Doc(166 tokens: "On the morning of August 27, 1908, in a room ov...")
Cosine similarity: 0.7610194341477615
Levenshtein similarity: 0.2666666666666667
Doc 1: Doc(129 tokens: "In the department of -- but it is better not to...")
Doc 2: Doc(124 tokens: "But I have no leisure for them at all; and the ...")
Cosine similarity: 0.8451542547285166
Levenshtein similarity: 0.265079365079365
Doc 1: Doc(129 tokens: "In the department of -- but it is be

In [56]:
for doc in docs:
    pprint(textacy.extract.keyterms.textrank(doc))

[('paper money', 0.02747140820813597),
 ('gold standard', 0.022213691454051575),
 ('human value', 0.020098168423121947),
 ('life', 0.013476903165852198),
 ('importance', 0.012496671553933147),
 ('swindler', 0.012274782487314444),
 ('people', 0.01063220936393599),
 ('ethic', 0.010317159256115508),
 ('throat', 0.01009916038573986),
 ('virtue', 0.009986853360950098)]
[('wretched private', 0.020278617229828745),
 ('public service', 0.01858695414917655),
 ('department', 0.018316081104902463),
 ('imperial institution', 0.018246035602614646),
 ('justice', 0.011435421048622628),
 ('sacred', 0.010796864305404079),
 ('dressing', 0.010160524825315083),
 ('Czar', 0.009564677837181332),
 ('irritable', 0.00950243997218708),
 ('gown', 0.009467730350536545)]
[('irrelevant thing', 0.026074339800283317),
 ('customary belief', 0.021832185801670796),
 ('Delphic inscription', 0.020341902081768122),
 ('quiet lot', 0.019419689029294182),
 ('simple creature', 0.018985900327585556),
 ('leisure', 0.011529155833