In [1]:
import spacy
# Load spaCy's small English pipeline; the nlp(...) calls in the cells below all use this object.
nlp = spacy.load("en_core_web_sm")

In [2]:
"""
We split the text into sentences using the doc.sents attribute,
which yields each sentence of the document in turn
"""

doc1 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

# Print every detected sentence on its own line.
print(*doc1.sents, sep="\n")

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
# is_sent_start tells whether a given token begins a sentence.
# Token 1 ("is") is checked first, then token 0 ("This").
for position in (1, 0):
    print(doc1[position].is_sent_start)

False
True


In [7]:
# doc1.sents is a generator, so picking one sentence by index does not work:
# subscripting it raises a TypeError. The error is caught and printed so the
# demonstration stays visible while the notebook still runs top to bottom.
try:
    print(doc1.sents[0])
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'generator' object is not subscriptable

In [8]:
# doc1.sents is a generator, not a list, so convert it to a list before indexing:
list(doc1.sents)[0]

This is the first sentence.

In [9]:
# Materialize the sentence generator once so the sentences can be indexed later.
doc_sents = list(doc1.sents)
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [10]:
# Each sentence is a span of the document; start and end give the token indices
# it covers (start is the first token of the sentence, end is one past its last token):
print(doc_sents[1].start, doc_sents[1].end)

6 11


In [13]:
# Display the second sentence span on its own.
doc_sents[1]

This is another sentence.

In [14]:
# Show, token by token, which tokens are flagged as the start of a sentence.
doc2 = nlp('This is a sentence. This is a sentence. This is a sentence.')
for tok in doc2:
    print(tok.is_sent_start, tok.text)

True This
False is
False a
False sentence
False .
True This
False is
False a
False sentence
False .
True This
False is
False a
False sentence
False .


In [15]:
# The way the library splits sentences can be customized. Start with the default split of this quote:
doc3 = nlp('"Management is doing things right; leadership is doing the right things." -Peter Drucker')

print(*doc3.sents, sep="\n")

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [22]:
"""
This is not the division we want: the sentence should also be split at the semicolon.
You can choose a specific character to split on with a custom pipeline component like this:
"""
from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Flag the token that follows every ";" as the start of a new sentence.

    The last token is never inspected because no token follows it.
    The same doc object is returned after its tokens have been updated.
    """
    for idx in range(len(doc) - 1):
        if doc[idx].text == ";":
            doc[idx + 1].is_sent_start = True
    return doc


In [23]:
# Insert the custom component ahead of the parser. add_pipe raises if a
# component with the same name is already in the pipeline, so the membership
# check keeps this cell safe to run more than once.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

# Display the registered component, whether it was just added or already present.
nlp.get_pipe("set_custom_boundaries")

<function __main__.set_custom_boundaries(doc)>

In [24]:
# List the pipeline component names to check where the custom component was inserted.
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [25]:
# Re-run the same quote through the pipeline that now contains the custom component.
doc4 = nlp('"Management is doing things right; leadership is doing the right things." -Peter Drucker')
print("\n".join(sent.text for sent in doc4.sents))

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [26]:
# Here, the sent_tokenize function from the nltk library is used for the same task
from nltk.tokenize import sent_tokenize

EXAMPLE_TEXT = ''' 
10 
Thomas Gradgrind, sir. A man of realities. A man of facts and calculations. A man who 
proceeds upon the principle that two and two are four, and nothing over, and who is not 
to be talked into allowing for anything over. Thomas Gradgrind, sir—peremptorily 
Thomas—Thomas Gradgrind. With a rule and a pair of scales, and the multiplication 
table always in his pocket, sir, ready to weigh and measure any parcel of human nature, 
and tell you exactly what it comes to. It is a mere question of figures, a case of simple 
arithmetic. You might hope to get some other nonsensical belief into the head of 
George Gradgrind, or Augustus Gradgrind, or John Gradgrind, or Joseph Gradgrind (all 
supposititious, non-existent persons), but into the head of Thomas Gradgrind—no, sir! 
In such terms Mr. Gradgrind always mentally introduced himself, whether to his private 
circle of acquaintance, or to the public in general. In such terms, no doubt, substituting 
the words ‘boys and girls,’ for ‘sir,’ Thomas Gradgrind now presented Thomas Gradgrind 
to the little pitchers before him, who were to be filled so full of facts. 
Indeed, as he eagerly sparkled at them from the cellarage before mentioned, he 
seemed a kind of cannon loaded to the muzzle with facts, and prepared to blow them 
clean out of the regions of childhood at one discharge. He seemed a galvanizing 
apparatus, too, charged with a grim mechanical substitute for the tender young 
imaginations that were to be stormed away. 
‘Girl number twenty,’ said Mr. Gradgrind, squarely pointing with his square forefinger, ‘I 
don’t know that girl. Who is that girl?’ 
11 
''' 
# Print each sentence found by sent_tokenize, followed by a divider line.
for sentence in sent_tokenize(EXAMPLE_TEXT):
    print(sentence, "--------------------", sep="\n")

 
10 
Thomas Gradgrind, sir.
--------------------
A man of realities.
--------------------
A man of facts and calculations.
--------------------
A man who 
proceeds upon the principle that two and two are four, and nothing over, and who is not 
to be talked into allowing for anything over.
--------------------
Thomas Gradgrind, sir—peremptorily 
Thomas—Thomas Gradgrind.
--------------------
With a rule and a pair of scales, and the multiplication 
table always in his pocket, sir, ready to weigh and measure any parcel of human nature, 
and tell you exactly what it comes to.
--------------------
It is a mere question of figures, a case of simple 
arithmetic.
--------------------
You might hope to get some other nonsensical belief into the head of 
George Gradgrind, or Augustus Gradgrind, or John Gradgrind, or Joseph Gradgrind (all 
supposititious, non-existent persons), but into the head of Thomas Gradgrind—no, sir!
--------------------
In such terms Mr. Gradgrind always mentally introdu

In [27]:
# nltk also provides the PunktSentenceTokenizer class for splitting sentences.
# Here the tokenizer is constructed from EXAMPLE_TEXT and then applied to that same text;
# only the first ten resulting sentences are displayed.
from nltk.tokenize import PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(EXAMPLE_TEXT)
tokenized = custom_sent_tokenizer.tokenize(EXAMPLE_TEXT)
tokenized[:10]

[' \n10 \nThomas Gradgrind, sir.',
 'A man of realities.',
 'A man of facts and calculations.',
 'A man who \nproceeds upon the principle that two and two are four, and nothing over, and who is not \nto be talked into allowing for anything over.',
 'Thomas Gradgrind, sir—peremptorily \nThomas—Thomas Gradgrind.',
 'With a rule and a pair of scales, and the multiplication \ntable always in his pocket, sir, ready to weigh and measure any parcel of human nature, \nand tell you exactly what it comes to.',
 'It is a mere question of figures, a case of simple \narithmetic.',
 'You might hope to get some other nonsensical belief into the head of \nGeorge Gradgrind, or Augustus Gradgrind, or John Gradgrind, or Joseph Gradgrind (all \nsupposititious, non-existent persons), but into the head of Thomas Gradgrind—no, sir!',
 'In such terms Mr. Gradgrind always mentally introduced himself, whether to his private \ncircle of acquaintance, or to the public in general.',
 'In such terms, no doubt, subs

In [28]:
# These tools do not perform well on Arabic text. Note that nlp is still the
# English en_core_web_sm pipeline loaded at the top of the notebook.
doc1 = nlp('هذه هي الجملة الأولي ,. هذه هي الجملة الثانية , والجملة الثالثة')

print(*doc1.sents, sep="\n")

هذه هي الجملة الأولي ,.
هذه هي الجملة الثانية , والجملة الثالثة


In [29]:
# Check whether the third token (index 2) is marked as the start of a sentence.
doc1[2].is_sent_start

False

In [31]:
# Collect the sentences detected in the Arabic text into a list for inspection.
doc_sents = list(doc1.sents)
doc_sents

[هذه هي الجملة الأولي ,., هذه هي الجملة الثانية , والجملة الثالثة]

In [32]:
# Print the sentence-start flag next to each token (two spaces separate the columns).
doc2 = nlp('This is a sentence. This is a sentence. This is a sentence.')
for tok in doc2:
    print(f"{tok.is_sent_start}  {tok.text}")

True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .


In [33]:
EXAMPLE_TEXT = ''' 
يشكل الذكاء الاصطناعي تحديا والهاما لعلم الفلسفة ؛ لزعمه القدرة على إعادة خلق قدرات العقل البشري
وكمارو يحيي الناس
هل هناك حدود لمدى ذكاء الآلات؟ هل هناك فرق جوهري بين الذكاء البشري والذكاء الاصطناعي؟ وهل يمكن أن يكون 
 .للآلة عقل ووعي؟ عدد قليل من أهم الإجابات على هذه الأسئلة ترد أدناه
 "آلات الحساب والذكاء "قانون تورنغ
إذا كان الجهاز يعمل بذكاء يضاهي الإنسان، إذافذكائه يماثل ذكاء الإنسان. تفيد نظرية آلان تورنغ أنه، في نهاية المطاف، لا 
 .يسعنا إلا أن نحكم على ذكاء الآلة بناء على أدائها. هذه النظرية تشكل أساسا لاختبار تورنغ
''' 
# Split the Arabic sample with sent_tokenize and print a divider after each sentence.
for sentence in sent_tokenize(EXAMPLE_TEXT):
    print(sentence)
    print('----------------------')

 
يشكل الذكاء الاصطناعي تحديا والهاما لعلم الفلسفة ؛ لزعمه القدرة على إعادة خلق قدرات العقل البشري
وكمارو يحيي الناس
هل هناك حدود لمدى ذكاء الآلات؟ هل هناك فرق جوهري بين الذكاء البشري والذكاء الاصطناعي؟ وهل يمكن أن يكون 
 .للآلة عقل ووعي؟ عدد قليل من أهم الإجابات على هذه الأسئلة ترد أدناه
 "آلات الحساب والذكاء "قانون تورنغ
إذا كان الجهاز يعمل بذكاء يضاهي الإنسان، إذافذكائه يماثل ذكاء الإنسان.
----------------------
تفيد نظرية آلان تورنغ أنه، في نهاية المطاف، لا 
 .يسعنا إلا أن نحكم على ذكاء الآلة بناء على أدائها.
----------------------
هذه النظرية تشكل أساسا لاختبار تورنغ
----------------------
