In [1]:
from pprint import pprint

In [2]:
import nltk

In [19]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
text = "The quick brown fox jumps over a lazy dog."

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenization

In [5]:
sents = sent_tokenize(text)
sents

['The quick brown fox jumps over a lazy dog.']

In [6]:
words = word_tokenize(text)
words

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'a', 'lazy', 'dog', '.']

# Stop Words Detection

In [74]:
from nltk.corpus import stopwords

# from string import punctuation
english_stopwords = stopwords.words('english')

In [8]:
words_non_stop = [word for word in nltk.tokenize.word_tokenize(text) if word not in english_stopwords]
words_non_stop

['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.']

# N-Grams

In [9]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words_non_stop)
pprint([*finder.ngram_fd.items()])

[(('The', 'quick'), 1),
 (('quick', 'brown'), 1),
 (('brown', 'fox'), 1),
 (('fox', 'jumps'), 1),
 (('jumps', 'lazy'), 1),
 (('lazy', 'dog'), 1),
 (('dog', '.'), 1)]


# Stemming

In [10]:
text_2 = "Machine learning likes to learn on learnable data."

In [11]:
from nltk.stem.lancaster import LancasterStemmer # there are plenty stemmers

In [12]:
st = LancasterStemmer()
stemmed_Words = [st.stem(word) for word in word_tokenize(text_2)]
stemmed_Words

['machin', 'learn', 'lik', 'to', 'learn', 'on', 'learn', 'dat', '.']

# Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer

In [14]:
lm = WordNetLemmatizer()
lemma_Words = [lm.lemmatize(word) for word in word_tokenize(text_2)]
lemma_Words

['Machine', 'learning', 'like', 'to', 'learn', 'on', 'learnable', 'data', '.']

# Stem vs Lem

In [15]:
from nltk.stem.lancaster import LancasterStemmer

In [16]:
with open('random_text.txt') as f:
    text_3 = f.read()

In [17]:
%timeit [lm.lemmatize(word) for word in word_tokenize(text_3)]

31.8 ms ± 3.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
%timeit [st.stem(word) for word in word_tokenize(text_3)]

77.8 ms ± 26.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Tagging

In [19]:
nltk.pos_tag(word_tokenize(text_2))

[('Machine', 'NN'),
 ('learning', 'VBG'),
 ('likes', 'NNS'),
 ('to', 'TO'),
 ('learn', 'VB'),
 ('on', 'IN'),
 ('learnable', 'JJ'),
 ('data', 'NNS'),
 ('.', '.')]

# ?? Disambiguating Word Meaning ??

In [21]:
from nltk.corpus import wordnet as wn

In [22]:
for ss in wn.synsets('run'):
    print(ss, ss.definition())

Synset('run.n.01') a score in baseball made by a runner touching all four bases safely
Synset('test.n.05') the act of testing something
Synset('footrace.n.01') a race run on foot
Synset('streak.n.01') an unbroken series of events
Synset('run.n.05') (American football) a play in which a player attempts to carry the ball through or past the opposing team
Synset('run.n.06') a regular trip
Synset('run.n.07') the act of running; traveling on foot at a fast pace
Synset('run.n.08') the continuous period of time during which something (a machine or a factory) operates or continues in operation
Synset('run.n.09') unrestricted freedom to use
Synset('run.n.10') the production achieved during a continuous period of operation (of a machine or factory etc.)
Synset('rivulet.n.01') a small stream
Synset('political_campaign.n.01') a race between candidates for elective office
Synset('run.n.13') a row of unravelled stitches
Synset('discharge.n.06') the pouring forth of a fluid
Synset('run.n.15') an unbr

In [23]:
from nltk.wsd import lesk

In [24]:
text_4 = "Let's go running!"

In [25]:
sense = lesk(word_tokenize(text_4), 'run')
sense

Synset('run.v.13')

# Article summary

## Scrapping

In [26]:
import requests
from bs4 import BeautifulSoup

In [27]:
article_url = "https://jamesclear.com/why-facts-dont-change-minds"

In [28]:
response = requests.get(article_url)

In [29]:
soup = BeautifulSoup(response.content, "html.parser")
soup

<!DOCTYPE html>

<!--[if IE 9]><html class="lt-ie10" lang="lang="en-US"" > <![endif]-->
<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://use.typekit.net/tqf2ebt.css" rel="stylesheet"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<!-- This site is optimized with the Yoast SEO Premium plugin v19.2.1 (Yoast SEO v19.10) - https://yoast.com/wordpress/plugins/seo/ -->
<title>Why Facts Don’t Change Our Minds</title>
<meta content="Why don't facts change our minds? This article explains the logic of false beliefs and proposes a better system for constructive conversation." name="description">
<link href="https://jamesclear.com/why-facts-dont-change-minds" rel="canonical">
<meta content="en_US" property="og:locale">
<meta content="article" property="og:type"/>
<meta content="Why Facts 

In [52]:
article = ' '.join(map(lambda p: p.text, soup.find_all('p')))
article

'The economist J.K. Galbraith once wrote, “Faced with a choice between changing one’s mind and proving there is no need to do so, almost everyone gets busy with the proof.” Leo Tolstoy was even bolder: “The most difficult subjects can be explained to the most slow-witted man if he has not formed any idea of them already; but the simplest thing cannot be made clear to the most intelligent man if he is firmly persuaded that he knows already, without a shadow of doubt, what is laid before him.” What’s going on here? Why don’t facts change our minds? And why would someone continue to believe a false or inaccurate idea anyway? How do such behaviors serve us?\n Humans need a reasonably accurate view of the world in order to survive. If your model of reality is wildly different from the actual world, then you struggle to take effective actions each day. 1 However, truth and accuracy are not the only things that matter to the human mind. Humans also seem to have a deep desire to belong. In Ato

In [110]:
import requests
from bs4 import BeautifulSoup

def scrap_text(url, tag='p', **kwargs):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    text = ' '.join(map(lambda p: p.text, soup.find_all(tag, **kwargs)))
    return text

## Processing text

In [70]:
sents = sent_tokenize(article)
sents

['The economist J.K. Galbraith once wrote, “Faced with a choice between changing one’s mind and proving there is no need to do so, almost everyone gets busy with the proof.” Leo Tolstoy was even bolder: “The most difficult subjects can be explained to the most slow-witted man if he has not formed any idea of them already; but the simplest thing cannot be made clear to the most intelligent man if he is firmly persuaded that he knows already, without a shadow of doubt, what is laid before him.” What’s going on here?',
 'Why don’t facts change our minds?',
 'And why would someone continue to believe a false or inaccurate idea anyway?',
 'How do such behaviors serve us?',
 'Humans need a reasonably accurate view of the world in order to survive.',
 'If your model of reality is wildly different from the actual world, then you struggle to take effective actions each day.',
 '1 However, truth and accuracy are not the only things that matter to the human mind.',
 'Humans also seem to have a de

In [71]:
sents = sents[:sents.index('Be kind first, be right later.')+1]
sents

['The economist J.K. Galbraith once wrote, “Faced with a choice between changing one’s mind and proving there is no need to do so, almost everyone gets busy with the proof.” Leo Tolstoy was even bolder: “The most difficult subjects can be explained to the most slow-witted man if he has not formed any idea of them already; but the simplest thing cannot be made clear to the most intelligent man if he is firmly persuaded that he knows already, without a shadow of doubt, what is laid before him.” What’s going on here?',
 'Why don’t facts change our minds?',
 'And why would someone continue to believe a false or inaccurate idea anyway?',
 'How do such behaviors serve us?',
 'Humans need a reasonably accurate view of the world in order to survive.',
 'If your model of reality is wildly different from the actual world, then you struggle to take effective actions each day.',
 '1 However, truth and accuracy are not the only things that matter to the human mind.',
 'Humans also seem to have a de

In [73]:
word_sent = word_tokenize(article.lower())
word_sent

['the',
 'economist',
 'j.k.',
 'galbraith',
 'once',
 'wrote',
 ',',
 '“',
 'faced',
 'with',
 'a',
 'choice',
 'between',
 'changing',
 'one',
 '’',
 's',
 'mind',
 'and',
 'proving',
 'there',
 'is',
 'no',
 'need',
 'to',
 'do',
 'so',
 ',',
 'almost',
 'everyone',
 'gets',
 'busy',
 'with',
 'the',
 'proof.',
 '”',
 'leo',
 'tolstoy',
 'was',
 'even',
 'bolder',
 ':',
 '“',
 'the',
 'most',
 'difficult',
 'subjects',
 'can',
 'be',
 'explained',
 'to',
 'the',
 'most',
 'slow-witted',
 'man',
 'if',
 'he',
 'has',
 'not',
 'formed',
 'any',
 'idea',
 'of',
 'them',
 'already',
 ';',
 'but',
 'the',
 'simplest',
 'thing',
 'can',
 'not',
 'be',
 'made',
 'clear',
 'to',
 'the',
 'most',
 'intelligent',
 'man',
 'if',
 'he',
 'is',
 'firmly',
 'persuaded',
 'that',
 'he',
 'knows',
 'already',
 ',',
 'without',
 'a',
 'shadow',
 'of',
 'doubt',
 ',',
 'what',
 'is',
 'laid',
 'before',
 'him.',
 '”',
 'what',
 '’',
 's',
 'going',
 'on',
 'here',
 '?',
 'why',
 'don',
 '’',
 't',
 '

In [90]:
_stopwords = set(stopwords.words('english') + list(punctuation) + ['’', '“', '”'])
_stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [93]:
word_sent = [word for word in word_sent if word not in _stopwords]
word_sent

['economist',
 'j.k.',
 'galbraith',
 'wrote',
 'faced',
 'choice',
 'changing',
 'one',
 'mind',
 'proving',
 'need',
 'almost',
 'everyone',
 'gets',
 'busy',
 'proof.',
 'leo',
 'tolstoy',
 'even',
 'bolder',
 'difficult',
 'subjects',
 'explained',
 'slow-witted',
 'man',
 'formed',
 'idea',
 'already',
 'simplest',
 'thing',
 'made',
 'clear',
 'intelligent',
 'man',
 'firmly',
 'persuaded',
 'knows',
 'already',
 'without',
 'shadow',
 'doubt',
 'laid',
 'him.',
 'going',
 'facts',
 'change',
 'minds',
 'would',
 'someone',
 'continue',
 'believe',
 'false',
 'inaccurate',
 'idea',
 'anyway',
 'behaviors',
 'serve',
 'us',
 'humans',
 'need',
 'reasonably',
 'accurate',
 'view',
 'world',
 'order',
 'survive',
 'model',
 'reality',
 'wildly',
 'different',
 'actual',
 'world',
 'struggle',
 'take',
 'effective',
 'actions',
 'day',
 '1',
 'however',
 'truth',
 'accuracy',
 'things',
 'matter',
 'human',
 'mind',
 'humans',
 'also',
 'seem',
 'deep',
 'desire',
 'belong',
 'atomic

## Most common words

In [94]:
from nltk.probability import FreqDist
from heapq import nlargest

In [99]:
freq = FreqDist(word_sent)
freq

FreqDist({'people': 28, 'idea': 21, 'ideas': 14, 'someone': 12, 'bad': 12, 'change': 11, 'beliefs': 11, 'like': 10, 'one': 9, 'better': 9, ...})

In [100]:
nlargest(10, freq, key=freq.get)

['people',
 'idea',
 'ideas',
 'someone',
 'bad',
 'change',
 'beliefs',
 'like',
 'one',
 'better']

## Most significant sentences

In [97]:
from collections import defaultdict

In [102]:
ranking = defaultdict(int)

for i, sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]    

In [104]:
sents_idx = nlargest(5, ranking, key=ranking.get)
sents_idx

[15, 17, 0, 73, 83]

In [105]:
summary = ' '.join([sents[i] for i in sorted(sents_idx)])
summary

'The economist J.K. Galbraith once wrote, “Faced with a choice between changing one’s mind and proving there is no need to do so, almost everyone gets busy with the proof.” Leo Tolstoy was even bolder: “The most difficult subjects can be explained to the most slow-witted man if he has not formed any idea of them already; but the simplest thing cannot be made clear to the most intelligent man if he is firmly persuaded that he knows already, without a shadow of doubt, what is laid before him.” What’s going on here? The Harvard psychologist Steven Pinker put it this way, “People are embraced or condemned according to their beliefs, so one function of the mind may be to hold beliefs that bring the belief-holder the greatest number of allies, protectors, or disciples, rather than beliefs that are most likely to be true.” 2 We don’t always believe things because they are correct. I thought Kevin Simler put it well when he wrote, “If a brain anticipates that it will be rewarded for adopting a

In [111]:
from collections import defaultdict
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest


def summarize(text, n=4):
    sents = sent_tokenize(text)

    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['’', '“', '”'])

    word_sent = [st.stem(word) for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)

    ranking = defaultdict(int)

    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]

    sents_idx = nlargest(n, ranking, key=ranking.get)
    return ' '.join([sents[i] for i in sorted(sents_idx)])

In [112]:
summarize(article, 2)

'The economist J.K. Galbraith once wrote, “Faced with a choice between changing one’s mind and proving there is no need to do so, almost everyone gets busy with the proof.” Leo Tolstoy was even bolder: “The most difficult subjects can be explained to the most slow-witted man if he has not formed any idea of them already; but the simplest thing cannot be made clear to the most intelligent man if he is firmly persuaded that he knows already, without a shadow of doubt, what is laid before him.” What’s going on here? 6 Let’s call this phenomenon Clear’s Law of Recurrence: The number of people who believe an idea is directly proportional to the number of times it has been repeated during the last year—even if the idea is false.'