# Career Booster Introduction au Deep Learning

## Séance #3 — NLP / NLU

*** 

Ce notebook présente un cas d'usage simple : analyser le sentiment d'un commentaire effectué au sujet d'un film.

## Import des bibliothèques

In [2]:
# --- Numerical analysis
import numpy as np
from statistics import mean
from random import shuffle

# --- Pretty print
from pprint import pprint

# --- NLP
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

## 1. Import des données

In [3]:
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt"])

[nltk_data] Downloading package names to /Users/bcl/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/bcl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /Users/bcl/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/bcl/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/bcl/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bcl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bcl/nltk_data...
[nltk_data]   Package vader_lexicon is already up-

True

## 2. Traitement des données

In [4]:
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]

In [5]:
stopwords = nltk.corpus.stopwords.words("english")

In [6]:
words = [w for w in words if w.lower() not in stopwords]

In [7]:
text = """
For some quick analysis, creating a corpus could be overkill.
If all you need is a word list,
there are simpler ways to achieve that goal.
"""

In [8]:
#words: list[str] = nltk.word_tokenize(text)
text_words = nltk.word_tokenize(text)
pprint(text_words, width=79, compact=True)

['For', 'some', 'quick', 'analysis', ',', 'creating', 'a', 'corpus', 'could',
 'be', 'overkill', '.', 'If', 'all', 'you', 'need', 'is', 'a', 'word', 'list',
 ',', 'there', 'are', 'simpler', 'ways', 'to', 'achieve', 'that', 'goal', '.']


In [9]:
freq_dist = nltk.FreqDist(words)

In [10]:
freq_dist.most_common(3)

[('must', 1568), ('people', 1291), ('world', 1128)]

In [11]:
freq_dist.tabulate(3)

  must people  world 
  1568   1291   1128 


In [12]:
lower_fd = nltk.FreqDist([w.lower() for w in freq_dist])

In [13]:
lower_fd

FreqDist({'world': 3, 'year': 3, 'new': 3, 'congress': 3, 'peace': 3, 'federal': 3, 'program': 3, 'government': 3, 'war': 3, 'economic': 3, ...})

In [14]:
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [15]:
concordance_list = text.concordance_list("america", lines=2)
for entry in concordance_list:
    print(entry.line)

 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace


In [16]:
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

In [17]:
finder.ngram_fd.most_common(10)

[(('CONGRESS', 'STATE', 'UNION'), 46),
 (('JOINT', 'SESSION', 'CONGRESS'), 37),
 (('Mr', 'Speaker', 'Mr'), 37),
 (('dollars', 'fiscal', 'year'), 35),
 (('United', 'States', 'America'), 34),
 (('ADDRESS', 'JOINT', 'SESSION'), 32),
 (('STATE', 'UNION', 'January'), 32),
 (('State', 'local', 'governments'), 29),
 (('million', 'dollars', 'fiscal'), 28),
 (('Speaker', 'Mr', 'President'), 26)]

NLTK already has a built-in, pretrained sentiment analyzer called VADER (Valence Aware Dictionary and sEntiment Reasoner).

In [18]:
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [19]:
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]

In [1]:
def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)

NameError: name 'shuffle' is not defined

In [21]:
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids

In [22]:
def is_positive(review_id: str) -> bool:
    """True if the average of all sentence compound scores is positive."""
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return np.mean(scores) > 0


In [23]:
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in positive_review_ids:
            correct += 1
    else:
        if review_id in negative_review_ids:
            correct += 1

print(F"{correct / len(all_review_ids):.2%} correct")

64.00% correct


## 3. Customization

In [24]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
)]

negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))

)]

In [25]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}


In [26]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted
])
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted
])


In [27]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features


In [28]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]

features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])


In [29]:
# Use 1/4 of the set for training
train_count = len(features) // 4
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)

Most Informative Features
               wordcount = 3                 pos : neg    =      7.9 : 1.0
               wordcount = 2                 pos : neg    =      3.1 : 1.0
               wordcount = 0                 neg : pos    =      1.7 : 1.0
               wordcount = 1                 pos : neg    =      1.1 : 1.0
           mean_positive = 0.1437            pos : neg    =      1.0 : 1.0


In [30]:
nltk.classify.accuracy(classifier, features[train_count:])

0.6673333333333333

## 4. Prédiction sur un nouveau commentaire

In [31]:
new_review = """
I love it! wonderful movie!
"""
features = extract_features(new_review)
classifier.classify(features)

'neg'

In [80]:
features = extract_features(new_review)

In [81]:
features

{'mean_compound': 1.148, 'mean_positive': 0.108, 'wordcount': 0}