Antoine NUTTINCK  
MS Big Data  
TELECOM ParisTech  
05/06/2017

# ANALYSE DES OPINIONS SOUS TWITTER

In [1]:
import pandas as pd
import nltk
import csv
import re
import os
import numpy as np
import itertools
from nltk.corpus import wordnet as wn
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
from sklearn.metrics import accuracy_score, confusion_matrix
import sys
print(sys.version)

2.7.12 |Anaconda custom (64-bit)| (default, Jun 29 2016, 11:07:13) [MSC v.1500 64 bit (AMD64)]


In [3]:
pd.__version__

'0.17.1'

In [4]:
# nltk.download()

In [5]:
data_path = os.path.join('.', 'testdata.manual.2009.06.14.csv')
raw_data = pd.read_csv(data_path, sep='","', header=None, engine='python',
                           names=['polarite', 'id', 'date', 'requete',
                                  'utilisateur', 'text'])

## 3. Implémentation d'un systèmede détection d'opinions dans les tweets

### 3.1. Matériel :  Présentation du corpus de tweets

### 3.2. Prétraitements

In [6]:
def pretraitement(raw_tweets, dicoSlang):
    
    
    tweets = []
    url_pat = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?'
    nb_hashtags = 0
    nb_ats = 0

    for text in raw_tweets:
        text_without_url = re.sub(url_pat, '', text)
        text_without_url_ht, nb_hashtag = re.subn('#', '', text_without_url)
        text_without_url_ht_at, nb_at = re.subn('@\w*', '', text_without_url_ht)
        words = nltk.word_tokenize(text_without_url_ht_at)
        nb_hashtags += nb_hashtag
        nb_ats += nb_at

        for abr in dicoSlang.keys():
            text_without_url_ht_at_abr = re.sub(abr, dicoSlang[abr], text_without_url_ht)
        
        clean_words = nltk.word_tokenize(text_without_url)
    
        tweets.append(clean_words)

    print("Nombre de #", nb_hashtags)
    print("Nombre de @", nb_ats)

    return tweets

In [7]:
def load_dico(dico_path):

    with open(dico_path, 'rb') as dataFile:
        dicoSlang = csv.reader(dataFile, delimiter='\t')
        if len(next(dicoSlang)) > 1:
            dico = {}
            for abrev in dicoSlang:
                dico[abrev[0]] = abrev[1]
            # dico = pd.read_csv(dico_path, delimiter='\t+', index_col=0, encoding='cp1252', header=None, engine='python')
        else:
            dico = set()
            dico.update(itertools.chain.from_iterable(dicoSlang))
        dataFile.close()
        
    return dico

In [8]:
dicoSlang = load_dico(os.path.join('.', 'SlangLookupTable.txt'))
clean_tweets = pretraitement(raw_data.text, dicoSlang)

('Nombre de #', 52)
('Nombre de @', 128)


### 3.3. Etiquetage grammatical

In [9]:
def pos_tagger(tweets):

    taggedtweets = []
    for tweet in tweets:
        taggedtweets.append(nltk.pos_tag(tweet))

    return taggedtweets

In [10]:
tagged_tweets = pos_tagger(clean_tweets)
cntvb = lambda tagged_tweet: len(filter(lambda pos: pos[1][:2] == 'VB', tagged_tweet))
print("Il y a {0} mots étiquetés comme verbes dans le corpus.".format(sum(map(cntvb, tagged_tweets))))

Il y a 1104 mots étiquetés comme verbes dans le corpus.


### 3.4. Algorithme de détection v1 : appel au dictionnaire Sentiwordnet

In [12]:
adj_tags = ['JJ', 'JJR', 'JJS']
noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
adv_tags = ['RB', 'RBR', 'RBS']
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
accepted_tags = adj_tags + noun_tags + adv_tags + verb_tags

In [13]:
def getting_sentiment(preprocessed_tweets, acceptedPoSList):
    
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    sentimentScore = []

    for pTweet in preprocessed_tweets:
        posScore = 0
        negScore = 0

        for word, pos in pTweet:
            if pos in acceptedPoSList:
                synset = wn.synsets(word)
                if synset:
                    senti_synset = swn.senti_synset(synset[0].name())
                    if senti_synset:
                        posScore += senti_synset.pos_score
                        negScore += senti_synset.neg_score
        final_sent = '"4"' if posScore > negScore else '"0"' if posScore < negScore else '"2"'
        sentimentScore.append([posScore, negScore, final_sent])

    return sentimentScore

In [14]:
def compute_scores(predictions, labels):
    
    neg_right_preds = ((predictions == '"0"') * (labels == '"0"'))
    true_neg = float(np.sum(neg_right_preds)) / np.sum((labels == '"0"'))
    
    pos_right_preds = ((predictions == '"4"') * (labels == '"4"'))
    true_pos = float(np.sum(pos_right_preds)) / np.sum((labels == '"4"'))
    
    neut_right_preds = ((predictions == '"2"') * (labels == '"2"'))
    true_neut = float(np.sum(neut_right_preds)) / np.sum((labels == '"2"'))
    
    return true_pos, true_neg, true_neut

In [38]:
preds = getting_sentiment(tagged_tweets, accepted_tags)
valid_preds = compute_scores(np.array(preds)[:,2], raw_data.polarite.values)
print("Il y a {0:.2%} de tweets positifs détectés avec cette version de l'algorithme.".format(valid_preds[0]))
print "Matrice de confusion:\n", confusion_matrix(raw_data.polarite.values, np.array(preds)[:,2])
print "Accuracy: {:.2%}".format(accuracy_score(raw_data.polarite.values, np.array(preds)[:,2]))

Il y a 64.84% de tweets positifs détectés avec cette version de l'algorithme.
Matrice de confusion:
[[ 70  33  74]
 [ 14  62  63]
 [ 28  36 118]]
Accuracy: 50.20%


### 3.5. Algorithme de détection v2 : gestion de la négation et des modifieurs

In [16]:
def getting_sentiment_v2(raw_tweets,
                            preprocessed_tweets,
                            acceptedPoSList,
                            negList,
                            boosterWordList):
    
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    negList_re = "|".join(map(lambda neg: re.escape(neg), list(negList)))
    boostWdList_re = "|".join(map(lambda bstwd: re.escape(bstwd), list(boosterWordList)))

    sentimentScore = []
    # le facteur w permet d'augmenter l'impact du score d'un mot sur le score global du tweet
    w = 1

    for raw_tweet, pTweet in zip(raw_tweets, preprocessed_tweets):
        posScore = 0
        negScore = 0

        for word, pos in pTweet:
            if pos in acceptedPoSList:
                synset = wn.synsets(word)
                if synset:
                    senti_synset = swn.senti_synset(synset[0].name())
                    if senti_synset:
                        # la structure du code permet qu'un mot soit à a la fois
                        # un modifieur et une négation
                        # (même si  ce n'est pas le cas ici)
                        # if any([pTweet[n_tok - len(tbwd)].lower() for tbwd in tok_boosterWordList]):
                        if re.search("(" + negList_re + ")\s*(" + re.escape(word) + ")", raw_tweet.lower()):
                            w = 2

                        # if any([sum([pTweet[n_tok - len(tnwd):n_tok]], []) == tnwd for tnwd in tok_negList]):
                        if re.search("(" + negList_re + ")\s*(" + re.escape(word) + ")", raw_tweet.lower()):
                            # print(re.search("(" + negList_re + ")\s*(" + re.escape(word) + ")", raw_tweet).groups())
                            posScore += w * senti_synset.neg_score
                            negScore += w * senti_synset.pos_score
                        else:
                            posScore += w * senti_synset.pos_score
                            negScore += w * senti_synset.neg_score
            w = 1
        final_sent = '"4"' if posScore > negScore else '"0"' if posScore < negScore else '"2"'
        sentimentScore.append([posScore, negScore, final_sent])

    return sentimentScore

In [26]:
negList_filename = os.path.join('.', 'NegatingWordList.txt')
negList = load_dico(negList_filename)
negList_re = "|".join(map(lambda neg: re.escape(neg), list(negList)))

booster_filename = os.path.join('.', 'BoosterWordList.txt')
boosterWordList = load_dico(booster_filename).keys()

preds2 = getting_sentiment_v2(raw_data.text,
                                tagged_tweets,
                                accepted_tags,
                                negList,
                                boosterWordList)

# pos_tweets = [tw for tw, cl in zip(tagged_tweets, scores2) if cl[2]=='"4"']
pos_tweets = raw_data.text[raw_data.polarite == '"4"']
cntpos = len(pos_tweets)

tr_pos2, tr_neg2, tr_neut2 = compute_scores(np.array(preds2)[:,2], raw_data.polarite.values)

print("Il y a {0:.2%} de tweets positifs correctement détectés avec la version 2 de l'algorithme.".format(tr_pos2))
print("Il y a {0:.2%} tweets négatifs correctement détectés avec la version 2 de l'algorithme.".format(tr_neg2))
print("Il y a {0:.2%} tweets neutres correctement détectés avec la version 2 de l'algorithme.".format(tr_neut2))

neg_inpostw = [len(re.findall("\s*(" + negList_re + ")\s*", pos_tweet)) for pos_tweet in pos_tweets]
cntneg_inpostw = sum(neg_inpostw)
print("\nIl y a {0}/{1} termes négatifs contenus dans les {2} tweets positifs.".format(cntneg_inpostw,
                                                                                          len(neg_inpostw),
                                                                                          cntpos))

print "Matrice de confusion:\n", confusion_matrix(raw_data.polarite.values, np.array(preds2)[:,2])
print "Accuracy: {:.2%}".format(accuracy_score(raw_data.polarite.values, np.array(preds2)[:,2]))

Il y a 65.38% de tweets positifs correctement détectés avec la version 2 de l'algorithme.
Il y a 40.11% tweets négatifs correctement détectés avec la version 2 de l'algorithme.
Il y a 44.60% tweets neutres correctement détectés avec la version 2 de l'algorithme.

Il y a 13/182 termes négatifs contenus dans les 182 tweets positifs.
Matrice de confusion:
[[ 71  31  75]
 [ 14  62  63]
 [ 27  36 119]]
Accuracy: 50.60%


### 3.6. Algorithme de détection v3 : gestion des emoticons

In [18]:
def load_emoticons(emoticons_filename):

    emoticons_dico = load_dico(emoticons_filename)
    pos_emoticons = []
    neg_emoticons = []

    for emo, label in emoticons_dico.items():
        if int(label) > 0:
            pos_emoticons.append(emo)
        elif int(label) < 0:
            neg_emoticons.append(emo)
        else:
            # les emoticons avec un score de 0 ne sont pas renvoyer
            pass

    return pos_emoticons, neg_emoticons

In [19]:
def getting_sentiment_v3(raw_tweets,
                            preprocessed_tweets,
                            acceptedPoSList,
                            negList,
                            boosterWordList,
                            posEmoticonList,
                            negEmoticonList):
    
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    negList_re = "|".join(map(lambda neg: re.escape(neg), list(negList)))
    boostWdList_re = "|".join(map(lambda bstwd: re.escape(bstwd), list(boosterWordList)))
    posEmos_re = "|".join(map(lambda p_emo: re.escape(p_emo), list(posEmoticonList)))
    negEmos_re = "|".join(map(lambda n_emo: re.escape(n_emo), list(negEmoticonList)))
    url_pattern = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?'

    sentimentScore = []
    # le facteur w permet d'augmenter l'impact du score d'un mot sur le score global du tweet
    w = 1

    for raw_tweet, pTweet in zip(raw_tweets, preprocessed_tweets):
        posScore = 0
        negScore = 0

        for word, pos in pTweet:
            if pos in acceptedPoSList:
                synset = wn.synsets(word)
                if synset:
                    senti_synset = swn.senti_synset(synset[0].name())
                    if senti_synset:
                        # la structure du code permet qu'un mot soit à a la fois
                        # un modifieur et une négation
                        # (même si  ce n'est pas le cas ici)
                        if re.search("(" + negList_re + ")\W+(" + re.escape(word) + ")", raw_tweet.lower()):
                            w = 2

                        if re.search("(" + negList_re + ")\W+(" + re.escape(word) + ")", raw_tweet.lower()):
                            posScore += w * senti_synset.neg_score
                            negScore += w * senti_synset.pos_score
                        else:
                            posScore += w * senti_synset.pos_score
                            negScore += w * senti_synset.neg_score
            w = 1


        nb_posemo = len(re.findall("(" + posEmos_re + ")\s*", re.sub(url_pattern, "", raw_tweet)))
        posScore += nb_posemo
        nb_negemo = len(re.findall("(" + negEmos_re + ")\s*", re.sub(url_pattern, "", raw_tweet)))
        negScore += nb_negemo
        
        final_sent = '"4"' if posScore > negScore else '"0"' if posScore < negScore else '"2"'
        sentimentScore.append([posScore, negScore, final_sent])

    return sentimentScore

In [31]:
negList_filename = os.path.join('.', 'NegatingWordList.txt')
negList = load_dico(negList_filename)
booster_filename = os.path.join('.', 'BoosterWordList.txt')
boosterWordList = load_dico(booster_filename).keys()
emoticons_filename = os.path.join('.', 'EmoticonLookupTable.txt')
posEmoticonList, negEmoticonList = load_emoticons(emoticons_filename)

preds3 = getting_sentiment_v3(raw_data.text, tagged_tweets,
                                accepted_tags,
                                negList,
                                boosterWordList,
                                posEmoticonList,
                                negEmoticonList)

tr_pos3, tr_neg3, tr_neut3 = compute_scores(np.array(preds3)[:,2], raw_data.polarite.values)
nb_tweets = len(tagged_tweets)

print("Il y a {0:.2%} tweets positifs correctement détectés avec la version 3 de l'algorithme.".format(tr_pos3))
print("Il y a {0:.2%} tweets négatifs correctement détectés avec la version 3 de l'algorithme.".format(tr_neg3))
print("Il y a {0:.2%} tweets neutres correctement détectés avec la version 3 de l'algorithme.".format(tr_neut3))

emoticons = posEmoticonList + negEmoticonList
emoticon_re = "|".join(map(lambda emo: re.escape(emo), emoticons))
nb_emoticons = raw_data.text.str.count(emoticon_re).sum()
print("\nIl y a {0} émoticons contenus dans le corpus de {1} tweets.\n".format(nb_emoticons,
                                                                              nb_tweets))
print "Matrice de confusion:\n", confusion_matrix(raw_data.polarite.values, np.array(preds3)[:,2])
print "Accuracy: {:.2%}".format(accuracy_score(raw_data.polarite.values, np.array(preds3)[:,2]))

Il y a 69.23% tweets positifs correctement détectés avec la version 3 de l'algorithme.
Il y a 45.76% tweets négatifs correctement détectés avec la version 3 de l'algorithme.
Il y a 44.60% tweets neutres correctement détectés avec la version 3 de l'algorithme.

Il y a 180 émoticons contenus dans le corpus de 498 tweets.

Matrice de confusion:
[[ 81  28  68]
 [ 14  62  63]
 [ 25  31 126]]
Accuracy: 54.02%


### 3.7. Votre version : v4

In [33]:
# Ex de tweets classe a tord comme negatifs
neg_predAsPos = raw_data.text[(np.array(preds3)[:,2]=='"4"') & (raw_data.polarite =='"0"')]
print("Il y a {} tweets avec une polarité negative et qui sont prédis comme positifs.\n".format(neg_predAsPos.shape[0]))
print(neg_predAsPos)

Il y a 68 tweets avec une polarité negative et qui sont prédis comme positifs.

11     "@Karoli I firmly believe that Obama/Pelosi ha...
14     "dear nike, stop with the flywire. that shit i...
33     "Played with an android google phone. The slid...
35     "omg so bored &amp; my tattoooos are so itchy!...
48     "?Obama Administration Must Stop Bonuses to AI...
49     "started to think that Citi is in really deep ...
54     "annoying new trend on the internets:  people ...
64     "@morind45 Because the twitter api is slow and...
79     "Took the Graduate Field Exam for Computer Sci...
87     "Can we just go ahead and blow North Korea off...
88     "North Korea, please cease this douchebaggery....
93     "just got back from church, and I totally hate...
94     "Just got mcdonalds goddam those eggs make me ...
103    """The Republican party is a bunch of anti-abo...
104    "is Twitter's connections API broken? Some twe...
138    "It's unfortunate that after the Stimulus plan...
140    "

In [27]:
def tag_synmatch(nltk_tag):
    if  nltk_tag.startswith("VB"): 
        return 'v'
    elif nltk_tag.startswith("JJ") :
        return "a"
    elif nltk_tag.startswith("ADV") :
        return "r"
    elif nltk_tag.startswith("N"):
        return "n"
    else:
        return None

In [62]:
def getting_sentiment_v4(raw_tweets,
                            preprocessed_tweets,
                            acceptedPoSList,
                            negList,
                            boosterWordList,
                            posEmoticonList,
                            negEmoticonList):
    
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    negList_re = "|".join(map(lambda neg: re.escape(neg), list(negList)))
    boostWdList_re = "|".join(map(lambda bstwd: re.escape(bstwd), list(boosterWordList)))
    posEmos_re = "|".join(map(lambda p_emo: re.escape(p_emo), list(posEmoticonList)))
    negEmos_re = "|".join(map(lambda n_emo: re.escape(n_emo), list(negEmoticonList)))
    url_pattern = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?'

    sentimentScore = []
    # le facteur w permet d'augmenter l'impact du score d'un mot sur le score global du tweet
    w = 1

    for raw_tweet, pTweet in zip(raw_tweets, preprocessed_tweets):
        posScore = 0
        negScore = 0

        for word, pos in pTweet:
            if pos in acceptedPoSList:
                synsets = wn.synsets(word, pos=tag_synmatch(pos))
                if synsets:
                    for synset in synsets[:3]:
                        senti_synset = swn.senti_synset(synset.name())
                        if senti_synset:
                            if re.search(r'(' + negList_re + ')\s*(' + re.escape(word) + ')', raw_tweet.lower()) or \
                                re.match(r'[A-Z]{2,}', word):
                                w = 2

                            if re.search("(" + negList_re + ")\s*(" + re.escape(word) + ")", raw_tweet.lower()):
                                posScore += w * senti_synset.neg_score
                                negScore += w * senti_synset.pos_score
                            else:
                                posScore += w * senti_synset.pos_score
                                negScore += w * senti_synset.neg_score
            w = 1


        nb_posemo = len(re.findall("(" + posEmos_re + ")\s*", re.sub(url_pattern, "", raw_tweet)))
        posScore += nb_posemo
        nb_negemo = len(re.findall("(" + negEmos_re + ")\s*", re.sub(url_pattern, "", raw_tweet)))
        negScore += nb_negemo
        
        final_sent = '"4"' if posScore > negScore else '"0"' if posScore < negScore else '"2"'
        sentimentScore.append([posScore, negScore, final_sent])

    return sentimentScore

In [63]:
negList_filename = os.path.join('.', 'NegatingWordList.txt')
negList = load_dico(negList_filename)
booster_filename = os.path.join('.', 'BoosterWordList.txt')
boosterWordList = load_dico(booster_filename).keys()
emoticons_filename = os.path.join('.', 'EmoticonLookupTable.txt')
posEmoticonList, negEmoticonList = load_emoticons(emoticons_filename)

preds4 = getting_sentiment_v4(raw_data.text,
                                tagged_tweets,
                                accepted_tags,
                                negList,
                                boosterWordList,
                                posEmoticonList,
                                negEmoticonList)

tr_pos3, tr_neg3, tr_neut3 = compute_scores(np.array(preds4)[:,2], raw_data.polarite.values)

print("Il y a {0:.2%} tweets positifs correctement détectés avec la version 4 de l'algorithme.".format(tr_pos3))
print("Il y a {0:.2%} tweets négatifs correctement détectés avec la version 4 de l'algorithme.".format(tr_neg3))
print("Il y a {0:.2%} tweets neutres correctement détectés avec la version 4 de l'algorithme.".format(tr_neut3))
print "Matrice de confusion:\n", confusion_matrix(raw_data.polarite.values, np.array(preds4)[:,2])
print "Accuracy: {:.2%}".format(accuracy_score(raw_data.polarite.values, np.array(preds4)[:,2]))

Il y a 84.07% tweets positifs correctement détectés avec la version 4 de l'algorithme.
Il y a 40.11% tweets négatifs correctement détectés avec la version 4 de l'algorithme.
Il y a 20.14% tweets neutres correctement détectés avec la version 4 de l'algorithme.
Matrice de confusion:
[[ 71  18  88]
 [ 23  28  88]
 [ 16  13 153]]
Accuracy: 50.60%
