Do sprawdzenia dokładności algorytmów wykorzystano dane pochodzą z https://www.kaggle.com/kazanova/sentiment140. Kolumna target zawiera etykiety, które świadczą o wydźwięku danego tweeta: 0 oznacza wydźwięk negatywny, 4 oznacza wydźwięk pozytywny.

In [133]:
import pandas as pd
import csv
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
import re

from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from afinn import Afinn
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [134]:
df = pd.read_csv('sentyment140.csv', encoding='latin-1', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

In [167]:
df

array([0, 4], dtype=int64)

In [136]:
# funkcja ktora wyczysci tekst ze zbednych znacznikow
def clean(text):
    #usunie wszystkie znaki niealfanumeryczne, oprócz alfabetu (az) i cyfr (0-9). Znak ^ oznacza z wyjątkiem.
    text_without_symbols = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z'’\t])|(\w+:\/\/\S+)"," ",text)
    regex_pattern = re.compile(pattern = "["
        #u"\U0001F600-\U0001F64F"  # emotikony
        u"\U0001F300-\U0001F5FF"  # symbole i piktogramy
        u"\U0001F680-\U0001F6FF"  # symbole transportu i mapy
        u"\U0001F1E0-\U0001F1FF"  # flagi
                           "]+", flags = re.UNICODE)
    # spacja zamiast emotikonow/symboli
    text_without_pattern = re.sub(regex_pattern, ' ', text_without_symbols)
    
    #usuwa linki
    link = re.compile(r'(https?://)?(www\.)?(\w+\.)?(\w+)(\.\w+)(/.+)?')
    text_without_pattern = re.sub(link, ' ', text_without_pattern)

    return text_without_pattern

In [137]:
df['CleanedText'] = df['text'].apply(clean)
df.head()

Unnamed: 0,target,id,date,flag,user,text,CleanedText
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww that's a bummer You shoulda got ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no it's not behaving at all i'm mad why a...


In [138]:
#df['Tokenize'] = df['CleanedText'].apply(word_tokenize)
df['Tokenize'] = df['CleanedText'].str.split()

In [139]:
df['Tokenize']

0          [Awww, that's, a, bummer, You, shoulda, got, D...
1          [is, upset, that, he, can't, update, his, Face...
2          [I, dived, many, times, for, the, ball, Manage...
3          [my, whole, body, feels, itchy, and, like, its...
4          [no, it's, not, behaving, at, all, i'm, mad, w...
                                 ...                        
1599995    [Just, woke, up, Having, no, school, is, the, ...
1599996    [TheWDB, com, Very, cool, to, hear, old, Walt,...
1599997    [Are, you, ready, for, your, MoJo, Makeover, A...
1599998    [Happy, 38th, Birthday, to, my, boo, of, alll,...
1599999                              [happy, charitytuesday]
Name: Tokenize, Length: 1600000, dtype: object

In [140]:
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

In [141]:
df['POS_tagged'] = df['text'].apply(token_stop_pos)

In [142]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

df['Lemma'] = df['POS_tagged'].apply(lemmatize)

# Vader

In [143]:
#sentyment vader
analyzer = SentimentIntensityAnalyzer()

def get_vader_score(sentence): 
    compound = analyzer.polarity_scores(sentence)['compound']
    if compound >= 0: 
        return 4
    else: 
        return 0

In [144]:
df['Vader_lemma'] = df.apply(lambda x: get_vader_score(x['Lemma']), axis=1)

In [145]:
# główne metryki klasyfikacji
# wartość powinna być jak najbliższa 1.00

print(classification_report(df.dropna()["target"].values, df.dropna()["Vader_lemma"].values))

              precision    recall  f1-score   support

           0       0.81      0.40      0.53    800000
           4       0.60      0.90      0.72    800000

    accuracy                           0.65   1600000
   macro avg       0.70      0.65      0.63   1600000
weighted avg       0.70      0.65      0.63   1600000



# Afinn

In [146]:
af = Afinn()
df['Afinn_Sentiment_lemma'] = [af.score(i) for i in df['Lemma']]

def analysis(score):
    if score >= 0:
        return 4
    else:
        return 0

In [147]:
df["Afinn_lemma"] = df.apply(lambda x: analysis(x["Afinn_Sentiment_lemma"]), axis=1)

In [148]:
print(classification_report(df.dropna()["target"].values, df.dropna()["Afinn_lemma"].values))

              precision    recall  f1-score   support

           0       0.80      0.38      0.52    800000
           4       0.59      0.91      0.72    800000

    accuracy                           0.64   1600000
   macro avg       0.70      0.64      0.62   1600000
weighted avg       0.70      0.64      0.62   1600000



# TextBlob

In [149]:
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [150]:
df['Textblob_polarity_lemma'] = df['Lemma'].apply(getPolarity) 

In [151]:
df["Textblob_lemma"] = df.apply(lambda x: analysis(x["Textblob_polarity_lemma"]), axis=1)

In [152]:
print(classification_report(df.dropna()["target"].values, df.dropna()["Textblob_lemma"].values))

              precision    recall  f1-score   support

           0       0.76      0.29      0.42    800000
           4       0.56      0.90      0.69    800000

    accuracy                           0.60   1600000
   macro avg       0.66      0.60      0.56   1600000
weighted avg       0.66      0.60      0.56   1600000



# Przeprowadzenie analizy tylko za pomocą -> Clean + Tokenizacja + stopWords

In [153]:
stop_words = set(stopwords.words('english'))
df['Text_Ready'] = df['Tokenize'].apply(lambda x: [item for item in x if item not in stop_words])

In [154]:
df['Text_Ready']= df['Text_Ready'].apply(str)

In [155]:
df['Text_Ready']

0          ['Awww', "that's", 'bummer', 'You', 'shoulda',...
1          ['upset', "can't", 'update', 'Facebook', 'text...
2          ['I', 'dived', 'many', 'times', 'ball', 'Manag...
3          ['whole', 'body', 'feels', 'itchy', 'like', 'f...
4            ['behaving', "i'm", 'mad', 'I', "can't", 'see']
                                 ...                        
1599995    ['Just', 'woke', 'Having', 'school', 'best', '...
1599996    ['TheWDB', 'com', 'Very', 'cool', 'hear', 'old...
1599997    ['Are', 'ready', 'MoJo', 'Makeover', 'Ask', 'd...
1599998    ['Happy', '38th', 'Birthday', 'boo', 'alll', '...
1599999                          ['happy', 'charitytuesday']
Name: Text_Ready, Length: 1600000, dtype: object

In [156]:
df['Vader'] = df.apply(lambda x: get_vader_score(x['Text_Ready']), axis=1)

In [157]:
print(classification_report(df.dropna()["target"].values, df.dropna()["Vader"].values))

              precision    recall  f1-score   support

           0       0.80      0.39      0.52    800000
           4       0.60      0.90      0.72    800000

    accuracy                           0.65   1600000
   macro avg       0.70      0.65      0.62   1600000
weighted avg       0.70      0.65      0.62   1600000



In [158]:
df['Afinn_Sentiment'] = [af.score(i) for i in df['Text_Ready']]
df["Afinn"] = df.apply(lambda x: analysis(x["Afinn_Sentiment"]), axis=1)

In [159]:
print(classification_report(df.dropna()["target"].values, df.dropna()["Afinn"].values))

              precision    recall  f1-score   support

           0       0.80      0.39      0.52    800000
           4       0.60      0.90      0.72    800000

    accuracy                           0.65   1600000
   macro avg       0.70      0.65      0.62   1600000
weighted avg       0.70      0.65      0.62   1600000



In [160]:
df['Textblob_polarity'] = df['Text_Ready'].apply(getPolarity)
df["Textblob"] = df.apply(lambda x: analysis(x["Textblob_polarity"]), axis=1)

In [161]:
print(classification_report(df.dropna()["target"].values, df.dropna()["Textblob"].values))

              precision    recall  f1-score   support

           0       0.76      0.30      0.43    800000
           4       0.57      0.91      0.70    800000

    accuracy                           0.60   1600000
   macro avg       0.66      0.60      0.57   1600000
weighted avg       0.66      0.60      0.57   1600000



In [162]:
df

Unnamed: 0,target,id,date,flag,user,text,CleanedText,Tokenize,POS_tagged,Lemma,...,Afinn_Sentiment_lemma,Afinn_lemma,Textblob_polarity_lemma,Textblob_lemma,Text_Ready,Vader,Afinn_Sentiment,Afinn,Textblob_polarity,Textblob
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww that's a bummer You shoulda got ...,"[Awww, that's, a, bummer, You, shoulda, got, D...","[(@, a), (switchfoot, n), (http, n), (:, None)...",@ switchfoot http : //twitpic.com/2y1zl - Aw...,...,-2.0,0,0.216667,4,"['Awww', ""that's"", 'bummer', 'You', 'shoulda',...",0,-2.0,0,0.2000,4
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, Face...","[(upset, a), (ca, None), (n't, r), (update, v)...",upset ca n't update Facebook texting ... mig...,...,-5.0,0,0.000000,4,"['upset', ""can't"", 'update', 'Facebook', 'text...",0,-5.0,0,0.0000,4
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to ...,"[I, dived, many, times, for, the, ball, Manage...","[(@, n), (Kenichan, n), (I, None), (dived, v),...",@ Kenichan I dive many time ball . Managed s...,...,2.0,4,0.500000,4,"['I', 'dived', 'many', 'times', 'ball', 'Manag...",4,2.0,4,0.5000,4
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[(whole, a), (body, n), (feels, n), (itchy, v)...",whole body feel itchy like fire,...,-2.0,0,0.200000,4,"['whole', 'body', 'feels', 'itchy', 'like', 'f...",0,-2.0,0,0.2000,4
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no it's not behaving at all i'm mad why a...,"[no, it's, not, behaving, at, all, i'm, mad, w...","[(@, a), (nationwideclass, n), (,, None), ('s,...","@ nationwideclass , 's behave . 'm mad . ? I...",...,-3.0,0,-0.625000,0,"['behaving', ""i'm"", 'mad', 'I', ""can't"", 'see']",0,-3.0,0,-0.6250,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,Just woke up Having no school is the best fee...,"[Just, woke, up, Having, no, school, is, the, ...","[(Just, r), (woke, v), (., None), (Having, v),...",Just wake . Having school best feeling ever,...,4.0,4,1.000000,4,"['Just', 'woke', 'Having', 'school', 'best', '...",4,4.0,4,1.0000,4
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,TheWDB com Very cool to hear old Walt interv...,"[TheWDB, com, Very, cool, to, hear, old, Walt,...","[(TheWDB.com, n), (-, None), (Very, r), (cool,...",TheWDB.com - Very cool hear old Walt intervi...,...,1.0,4,0.290000,4,"['TheWDB', 'com', 'Very', 'cool', 'hear', 'old...",4,1.0,4,0.2775,4
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,Are you ready for your MoJo Makeover Ask me f...,"[Are, you, ready, for, your, MoJo, Makeover, A...","[(Are, n), (ready, a), (MoJo, n), (Makeover, n...",Are ready MoJo Makeover ? Ask detail,...,0.0,4,0.200000,4,"['Are', 'ready', 'MoJo', 'Makeover', 'Ask', 'd...",4,0.0,4,0.2000,4
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,Happy 38th Birthday to my boo of alll time ...,"[Happy, 38th, Birthday, to, my, boo, of, alll,...","[(Happy, a), (38th, None), (Birthday, n), (boo...",Happy 38th Birthday boo alll time ! ! ! Tupa...,...,3.0,4,1.000000,4,"['Happy', '38th', 'Birthday', 'boo', 'alll', '...",4,3.0,4,0.8000,4


# Bez czyszczenia danych

In [130]:
df['Vader1'] = df.apply(lambda x: get_vader_score(x['text']), axis=1)
print(classification_report(df.dropna()["target"].values, df.dropna()["Vader1"].values))

              precision    recall  f1-score   support

           0       0.81      0.43      0.56    800000
           4       0.61      0.90      0.73    800000

    accuracy                           0.67   1600000
   macro avg       0.71      0.67      0.65   1600000
weighted avg       0.71      0.67      0.65   1600000



In [131]:
df['Afinn_Sentiment1'] = [af.score(i) for i in df['text']]
df["Afinn1"] = df.apply(lambda x: analysis(x["Afinn_Sentiment1"]), axis=1)
print(classification_report(df.dropna()["target"].values, df.dropna()["Afinn1"].values))

              precision    recall  f1-score   support

           0       0.80      0.41      0.54    800000
           4       0.60      0.90      0.72    800000

    accuracy                           0.65   1600000
   macro avg       0.70      0.65      0.63   1600000
weighted avg       0.70      0.65      0.63   1600000



In [132]:
df['Textblob_polarity1'] = df['text'].apply(getPolarity)
df["Textblob1"] = df.apply(lambda x: analysis(x["Textblob_polarity1"]), axis=1)
print(classification_report(df.dropna()["target"].values, df.dropna()["Textblob1"].values))

              precision    recall  f1-score   support

           0       0.76      0.32      0.45    800000
           4       0.57      0.90      0.70    800000

    accuracy                           0.61   1600000
   macro avg       0.67      0.61      0.57   1600000
weighted avg       0.67      0.61      0.57   1600000

