## Steps

1. Let VADER program identify the sentiment of all the tweets in SentEval dev dataset.
2. Check VADER's accuracy against SentEval's
3. ~~Assuming accuracy is sufficient, figure out if/how to use TwitterScraper for getting topics.~~
4. Do some text cleaning on the tweets.
5. Check VADER's accuracy against SentEval's, once again.

In [None]:
# install stuff
pip install twitterscraper vaderSentiment

In [1]:
import pandas as pd
import numpy as np
import string
import glob
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from collections import Counter

%matplotlib inline

## #0 - Constructing DataFrame

In [2]:
# The two files moved to the GOLD/SHIT folder could not be used.
# Some entries are not properly tab separated and the tweets are merged together.

path = os.path.join('Datasets/semeval-datasets/2017_English_final/GOLD/Subtask_A')
txt_files = glob.glob(os.path.join(path, 'twitter*.txt'))
list_ = []

for file in txt_files:
    df = pd.read_csv(file, index_col=None, sep='\t', header=None,
                        names=['id', 'sentiment', 'text', 'to_delete'])
    
    list_.append(df.iloc[:, :-1])
    
df = pd.concat(list_)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,id,sentiment,text
41700,681877834982232064,neutral,@ShaquilleHoNeal from what I think you're aski...
41701,681879579129200640,positive,"Iran ranks 1st in liver surgeries, Allah bless..."
41702,681883903259357184,neutral,Hours before he arrived in Saudi Arabia on Tue...
41703,681904976860327936,negative,@VanityFair Alex Kim Kardashian worth how to ...
41704,681910549211287552,neutral,I guess even Pandora knows Justin Bieber is a ...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41705 entries, 0 to 41704
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         41705 non-null  int64 
 1   sentiment  41705 non-null  object
 2   text       41705 non-null  object
dtypes: int64(1), object(2)
memory usage: 977.6+ KB


In [4]:
df['token_length'] = [ len(x.split(" ")) for x in df.text ]
max(df.token_length)

53

In [5]:
def get_columns_index(df, column_types='all'):
    if column_types == 'all':
        return df.columns
    else:
        numeric_cols = df.select_dtypes(include=np.number).columns
        if column_types == 'numeric':
            return numeric_cols
        elif column_types == 'non-numeric':
            return pd.Index(set(df.columns).difference(set(numeric_cols)))
        else:
            return df.select_dtypes(include=column_types).columns

def print_missing_per_col(df, column_types='all'):
    def _print_missing_col(df, cols):
        print(f"Percent missing by column ({column_types})".center(32))
        for col in cols:
            p_missing = round(np.mean(df[col].isnull()), 5)
            print(f'{col} '.ljust(34, '.'), f'{p_missing*100}%')

    _print_missing_col(df, get_columns_index(df, column_types))

In [6]:
print_missing_per_col(df) # Verifying no missing data first

Percent missing by column (all) 
id ............................... 0.0%
sentiment ........................ 0.0%
text ............................. 0.0%
token_length ..................... 0.0%


##  #1/2 - vaderSentiment time

In [7]:
analyzer = SentimentIntensityAnalyzer()
vader_sentiments = []

for tweet in df['text']:
    sent = 'neutral'
    compound_score = analyzer.polarity_scores(tweet)['compound']
    if compound_score >= 0.05:
        sent = 'positive'
    elif compound_score <= -0.05:
        sent = 'negative'
    vader_sentiments.append(sent)
    
df['vader_sentiment'] = vader_sentiments

### Checking VaderSentiment's sentiment classification vs SemEval's

In [8]:
vader_sentiment_matches = []
for index, row in df.iterrows():
    match = 1 if row['sentiment'] == row['vader_sentiment'] else 0
    vader_sentiment_matches.append(match)
df['vader_sentiment_match'] = vader_sentiment_matches

In [9]:
n_matched_rows = df[df['vader_sentiment_match'] == 1].count()['vader_sentiment_match']
print(f"Matched rows:     {n_matched_rows}")
print(f"Percentage match: {(n_matched_rows/df.shape[0])*100}%")

Matched rows:     23332
Percentage match: 55.94533029612756%


#### Without any cleaning of the SemEval tweets fed to VADER, only 56% of VaderSentiment's labels match SemEval's

In [10]:
df.shape

(41705, 6)

## #4 - Text Cleaning

1. Replace ’(apostrophes) with ' (single quotes)
2. Remove square brackets, links, punctuation
3. Remove @mentions
4. Remove the unicode bullshit
5. Tokenize
6. Replace contractions with uncontracted forms
7. Lemmatize

In [11]:
def print_title(title):
    print(f"{title}", end="\n"+"-"*44+"\n")

def regex_sub(df, text_col, regex, substitute=''):
    r = re.compile(regex)
    for i, text in enumerate(df[text_col]):
        df[text_col].iloc[i] = re.sub(r, substitute, text)

### #1.

In [12]:
#1. Replace ’(apostrophes) with ' (single quotes)
regex_sub(df, 'text', "’", "'")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col].iloc[i] = re.sub(r, substitute, text)


KeyboardInterrupt: 

In [138]:
# Verify since above warning
df[df['text'].str.contains("’")][:3] # Checks out

Unnamed: 0,id,sentiment,text,vader_sentiment,vader_sentiment_match


###  #2.

In [139]:
#2. Remove square brackets, links, punctuation -- takes a long time ;P

# Remove square brackets
regex_sub(df, 'text', r"\W*\[(.*?)\]")

# Remove links
url_regex = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
regex_sub(df, 'text', url_regex)

# Remove punctuation
df['text'] = df['text'].str.replace(r"[^\w\s]",'')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col].iloc[i] = re.sub(r, substitute, text)
  df['text'] = df['text'].str.replace(r"[^\w\s]",'')


###  #3.

In [141]:
#3. Remove @mentions
regex_sub(df, 'text', r"\B@\S+")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col].iloc[i] = re.sub(r, substitute, text)


### #4.

In [142]:
df[df['text'].str.contains('Gas by my house hit')]

Unnamed: 0,id,sentiment,text,vader_sentiment,vader_sentiment_match
5193,264183816548130816,positive,Gas by my house hit 339 Iu2019m going to Chape...,positive,1


In [143]:
regex_sub(df, 'text', r"\\u\S+")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col].iloc[i] = re.sub(r, substitute, text)


In [147]:
# Check for a specific unicode char I saw in a tweet
t = df[df['text'].str.contains('He should have started his speech with')]['text']
print(t)

20    Gary Ablett wins the AFLPA MVP for the 4th tim...
Name: text, dtype: object


### #5.

In [148]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
def tokenize(text, remove_punctuation=True, preserve_case=True):
    text = regex_sub_str(text, r"[^\w\s]", '') if remove_punctuation else text
    if preserve_case:
        return [ word for word in tokenizer.tokenize(text) ]
    else:
        return [ word.lower() for word in tokenizer.tokenize(text) ]
    
def tokenize_column(df, text_col, remove_punctuation=True, preserve_case=True):
    tokens_list = []
    for text in df[text_col]:
        tokens = tokenize(text, remove_punctuation, preserve_case)
        tokens_list.append(tokens)
    return tokens_list

In [149]:
# Tokenize
df['text_tokenized'] = tokenize_column(df, 'text', remove_punctuation=False) # Already removed

### #6.

In [150]:
contractions_map = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" } 

In [152]:
contractions_keys = contractions_map.keys()

def replace_contractions(tokens):
    return [ contractions_map[t] if t in contractions_keys else t for t in tokens ]
    
def replace_contractions_column(df, tokens_col):
    return [ replace_contractions(tokens) for tokens in df[tokens_col] ]

In [154]:
# Replace contractions with uncontracted forms
df['text_tokenized'] = replace_contractions_column(df, 'text_tokenized')

###  #7.

In [161]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    return [ wordnet_lemmatizer.lemmatize(token) for token in tokens ]

def lemmatize_tokens_column(df, tokens_col):
    return [ lemmatize_tokens(tokens) for tokens in df[tokens_col] ]

In [162]:
#5. Lemmatize
df['text_tokenized'] = lemmatize_tokens_column(df, 'text_tokenized')

## # 5. - Re-checking VADER labels vs. SemEval labels

In [163]:
# Redo vader sentiment
vader_sentiments.clear()

for tweet in df['text']:
    sent = 'neutral'
    compound_score = analyzer.polarity_scores(tweet)['compound']
    if compound_score >= 0.05:
        sent = 'positive'
    elif compound_score <= -0.05:
        sent = 'negative'
    vader_sentiments.append(sent)
    
df['vader_sentiment'] = vader_sentiments

In [164]:
vader_sentiment_matches = []
for index, row in df.iterrows():
    match = 1 if row['sentiment'] == row['vader_sentiment'] else 0
    vader_sentiment_matches.append(match)
df['vader_sentiment_match'] = vader_sentiment_matches

In [165]:
n_matched_rows = df[df['vader_sentiment_match'] == 1].count()['vader_sentiment_match']
print(f"Matched rows:     {n_matched_rows}")
print(f"Percentage match: {(n_matched_rows/df.shape[0])*100}%")

Matched rows:     22925
Percentage match: 54.969428126123965%
