<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/02_vaderSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so the CSV files read further down under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q torch==1.4.0, torchvision

In [None]:
# Clone the neuspell GitHub repository and install the package from the local checkout.
# neuspell is used in this notebook to explore spell checking capabilities.
!git clone https://github.com/neuspell/neuspell
!pip install /content/neuspell/.

In [None]:
# Install the dependencies listed in the requirements file of the cloned neuspell checkout.
!pip install -r /content/neuspell/requirements.txt

In [None]:
import nltk
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from neuspell import BertChecker

# Download the NLTK resources needed below: the VADER lexicon (for SentimentIntensityAnalyzer)
# and the punkt sentence tokenizer models (for sent_tokenize).
nltk.download('vader_lexicon')
nltk.download('punkt')

from sklearn.metrics import confusion_matrix


In [None]:
# Print the installed torch version to confirm which build the pip cell above resolved.
import torch
print(torch.__version__)

In [None]:
# Allow pandas to render up to 1000 rows before truncating a displayed DataFrame.
pd.set_option('display.max_rows', 1000)

In [None]:
# Load the tweet dataset that was written by the 01_Data_Cleaning_With_Spacy notebook.
filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_spacy_clean.csv"

# The two columns this notebook does not use are dropped as part of the load.
cleaned_tweet_df = pd.read_csv(filename).drop(columns=['tweet_emoji_cleaned', 'Fully_Clean_Tweet_Tokenized'])

# Preview the first five rows of the DataFrame.
cleaned_tweet_df.head()

Unnamed: 0,label,tweet,Clean_Tweet
0,0,@user when a father is dysfunctional and is s...,father dysfunctional significant selfish pron ...
1,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit use cause pron offer wheelc...
2,0,bihday your majesty,bihday pron majesty
3,0,#model i love u take with u all the time in ...,#model love pron pron time pron happy love hap...
4,0,factsguide: society now #motivation,factsguide society #motivation


In [None]:
# Load the tweet dataset written by the 00_Emoji_Data_Cleaning notebook. Emojis have been cleaned, but the rest of
# each tweet has not been preprocessed yet. The first CSV column becomes the DataFrame index.
filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/train_test_data/train_twitter_sentiment.csv"
tweet_df = pd.read_csv(filename, index_col=0)

# Keep an independent copy of the freshly loaded data for later use (DataFrame.copy is deep by default).
original_tweet_df = tweet_df.copy()

# Preview the first five rows.
tweet_df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


# Part 1 - Set up vaderSentiment and create a baseline classifier.

In [None]:
# Initialize the vaderSentiment SentimentIntensityAnalyzer.
# The functions defined below use this object as the default value of their sentiment_analyzer parameter,
# so this cell has to run before those function definitions.
sentiment_analyzer = SentimentIntensityAnalyzer()

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# Inputs: A given dataframe, sentiment analyzer (default defined at start of notebook), and column name within the dataframe (default is 'tweet'). 
# Output: The input dataframe with a new 'Positive_Sentiment_Score' column added. This new column contains the "positive" portion of the
# vaderSentiment polarity score for the associated item in the tweet column.
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_positive_score_column(df, sentiment_analyzer=sentiment_analyzer, text_to_score_column='tweet'):
  """Add a 'Positive_Sentiment_Score' column holding the 'pos' polarity score of each text.

  Args:
    df: DataFrame containing the text to score. The column is added to this object in place.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with a 'pos' entry.
    text_to_score_column: Name of the column in df whose values are scored.

  Returns:
    The same DataFrame object that was passed in, now containing the new column.
  """
  def positive_score(text):
    return sentiment_analyzer.polarity_scores(text)['pos']

  df['Positive_Sentiment_Score'] = df[text_to_score_column].apply(positive_score)

  return df

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# Inputs: A given dataframe, sentiment analyzer (default defined at start of notebook), and column name within the dataframe (default is 'tweet'). 
# Output: The input dataframe with a new 'Negative_Sentiment_Score' column added. This new column contains the "negative" portion of the
# vaderSentiment polarity score for the associated item in the tweet column.
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_negative_score_column(df, sentiment_analyzer=sentiment_analyzer, text_to_score_column='tweet'):
  """Add a 'Negative_Sentiment_Score' column holding the 'neg' polarity score of each text.

  Args:
    df: DataFrame containing the text to score. The column is added to this object in place.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with a 'neg' entry.
    text_to_score_column: Name of the column in df whose values are scored.

  Returns:
    The same DataFrame object that was passed in, now containing the new column.
  """
  def negative_score(text):
    return sentiment_analyzer.polarity_scores(text)['neg']

  df['Negative_Sentiment_Score'] = df[text_to_score_column].apply(negative_score)

  return df

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# Inputs: A given dataframe, sentiment analyzer (default defined at start of notebook), and column name within the dataframe (default is 'tweet'). 
# Output: The input dataframe with a new 'Neutral_Sentiment_Score' column added. This new column contains the "neutral" portion of the
# vaderSentiment polarity score for the associated item in the tweet column.
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_neutral_score_column(df, sentiment_analyzer=sentiment_analyzer, text_to_score_column='tweet'):
  """Add a 'Neutral_Sentiment_Score' column holding the 'neu' polarity score of each text.

  Args:
    df: DataFrame containing the text to score. The column is added to this object in place.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with a 'neu' entry.
    text_to_score_column: Name of the column in df whose values are scored.

  Returns:
    The same DataFrame object that was passed in, now containing the new column.
  """
  def neutral_score(text):
    return sentiment_analyzer.polarity_scores(text)['neu']

  df['Neutral_Sentiment_Score'] = df[text_to_score_column].apply(neutral_score)

  return df

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# Inputs: A given dataframe, sentiment analyzer (default defined at start of notebook), and column name within the dataframe (default is 'tweet'). 
# Output: The input dataframe with a new 'Compound_Sentiment_Score' column added. This new column contains the "compound" portion of the
# vaderSentiment polarity score for the associated item in the tweet column.
#
# Note: If a single metric is needed, compound score is the one to use. This is a measurement of sentiment intensity, normalized to
# between -1 (most negative) to +1 (most positive).
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_compound_score_column(df, sentiment_analyzer=sentiment_analyzer, text_to_score_column='tweet'):
  """Add a 'Compound_Sentiment_Score' column holding the 'compound' polarity score of each text.

  Args:
    df: DataFrame containing the text to score. The column is added to this object in place.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with a 'compound' entry.
    text_to_score_column: Name of the column in df whose values are scored.

  Returns:
    The same DataFrame object that was passed in, now containing the new column.
  """
  def compound_score(text):
    return sentiment_analyzer.polarity_scores(text)['compound']

  df['Compound_Sentiment_Score'] = df[text_to_score_column].apply(compound_score)

  return df

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# If all of the vaderSentiment score types are desired, this function uses the four above functions to add each score type to the 
# dataframe as its own column.
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_sentiment_score_columns(df, sentiment_analyzer=sentiment_analyzer, text_to_score_column='tweet'): 
  """Add the positive, negative, neutral and compound vaderSentiment score columns to df.

  Each text is scored exactly once: a single polarity_scores call already returns all four
  values, so they are fanned out into the four columns instead of re-scoring the text per column.

  Args:
    df: DataFrame containing the text to score. The columns are added to this object in place.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with 'pos', 'neg',
      'neu' and 'compound' entries.
    text_to_score_column: Name of the column in df whose values are scored.

  Returns:
    The same DataFrame object that was passed in, with 'Positive_Sentiment_Score',
    'Negative_Sentiment_Score', 'Neutral_Sentiment_Score' and 'Compound_Sentiment_Score'
    added (or overwritten if they already exist).
  """
  # One scoring pass over the text column; the result is one score dict per row, in row order.
  all_scores = [sentiment_analyzer.polarity_scores(text) for text in df[text_to_score_column]]

  # (column to create, key in the polarity score dict), in the order the columns are added.
  score_columns = (('Positive_Sentiment_Score', 'pos'),
                   ('Negative_Sentiment_Score', 'neg'),
                   ('Neutral_Sentiment_Score', 'neu'),
                   ('Compound_Sentiment_Score', 'compound'))

  for column_name, score_key in score_columns:
    df[column_name] = [scores[score_key] for scores in all_scores]

  return df

In [None]:
# Takes in a single tweet string and returns a list of sentence tokens.
def tokenize_tweet(tweet): 
  """Split a single tweet string into a list of sentence strings with NLTK's sent_tokenize."""
  return sent_tokenize(tweet)

# Applies the function above to every row of the specified DataFrame. Creates a new column containing a list
# of sentence tokens for the associated tweet.
def add_sentence_tokens_column(df, tweet_text_column='tweet', tokenized_tweet_column='Tokenized_Tweet'):
  """Add a column containing, for each tweet, its list of sentence tokens.

  Args:
    df: DataFrame holding the tweets. The new column is added to this object in place.
    tweet_text_column: Name of the column containing the tweet text.
    tokenized_tweet_column: Name of the column to create.

  Returns:
    The same DataFrame object that was passed in, now containing the new column.
  """
  sentence_tokens = df[tweet_text_column].apply(tokenize_tweet)
  df[tokenized_tweet_column] = sentence_tokens

  return df

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
# This is a helper function used by the sentence_analysis_score function when calculating the average value of one or more of the vader sentiment scores.
# It returns the number of scores being averaged provided that at least 1 score exists. If an attempt is made to average 0 items, this function returns
# 1 to avoid a divide by zero condition. 
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
def get_denominator(score_type, score_dict):
  """Return the divisor to use when averaging the scores of the given type.

  Args:
    score_type: One of 'compound', 'pos', 'neg' or 'neu'.
    score_dict: Dict with 'Compound', 'Positive', 'Negative' and 'Neutral' lists of scores.

  Returns:
    The number of scores stored for score_type, or 1 when that list is empty (so the caller
    never divides by zero). An unrecognised score_type also yields 1.
  """
  # (score type, name of the matching list in score_dict)
  list_names = (('compound', 'Compound'),
                ('pos', 'Positive'),
                ('neg', 'Negative'),
                ('neu', 'Neutral'))

  for known_type, list_name in list_names:
    if score_type == known_type:
      # Never go below 1, which covers the empty-list case.
      return max(len(score_dict[list_name]), 1)

  return 1

In [None]:
# Takes in a list of sentence tokens and returns the average of the requested vader score (score_type: 'compound', 'pos', 'neg' or 'neu') across the tokens in the list.
def sentence_analysis_score(tweet_sentence_tokens, sentiment_analyzer=sentiment_analyzer, score_type='compound'):
  """Average one vaderSentiment score over the sentences of a tweet.

  Args:
    tweet_sentence_tokens: Iterable of sentence strings belonging to one tweet.
    sentiment_analyzer: Object whose polarity_scores(text) returns a dict with 'pos', 'neg',
      'neu' and 'compound' entries.
    score_type: Which score to average: 'compound', 'pos', 'neg' or 'neu'.

  Returns:
    The average of the requested score over all sentences (the sum divided by 1 when there are
    no sentences), or None when score_type is not one of the four recognised values.
  """
  # (polarity_scores key, name of the list that collects it)
  key_to_list_name = (('pos', 'Positive'),
                      ('neg', 'Negative'),
                      ('neu', 'Neutral'),
                      ('compound', 'Compound'))

  score_dict = {list_name: [] for _, list_name in key_to_list_name}

  # Score every sentence once and file each of its four values under the matching list.
  for sentence in tweet_sentence_tokens:
    sentence_scores = sentiment_analyzer.polarity_scores(sentence)
    for score_key, list_name in key_to_list_name:
      score_dict[list_name].append(sentence_scores[score_key])

  denominator = get_denominator(score_type, score_dict)

  for score_key, list_name in key_to_list_name:
    if score_type == score_key:
      return sum(score_dict[list_name]) / denominator

  return None


In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Creates a new column in the dataframe where the values are the compound score for the associated tweet, calculated by first breaking the tweet up into
# sentence level tokens, and then average the score for each token.
#---------------------------------------------------------------------------------------------------------------------------------------------------------------
def add_sentence_level_compound_score_column(df, sentiment_analyzer=sentiment_analyzer, tweet_text_column='tweet', text_to_score_column='Tokenized_Tweet', keep_subscores=False,
                                             sub_scores=None):
  """Add sentence-level (per-sentence averaged) vaderSentiment score column(s) to df.

  Each tweet is scored sentence by sentence and the per-sentence scores are averaged
  (see sentence_analysis_score).

  Args:
    df: DataFrame holding the tweets. Columns are added to this object in place.
    sentiment_analyzer: Analyzer passed through to sentence_analysis_score.
    tweet_text_column: Column with the raw tweet text. Only used when text_to_score_column is
      missing and sentence tokens have to be generated.
    text_to_score_column: Column holding the list of sentence tokens per tweet. When df does not
      have it, it is created temporarily and removed again before returning.
    keep_subscores: When falsy, only 'Sentence_Level_Compound_Score' is added. When truthy, one
      column per entry of sub_scores is added, named 'Sentence_Level_<score>_Score' (so the
      compound column is then 'Sentence_Level_compound_Score', with a lowercase score name).
    sub_scores: List drawn from 'pos', 'neg', 'neu', 'compound'. If it is not a list, or contains
      an unknown entry, all four scores are used.

  Returns:
    The same DataFrame object that was passed in, with the score column(s) added. A DataFrame is
    always returned; keep_subscores is interpreted by truthiness, so a non-boolean value can no
    longer fall through every branch.
  """
  # Sub-scores a caller may request.
  valid_subscores = ['pos', 'neg', 'neu', 'compound']

  # If the sentence tokens are not in the frame yet, build them as a temporary working column.
  tokens_are_temporary = text_to_score_column not in df.columns
  if tokens_are_temporary:
    df = add_sentence_tokens_column(df, tweet_text_column=tweet_text_column, tokenized_tweet_column=text_to_score_column)

  if keep_subscores:

    # Fall back to every sub-score when the request is missing or invalid.
    if not isinstance(sub_scores, list) or not all(score in valid_subscores for score in sub_scores):
      sub_scores = valid_subscores

    for score in sub_scores:
      column_name = 'Sentence_Level_' + score + "_Score"
      df[column_name] = df[text_to_score_column].apply(sentence_analysis_score, args=(sentiment_analyzer, score))

  else:

    df['Sentence_Level_Compound_Score'] = df[text_to_score_column].apply(sentence_analysis_score, args=(sentiment_analyzer, 'compound'))

  # The token column was not part of df when the function was called, so remove it again.
  if tokens_are_temporary:
    df.drop(columns=[text_to_score_column], inplace=True)

  return df

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# This function uses the compound sentiment score generated by vaderSentiment to classify tweets as either "hate speech" or "not hate speech".
#
# Inputs: A dataframe containing the tweets that need to be classified, as well as optional parameters for the sentiment analyzer, tweet column,
# threshold, and sentence level analysis.
#
# Output: The input dataframe with a new column added. This new column will be 1 (indicating contains hate speech) for rows where the associated
# tweet had a compound sentiment score less than threshold (tweet was deemed more negative than the threshold value), and 0 (does not contain hate speech) 
# for rows where the associated tweet had a compound sentiment score higher than threshold (tweet more positive than the threshold setting).
# 
# Note: The sentence_level_analysis parameter determines how the compound score is calculated. If this parameter is set to True, vader will
# calculate a compound score for each sentence in the tweet individually, and the average of those scores is compared against the threshold. If
# this is set to False, then vader will calculate a single compound score for the entire tweet and no averaging is required.
#
# Note: The threshold parameter sets the decision boundary for hate speech classification. Tweets with a compound score at or above the decision
# boundary are not classified as hate speech, while tweets below the decision boundary are classified as hate speech.
#
# Note: This function may need to create intermediate columns in the Dataframe to perform calculations, however it is designed to return the
# DataFrame with only the columns it had when the function was called, with the exception of the prediction column being added (intermediate columns
# are removed before the function returns). 
# 
#---------------------------------------------------------------------------------------------------------------------------------------------------
def add_classification_predictions_column(df, sentiment_analyzer=sentiment_analyzer, text_column='tweet', threshold=0.0,
                                          sentence_level_analysis = False): 
  """Add a 0/1 hate-speech prediction column derived from the vaderSentiment compound score.

  A row is predicted 1 when its compound score is strictly below threshold, otherwise 0.

  Args:
    df: DataFrame holding the tweets. The prediction column is added to this object in place.
    sentiment_analyzer: Analyzer used if the compound score column still has to be computed.
    text_column: Column with the tweet text, used only when the score column has to be computed.
    threshold: Decision boundary applied to the compound score.
    sentence_level_analysis: False -> use 'Compound_Sentiment_Score' and write 'Predicted_Class'.
      True -> use 'Sentence_Level_Compound_Score' and write 'Predicted_Class_Sentence_Level'.

  Returns:
    The same DataFrame with the prediction column added. If the score column already exists it
    is reused as-is (text_column is then ignored); otherwise it is computed, used and dropped
    again. Returns None if sentence_level_analysis compares equal to neither False nor True.
  """
  if sentence_level_analysis == False:
    use_sentence_level = False
    score_column = 'Compound_Sentiment_Score'
    prediction_column = 'Predicted_Class'
  elif sentence_level_analysis == True:
    use_sentence_level = True
    score_column = 'Sentence_Level_Compound_Score'
    prediction_column = 'Predicted_Class_Sentence_Level'
  else:
    # No mode matched: nothing is added and no frame is returned.
    return None

  # Compute the score column only when the frame does not carry it already.
  score_is_temporary = score_column not in list(df.columns)

  if score_is_temporary:
    if use_sentence_level:
      df = add_sentence_level_compound_score_column(df, sentiment_analyzer, text_column)
    else:
      df = add_compound_score_column(df, sentiment_analyzer, text_column)

  def classify(score):
    return 1 if score < threshold else 0

  df[prediction_column] = df[score_column].apply(classify)

  # A score column created here is an intermediate result, so it is removed again.
  if score_is_temporary:
    df.drop(columns=[score_column], inplace=True)

  return df

In [None]:
# This function calculates sensitivity with the formula TP/(TP + FN)
def calculate_sensitiviy(true_positives, false_negatives): 
  """Return sensitivity (recall / true positive rate): TP / (TP + FN)."""
  actual_positives = true_positives + false_negatives
  return true_positives / actual_positives

# This function calculates specificity with the formula TN/(TN + FP)
def calculate_specificity(true_negatives, false_positives):
  """Return specificity (true negative rate): TN / (TN + FP)."""
  actual_negatives = true_negatives + false_positives
  return true_negatives / actual_negatives

# This function calculates precision with the formula TP/(TP + FP)
def calculate_precision(true_positives, false_positives): 
  """Return precision: TP / (TP + FP)."""
  predicted_positives = true_positives + false_positives
  return true_positives / predicted_positives

# This function calculates accuracy with the formula (TP + TN)/(TP + TN + FP + FN)
def calculate_accuracy(true_negatives, false_positives, false_negatives, true_positives): 
  """Return accuracy: (TP + TN) / (TP + TN + FP + FN)."""
  correct_predictions = true_positives + true_negatives
  all_predictions = correct_predictions + false_positives + false_negatives
  return correct_predictions / all_predictions

In [None]:
# This function receives a dictionary and four values (TN, FP, FN, TP) as inputs.
# This function simply adds the four input values to the input dictionary and returns the dictionary.
def add_confusion_params(input_dict, true_negatives, false_positives, false_negatives, true_positives): 
  """Store the four confusion-matrix counts in input_dict and return that same dict.

  The counts are written under the keys 'True_Negatives', 'False_Positives',
  'False_Negatives' and 'True_Positives' (in that order); existing entries with those
  keys are overwritten.
  """
  input_dict.update({'True_Negatives': true_negatives,
                     'False_Positives': false_positives,
                     'False_Negatives': false_negatives,
                     'True_Positives': true_positives})
  return input_dict

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------
# This function uses the helper functions above to create a dictionary of the classification metrics and their values. 
#
# This function is a helper function to the get_classification_metrics function.
#
# Inputs: The confusion matrix parameters (true negatives, false positives, false negatives, true positives) and a list of metric names
# to calculate (metrics_to_calculate). 
#
# Output: A dictionary containing calculated values for the desired metrics.
# 
# Note: If the parameter "verbose" is set to True additional information will be added to the output dictionary that shows, for each metric, 
# the value of all the intermediate metrics (TP, TN, etc.) that were used to calculate it.
#
# Note: The include_confusion_params parameter is set to True then separate entries will be added to the dictionary for each 
# confusion matrix parameter (TP, TN, FP, FN).
#
#---------------------------------------------------------------------------------------------------------------------------------------------------
def calculate_metrics(true_negatives, false_positives, false_negatives, true_positives, metrics_to_calculate, verbose, include_confusion_params): 
  """Build a dictionary of classification metrics from the confusion-matrix counts.

  Args:
    true_negatives, false_positives, false_negatives, true_positives: Confusion-matrix counts.
    metrics_to_calculate: Container of metric names; 'Sensitivity', 'Specificity', 'Precision'
      and 'Accuracy' are recognised, anything else is ignored.
    verbose: When truthy, each metric entry is a list: the metric value followed by one
      "<Count name>: <count>" string per count used to compute it. Otherwise the entry is
      just the value.
    include_confusion_params: When equal to True, the four raw counts are added as their own
      entries via add_confusion_params.

  Returns:
    Dict mapping each requested metric name to its value (or verbose list).
  """
  # Raw counts keyed by the label that appears in verbose entries.
  counts = {"True Positives": true_positives,
            "True Negatives": true_negatives,
            "False Positives": false_positives,
            "False Negatives": false_negatives}

  # (metric name, callable computing it, labels of the counts reported next to it), in output order.
  metric_recipes = [
    ('Sensitivity', lambda: calculate_sensitiviy(true_positives, false_negatives),
     ["True Positives", "False Negatives"]),
    ('Specificity', lambda: calculate_specificity(true_negatives, false_positives),
     ["True Negatives", "False Positives"]),
    ('Precision', lambda: calculate_precision(true_positives, false_positives),
     ["True Positives", "False Positives"]),
    ('Accuracy', lambda: calculate_accuracy(true_negatives, false_positives, false_negatives, true_positives),
     ["True Positives", "True Negatives", "False Positives", "False Negatives"]),
  ]

  metrics = {}

  for metric_name, compute_value, count_labels in metric_recipes:
    if metric_name not in metrics_to_calculate:
      continue

    value = compute_value()

    if verbose:
      metrics[metric_name] = [value] + [label + ": " + str(counts[label]) for label in count_labels]
    else:
      metrics[metric_name] = value

  if include_confusion_params == True:
    metrics = add_confusion_params(metrics, true_negatives, false_positives, false_negatives, true_positives)

  return metrics

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# This function uses the scikit-learn confusion matrix to calculate various classification metrics.
# 
# Required Inputs: A dataframe containing tweet text being classified, and the correct labels (hate speech or not) for each tweet.
# 
# Optional Inputs: 1) The name of the dataframe column containing the tweets text (if excluded it is assumed to be 'tweet')
#                  2) The name of the dataframe column containing the ground truth labels (if excluded it is assumed to be 'label')
#                  3) A list of metrics to calculate. (if excluded, the list is assumed to be Sensitivity, Specificity, Precision, and Accuracy)
#                  4) The sentiment intensity analyzer to use (if excluded, the one defined at the start of this notebook is used).
#                  5) What the classification decision boundary should be (if excluded, it is assumed to be zero, meaning compound scores of zero or above are not hate speech).
#                  6) If the vader compound score should be calculated sentence by sentence then averaged, or once for the entire tweet (if excluded, assumes once for the tweet).
#                  7) Whether or not the confusion matrix parameters themselves (TP, TN, FP, FN) need their own dictionary entries. (if excluded, assumes no).
#
# Outputs: A dictionary containing values for the desired metrics.
#
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def get_classification_metrics(df, tweet_text_column='tweet', ground_truth_column_name='label', metrics_list = None, verbose = False,
                               sentiment_analyzer=sentiment_analyzer, threshold=0.0, sentence_level_analysis=False, include_confusion_params=False):
  """Compute classification metrics for the vaderSentiment-based hate speech predictions.

  Args:
    df: DataFrame with the tweet text and the ground truth labels (0 = not hate speech,
      1 = hate speech, matching the 0/1 values written to the prediction column).
    tweet_text_column: Column with the tweet text, used only when predictions must be computed.
    ground_truth_column_name: Column with the correct labels.
    metrics_list: List of metric names to calculate. Anything that is not a list selects
      Sensitivity, Specificity, Precision and Accuracy.
    verbose: Passed to calculate_metrics (adds the counts behind each metric).
    sentiment_analyzer: Analyzer used when predictions must be computed.
    threshold: Decision boundary used when predictions must be computed.
    sentence_level_analysis: False -> use 'Predicted_Class'; otherwise use
      'Predicted_Class_Sentence_Level'.
    include_confusion_params: Passed to calculate_metrics (adds TN/FP/FN/TP entries).

  Returns:
    The dictionary produced by calculate_metrics.

  Note:
    If df already contains the prediction column it is used as-is, which means
    tweet_text_column, threshold and sentiment_analyzer have no effect in that case. Otherwise
    the column is created on df, used, and dropped again before returning.
  """
  if metrics_list is None or type(metrics_list) != list:
    metrics_list = ['Sensitivity', 'Specificity', 'Precision', 'Accuracy']

  if sentence_level_analysis == False:
    prediction_column_name = 'Predicted_Class'
  else:
    prediction_column_name = 'Predicted_Class_Sentence_Level'

  # Only compute (and later remove) predictions when the frame does not carry them already.
  predictions_are_temporary = prediction_column_name not in df.columns

  if predictions_are_temporary:
    df = add_classification_predictions_column(df, sentiment_analyzer=sentiment_analyzer, text_column=tweet_text_column, threshold=threshold, sentence_level_analysis=sentence_level_analysis)

  # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking also works when the labels and
  # predictions in df happen to contain only one of the two classes.
  tn, fp, fn, tp = confusion_matrix(df[ground_truth_column_name], df[prediction_column_name], labels=[0, 1]).ravel()

  classification_metrics = calculate_metrics(tn, fp, fn, tp, metrics_list, verbose, include_confusion_params)

  if predictions_are_temporary:
    df.drop(columns=[prediction_column_name], inplace = True)

  return classification_metrics


In [None]:
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------
# This cell displays the classification metrics for the case where vaderSentiment is used to classify the tweets as "containing hate speech" or
# "not containing hate speech" without any preprocessing applied to the tweets. This can be used as a baseline to see if we can improve vaderSentiment's
# ability to perform this type of classification by applying various data cleaning steps.
#
# ----------------------------------------------------------------
# Review of metric definitions in the context of this problem
# ----------------------------------------------------------------
# Accuracy: The proportion of tweets that were classified correctly.
#
# Precision: Out of all the tweets the algorithm claimed contained hate speech, how many were correct? 
# 
# Sensitivity (i.e. True positive rate or recall): Out of all the tweets that actually did contain hate speech, what percentage were correctly classified?
#
# Specificity (i.e. True negative rate or selectivity): Out of all the tweets that did not contain hate speech, what percentage were correctly classified?
# 
# -------------------------
# Reviewing Scores
# -------------------------
#
# Precision = 0.14. This is a very low precision score, which tells us that we are getting a large number of false positives. This could be due 
# to the fact that we currently are using the compound_score=0 threshold to determine if a tweet contains hate speech or not. The vaderSentiment compound
# score was designed to take values between -1 and +1, where negative values indicate a "negative sentiment" and positive values indicate a "positive sentiment".
# We can interpret this score to say that there are a lot of tweets with "negative sentiment" that are not actually hate speech. 
#
# Sensitivity = 0.39. This score tells us that out of all the tweets that contained hate speech, we correctly classified only 39% of them. Since in this
# first simple model we classified all tweets with a negative compound score as being hate speech, this result is a bit surprising. It is saying that only
# 39% of the tweets that are hate speech actually have a negative compound score. Precision told us that there are many tweets with negative sentiment (according
# to vader) that are not hate speech, and sensitivity tells us that there are many tweets with neutral or positive sentiment (according to vader) that actually are
# hate speech.
#
# Specificity = 0.82. This score tells us that out of all the tweets that do not contain hate speech, we correctly classified 82% of them. In the context of the
# model we created, this says that 82% of tweets that do not contain hate speech have a non-negative (zero or positive) compound polarity score according to vader.
#
# Accuracy = 0.79. 79% accuracy is not terrible for a first basic attempt, but I think this value is a little bit misleading and it should not give us
# false confidence. In the metrics above we showed that when this model thought something was hate speech, it was correct only 14% of the time, and when something
# actually was hate speech it was correct only 39% of the time. This tells us that we have a significant number of false positives (5160) and 
# false negatives (1357). The fact that the classes in our dataset are greatly uneven (29720 examples of non-hate speech, compared to 2242 examples of hate
# speech) is likely making this score higher than it would be if the classes were balanced.
#
# Note: Our model over-classifies things as hate speech. If the purpose of the model is to filter down a long list of tweets so a human can then hand review
# tweets that are likely to contain hate speech, then ensuring all hate speech tweets are correctly classified (even at the expense of misclassifying some
# non hate speech tweets) may be the desired balance. (A larger set needs to be reviewed by the human but you don't miss the chance to remove a hateful person
# from your platform).
# 
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------


# tweet_df has no prediction column at this point, so get_classification_metrics adds one temporarily
# and removes it again before returning.
metrics = get_classification_metrics(tweet_df, verbose=True)

metrics

{'Accuracy': [0.796101620674551,
  'True Positives: 885',
  'True Negatives: 24560',
  'False Positives: 5160',
  'False Negatives: 1357'],
 'Precision': [0.14640198511166252,
  'True Positives: 885',
  'False Positives: 5160'],
 'Sensitivity': [0.39473684210526316,
  'True Positives: 885',
  'False Negatives: 1357'],
 'Specificity': [0.8263795423956931,
  'True Negatives: 24560',
  'False Positives: 5160']}

In [None]:
# Base sentiment scores with no data cleaning steps applied. 
# The score columns and 'Predicted_Class' added here stay in tweet_df. add_classification_predictions_column and
# get_classification_metrics reuse those columns when they already exist instead of recomputing them.
tweet_df = add_sentiment_score_columns(tweet_df, sentiment_analyzer)
tweet_df = add_classification_predictions_column(tweet_df)
tweet_df.head()

Unnamed: 0_level_0,label,tweet,Positive_Sentiment_Score,Negative_Sentiment_Score,Neutral_Sentiment_Score,Compound_Sentiment_Score,Predicted_Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,@user when a father is dysfunctional and is s...,0.0,0.385,0.615,-0.8296,1
2,0,@user @user thanks for #lyft credit i can't us...,0.256,0.0,0.744,0.6705,0
3,0,bihday your majesty,0.0,0.0,1.0,0.0,0
4,0,#model i love u take with u all the time in ...,0.337,0.0,0.663,0.7249,0
5,0,factsguide: society now #motivation,0.0,0.0,1.0,0.0,0


# Part 2 - Explore how data cleaning increases (or decreases) vaderSentiment's effectiveness.

## Explore how the vader sentiment score changes when twitter handles are removed prior to sentiment analysis.

In [None]:
def remove_handles(tweet):
  """Remove twitter handles ('@' followed by non-whitespace characters) from a tweet.

  After the handles are removed, trailing whitespace is stripped and every run of spaces is
  collapsed into a single space. Leading whitespace is not stripped, so a tweet that started
  with a handle keeps one leading space.

  Args:
    tweet: The tweet text.

  Returns:
    The tweet text without handles.
  """
  # Raw string: '\S' must reach the regex engine as-is instead of being treated as a
  # (invalid) string escape sequence. '\S+' is the same character class as '[^\s]+'.
  twitter_handle_removed = re.sub(r'@\S+', '', tweet).rstrip()
  twitter_handle_removed = re.sub(r" +", " ", twitter_handle_removed)

  return twitter_handle_removed

def remove_twitter_handles(df, tweet_text_column='tweet', no_handles_column='Tweet_Without_Handles'):
  """Add a column containing each tweet's text with the twitter handles removed.

  Args:
    df: DataFrame holding the tweets.
    tweet_text_column: Name of the column to read the tweet text from.
    no_handles_column: Name of the column that receives the cleaned text.

  Returns:
    The same DataFrame that was passed in, with ``no_handles_column`` added
    (or overwritten).
  """
  handle_free_text = df[tweet_text_column].apply(remove_handles)
  df[no_handles_column] = handle_free_text
  return df

In [None]:
# Add the 'Tweet_Without_Handles' column (the default output column of remove_twitter_handles),
# then compute the classification metrics with that column as the tweet text.
tweet_df = remove_twitter_handles(tweet_df)
metrics_without_handles = get_classification_metrics(tweet_df, tweet_text_column='Tweet_Without_Handles', verbose=True)

# Bare expression so the returned dictionary is rendered as the cell output.
metrics_without_handles

{'Accuracy': [0.796101620674551,
  'True Positives: 885',
  'True Negatives: 24560',
  'False Positives: 5160',
  'False Negatives: 1357'],
 'Precision': [0.14640198511166252,
  'True Positives: 885',
  'False Positives: 5160'],
 'Sensitivity': [0.39473684210526316,
  'True Positives: 885',
  'False Negatives: 1357'],
 'Specificity': [0.8263795423956931,
  'True Negatives: 24560',
  'False Positives: 5160']}

In [None]:
# Score the handle-free text and preview the first rows.
# NOTE(review): tweet_df is passed directly (no copy). If add_sentiment_score_columns writes its
# columns onto the frame it receives, without_handles_df and tweet_df are the same object and the
# earlier scores on tweet_df are overwritten — confirm against the helper's definition.
without_handles_df = add_sentiment_score_columns(tweet_df, text_to_score_column='Tweet_Without_Handles')
without_handles_df.head()

Unnamed: 0_level_0,label,tweet,Positive_Sentiment_Score,Negative_Sentiment_Score,Neutral_Sentiment_Score,Compound_Sentiment_Score,Predicted_Class,Tweet_Without_Handles
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,@user when a father is dysfunctional and is s...,0.0,0.402,0.598,-0.8296,1,when a father is dysfunctional and is so self...
2,0,@user @user thanks for #lyft credit i can't us...,0.282,0.0,0.718,0.6705,0,thanks for #lyft credit i can't use cause the...
3,0,bihday your majesty,0.0,0.0,1.0,0.0,0,bihday your majesty
4,0,#model i love u take with u all the time in ...,0.337,0.0,0.663,0.7249,0,#model i love u take with u all the time in ur...
5,0,factsguide: society now #motivation,0.0,0.0,1.0,0.0,0,factsguide: society now #motivation


## Explore how the vader sentiment score changes when websites are removed prior to sentiment analysis.

In [None]:
def remove_web_address(tweet):
  """Strip web addresses from a tweet.

  Two patterns are removed, each running up to the next whitespace character:
  text beginning at ``http`` and text beginning at a literal ``www.``.

  Args:
    tweet: The tweet text to clean.

  Returns:
    The tweet without web addresses, with runs of spaces collapsed to a single
    space and no leading or trailing whitespace.
  """

  tweet_without_website_http = re.sub(r"http\S+", "", tweet)

  # The dot is escaped so that only a literal "www." prefix matches. An unescaped
  # "." matches any character and would also delete words such as "wwwhat".
  tweet_without_website_www = re.sub(r"www\.\S+", "", tweet_without_website_http)

  tweet_without_website = re.sub(" +", " ", tweet_without_website_www)

  # Strip both ends: removing an address at the start of the tweet otherwise
  # leaves a dangling leading space behind.
  return tweet_without_website.strip()

def remove_websites_from_tweets(df, tweet_text_column='tweet', no_website_column='Tweet_Without_Websites'):
  """Add a column containing each tweet's text with the web addresses removed.

  Args:
    df: DataFrame holding the tweets.
    tweet_text_column: Name of the column to read the tweet text from.
    no_website_column: Name of the column that receives the cleaned text.

  Returns:
    The same DataFrame that was passed in, with ``no_website_column`` added
    (or overwritten).
  """
  address_free_text = df[tweet_text_column].apply(remove_web_address)
  df[no_website_column] = address_free_text
  return df

In [None]:
# Add the 'Tweet_Without_Websites' column (the default output column of remove_websites_from_tweets),
# then compute the classification metrics with that column as the tweet text.
tweet_df = remove_websites_from_tweets(tweet_df)
metrics_without_websites = get_classification_metrics(tweet_df, tweet_text_column='Tweet_Without_Websites', verbose=True)

# Bare expression so the returned dictionary is rendered as the cell output.
metrics_without_websites

{'Accuracy': [0.796101620674551,
  'True Positives: 885',
  'True Negatives: 24560',
  'False Positives: 5160',
  'False Negatives: 1357'],
 'Precision': [0.14640198511166252,
  'True Positives: 885',
  'False Positives: 5160'],
 'Sensitivity': [0.39473684210526316,
  'True Positives: 885',
  'False Negatives: 1357'],
 'Specificity': [0.8263795423956931,
  'True Negatives: 24560',
  'False Positives: 5160']}

In [None]:
# Score the website-free text and preview the first rows.
# NOTE(review): tweet_df is passed directly (no copy). If add_sentiment_score_columns writes its
# columns onto the frame it receives, without_websites_df and tweet_df are the same object — confirm.
without_websites_df = add_sentiment_score_columns(tweet_df, text_to_score_column='Tweet_Without_Websites')
without_websites_df.head()

Unnamed: 0_level_0,label,tweet,Positive_Sentiment_Score,Negative_Sentiment_Score,Neutral_Sentiment_Score,Compound_Sentiment_Score,Predicted_Class,Tweet_Without_Handles,Tweet_Without_Websites
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,@user when a father is dysfunctional and is s...,0.0,0.385,0.615,-0.8296,1,when a father is dysfunctional and is so self...,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...,0.256,0.0,0.744,0.6705,0,thanks for #lyft credit i can't use cause the...,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty,0.0,0.0,1.0,0.0,0,bihday your majesty,bihday your majesty
4,0,#model i love u take with u all the time in ...,0.337,0.0,0.663,0.7249,0,#model i love u take with u all the time in ur...,#model i love u take with u all the time in ur...
5,0,factsguide: society now #motivation,0.0,0.0,1.0,0.0,0,factsguide: society now #motivation,factsguide: society now #motivation


## Explore how the vader sentiment score changes when sentence level analysis is used.

In [None]:
# Compute the classification metrics again with sentence_level_analysis=True; no tweet_text_column
# is passed, so get_classification_metrics uses its default text column.
sentence_level_analysis_metrics = get_classification_metrics(tweet_df, verbose=True, sentence_level_analysis=True)

# Bare expression so the returned dictionary is rendered as the cell output.
sentence_level_analysis_metrics

{'Accuracy': [0.795976472060572,
  'True Positives: 887',
  'True Negatives: 24554',
  'False Positives: 5166',
  'False Negatives: 1355'],
 'Precision': [0.14653890632744093,
  'True Positives: 887',
  'False Positives: 5166'],
 'Sensitivity': [0.39562890276538804,
  'True Positives: 887',
  'False Negatives: 1355'],
 'Specificity': [0.8261776581426649,
  'True Negatives: 24554',
  'False Positives: 5166']}

## Explore Spell Checking

Note: After exploring a few different spell checkers, I ultimately decided not to continue testing spell check capability as a data preprocessing step prior to sentiment analysis with vader. 

This decision was made because, for each spell checker I tried, one of two things was true: 

1) The spell checker was capable of providing good results, but lacked the customization needed to implement it effectively in a pipeline for this project. An example of this is the neuspell BertChecker shown below. BertChecker was able to correct some spelling mistakes that most others could not; however, when BertChecker is applied to a sentence, there is no way to limit what corrections are made. So while it was impressive in some areas, it made unacceptable mistakes in others, such as manipulating hashtags. 

2) The spell checker did provide the needed flexibility to filter which corrections are made, however the method lacked accuracy and processing speed. An example of this is the Textblob spell checker. 



In [None]:
# Instantiate the neuspell BertChecker; the pretrained checkpoint is loaded separately via from_pretrained.
checker = BertChecker()

In [None]:
# Load the pretrained checkpoint from the cloned neuspell repository.
# The trailing ';' keeps the call's return value from being echoed as cell output.
checker.from_pretrained("/content/neuspell/data/checkpoints/subwordbert-probwordnoise");

/content/neuspell/data/checkpoints/subwordbert-probwordnoise created
Pretrained model downloading start (may take few seconds to couple of minutes based on download speed) ...
Pretrained model download success
loading vocab from path:/content/neuspell/data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…


Number of parameters in the model: 185211810
loading pretrained weights from path:/content/neuspell/data/checkpoints/subwordbert-probwordnoise
Loading model params from checkpoint dir: /content/neuspell/data/checkpoints/subwordbert-probwordnoise


In [None]:
# Sanity check: correct a single misspelled sentence; the corrected string is the cell output.
checker.correct("I luk foward to receving your reply")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




'I look forward to receiving your reply'

In [None]:
# Sanity check: correct a list of sentences in one call; the list of corrected strings is the cell output.
checker.correct_strings(["I luk foward to receving your reply", "my frends has a bihday party tomrrow"])

['I look forward to receiving your reply',
 'my friend has a birthday party tomorrow']

In [None]:
# Collect the 'tweet' text of the rows whose index label lies between 1 and 5 (inclusive) into a plain list.
tweet_list = list(tweet_df.loc[(tweet_df.index >= 1) & (tweet_df.index <= 5), 'tweet'].to_numpy())

# Run the BertChecker over the selected tweets in a single call.
tweet_list_corrected = checker.correct_strings(tweet_list)

In [None]:
# Print every original tweet directly above its spell-checked counterpart.
corrected_vs_not = zip(tweet_list, tweet_list_corrected)
for original_tweet, corrected_tweet in corrected_vs_not:
  print("Original Tweet: ", original_tweet)
  print("Corrected Tweet: ", corrected_tweet)
  print("\n")

Original Tweet:   @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Corrected Tweet:  @ user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction . # run


Original Tweet:  @user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
Corrected Tweet:  @ user @ user thanks for # lift credit i can ' t use cause they don ' t offer wheelchair vans in pdx . # disappointed # getthanked


Original Tweet:    bihday your majesty
Corrected Tweet:  birthday your majesty


Original Tweet:  #model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  
Corrected Tweet:  # model i love u take with us all the time in urð± ! ! ! *


Original Tweet:   factsguide: society now    #motivation
Corrected Tweet:  factsguide : society now # motivation




In [None]:
from textblob import TextBlob
from textblob import Word

In [None]:
def correct_tweet_spelling(tweet, spelling_confidence_threshold=0.75):
  """Spell-check a tweet word by word with TextBlob, keeping only confident corrections.

  Words that start with ``#`` or ``@``, or whose first character has a code point
  above 127, are copied through unchanged. Every other word is replaced by the
  first suggestion returned by ``Word.spellcheck()`` only when that suggestion
  differs from the word and its confidence is at least the threshold.

  Args:
    tweet: The tweet text to process.
    spelling_confidence_threshold: Minimum confidence a suggestion needs in order
      to replace the original word.

  Returns:
    The processed words joined by single spaces. An empty string is returned when
    the tweet is empty or contains only whitespace.
  """

  processed_words = []

  # split() without an argument splits on any run of whitespace and never yields
  # an empty string. Splitting an empty tweet on " " produced [""], whose word[0]
  # lookup raised IndexError.
  for word in tweet.split():
    if word.startswith('#') or ord(word[0]) > 127 or word.startswith("@"):
      processed_words.append(word)
      continue

    # Only the first (suggestion, confidence) entry is considered.
    top_suggestion_info = Word(word).spellcheck()[0]
    suggestion = top_suggestion_info[0]
    confidence = top_suggestion_info[1]

    if confidence >= spelling_confidence_threshold and suggestion != word:
      processed_words.append(suggestion)
    else:
      processed_words.append(word)

  # Collected in a list and joined once instead of growing a string in the loop.
  processed_tweet = " ".join(processed_words).strip()
  processed_tweet = re.sub(" +", " ", processed_tweet)

  return processed_tweet


In [None]:
def perform_spelling_corrections(df, tweet_text_column='tweet', spelling_checked_column='Tweet_SpellChecked', spelling_confidence_threshold=0.75):
  """Add a column containing the spell-checked version of every tweet.

  Args:
    df: DataFrame holding the tweets.
    tweet_text_column: Name of the column to read the tweet text from.
    spelling_checked_column: Name of the column that receives the corrected text.
    spelling_confidence_threshold: Threshold forwarded to correct_tweet_spelling.

  Returns:
    The same DataFrame that was passed in, with ``spelling_checked_column`` added
    (or overwritten).
  """
  original_text = df[tweet_text_column]
  df[spelling_checked_column] = original_text.apply(
      correct_tweet_spelling,
      spelling_confidence_threshold=spelling_confidence_threshold)
  return df

In [None]:
# Add the spell-checked text, then evaluate the classifier on that text.
# The metrics have to be computed on 'Tweet_SpellChecked' (the column perform_spelling_corrections
# writes by default); scoring the untouched 'tweet' column only reproduces the no-cleaning baseline.
tweet_df = perform_spelling_corrections(tweet_df)
spelling_correction = get_classification_metrics(tweet_df, tweet_text_column='Tweet_SpellChecked', verbose=True)

# Bare expression so the returned dictionary is rendered as the cell output.
spelling_correction

{'Accuracy': [0.796101620674551,
  'True Positives: 885',
  'True Negatives: 24560',
  'False Positives: 5160',
  'False Negatives: 1357'],
 'Precision': [0.14640198511166252,
  'True Positives: 885',
  'False Positives: 5160'],
 'Sensitivity': [0.39473684210526316,
  'True Positives: 885',
  'False Negatives: 1357'],
 'Specificity': [0.8263795423956931,
  'True Negatives: 24560',
  'False Positives: 5160']}

In [None]:
# Persist the frame that now contains the spell-checked column so the slow TextBlob pass need not be repeated.
# NOTE(review): pd.DataFrame(tweet_df) presumably does not make an independent copy of tweet_df —
# use tweet_df.copy() if a separate frame is intended.
spelling_correction_df = pd.DataFrame(tweet_df)

# index=False is not passed here, so the DataFrame index is written to the CSV as well.
spelling_correction_df.to_csv(path_or_buf="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/text_blob_spelling_df.csv")

# Function used to analyze the sentiment analyzer. 

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------
# This function is used to perform multiple rounds of classification analysis using VADER. Results are returned as a dictionary, which can easily be
# converted to a pandas DataFrame. This makes it very easy to compare how various threshold values and data cleaning decisions impact the accuracy of 
# classifications.
#---------------------------------------------------------------------------------------------------------------------------------------------------------

def analyze_vader(df, threshold_min=-0.5, threshold_max=0.5, num_thresholds=100, data_cleaning_dict=None, text_column='tweet', sentiment_analyzer=sentiment_analyzer,
                  include_confusion_parameters=True): 
  """Run get_classification_metrics over a grid of cleaning options and decision thresholds.

  Args:
    df: DataFrame holding the tweets. The cleaning helpers add their
      'Tweet_Without_Handles' / 'Tweet_Without_Websites' columns to it.
    threshold_min: Smallest decision threshold to evaluate.
    threshold_max: Largest decision threshold to evaluate.
    num_thresholds: Number of evenly spaced thresholds between the two bounds.
    data_cleaning_dict: Optional dict mapping 'Remove_Twitter_Handles',
      'Remove_Websites' and 'Perform_Sentence_Tokenization' to the list of values
      to try for that option. If it is not a dict, or contains an unknown key,
      every option is tried with [True, False]. Options left out of an otherwise
      valid dict are also tried with [True, False].
    text_column: Column holding the text to score before any cleaning is applied.
    sentiment_analyzer: Analyzer forwarded to get_classification_metrics.
    include_confusion_parameters: Forwarded to get_classification_metrics as
      include_confusion_params.
      NOTE(review): the confusion counts are read from the returned metrics
      unconditionally, so this presumably has to stay True — confirm against
      get_classification_metrics.

  Returns:
    A dict of equal-length lists with one entry per evaluated combination: the
    metric values, the confusion counts, the three cleaning flags and the
    decision threshold that produced them.
  """

  threshold_values = list(np.linspace(start=threshold_min, stop=threshold_max, num=num_thresholds))

  default_cleaning_options = {'Remove_Twitter_Handles' : [True, False],
                              'Remove_Websites' : [True, False],
                              'Perform_Sentence_Tokenization' : [True, False]}

  if not isinstance(data_cleaning_dict, dict) or not all(key in default_cleaning_options for key in data_cleaning_dict):
    data_cleaning_dict = default_cleaning_options
  else:
    # A valid dict may still leave options out; fill them in so the loops below
    # cannot fail with a KeyError.
    data_cleaning_dict = {**default_cleaning_options, **data_cleaning_dict}

  copied_metric_names = ['Accuracy', 'Sensitivity', 'Specificity', 'Precision',
                         'True_Positives', 'True_Negatives', 'False_Positives', 'False_Negatives']

  master_metrics = {'Accuracy' : [],
                    'Sensitivity' : [],
                    'Specificity' : [],
                    'Precision' : [],
                    'True_Positives' : [],
                    'True_Negatives' : [],
                    'False_Positives' : [],
                    'False_Negatives' : [],
                    'Removed_Twitter_Handles' : [], 
                    'Removed_Websites' : [],
                    'Performed_Sentence_Tokenization' : [], 
                    'Decision_Threshold' : [] }

  for handle_parameter in data_cleaning_dict['Remove_Twitter_Handles']:
    for website_parameter in data_cleaning_dict['Remove_Websites']:

      # Every combination starts again from the caller's column. Reassigning
      # text_column itself made the cleaned column of one combination leak into
      # all later ones, so the "no cleaning" rows were scored on cleaned text.
      current_text_column = text_column

      # The cleaning depends only on these two flags, so it is done once per
      # combination rather than once per threshold.
      # NOTE(review): assumes get_classification_metrics does not rewrite the text columns — confirm.
      if handle_parameter == True:
        df = remove_twitter_handles(df, tweet_text_column=current_text_column, no_handles_column='Tweet_Without_Handles')
        current_text_column = 'Tweet_Without_Handles'

      if website_parameter == True:
        df = remove_websites_from_tweets(df, tweet_text_column=current_text_column, no_website_column='Tweet_Without_Websites')
        current_text_column = 'Tweet_Without_Websites'

      for sentence_token_param in data_cleaning_dict['Perform_Sentence_Tokenization']:
        for threshold_param in threshold_values:

          metrics = get_classification_metrics(df, tweet_text_column=current_text_column, sentiment_analyzer=sentiment_analyzer, threshold=threshold_param,
                                               sentence_level_analysis=sentence_token_param, include_confusion_params=include_confusion_parameters)

          for metric_name in copied_metric_names:
            master_metrics[metric_name].append(metrics[metric_name])

          master_metrics['Removed_Twitter_Handles'].append(handle_parameter)
          master_metrics['Removed_Websites'].append(website_parameter)
          master_metrics['Performed_Sentence_Tokenization'].append(sentence_token_param)
          master_metrics['Decision_Threshold'].append(threshold_param)

  return master_metrics

In [None]:
# This cell calls the function above to create a detailed analysis file that shows how changing the decision boundary and the use of data cleaning prior to analysis 
# will impact the effectiveness of using vaderSentiment as a hate speech classifier. This function takes a long time to run, therefore the code has been disabled by
# wrapping it in a string literal, and the result of running this cell is simply imported from a csv file in the following cell.
# NOTE(review): the disabled code reads `original_tweet_df`, which is presumably defined earlier in the notebook — confirm before re-enabling.
'''
original_df = original_tweet_df.copy(deep=True)

master_vader_metrics = analyze_vader(original_df, threshold_min=-0.99, threshold_max=0.99, num_thresholds=400)

master_vader_df = pd.DataFrame(master_vader_metrics)

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vaderSentiment_Analysis.csv"

master_vader_df.to_csv(path_or_buf=filename)

master_vader_df.head()
'''

In [None]:
# Reading in the analysis file created by the (disabled) cell above.
filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vaderSentiment_Analysis.csv"

# index_col=0 turns the first CSV column back into the index; the cell above writes the file
# without index=False, so that first column is the saved DataFrame index.
vader_analysis_df = pd.read_csv(filename, index_col=0)

vader_analysis_df.head()

Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,True_Positives,True_Negatives,False_Positives,False_Negatives,Removed_Twitter_Handles,Removed_Websites,Performed_Sentence_Tokenization,Decision_Threshold
0,0.929854,0.0,1.0,,0,29720,0,2242,True,True,True,-0.99
1,0.929854,0.0,1.0,,0,29720,0,2242,True,True,True,-0.985038
2,0.929854,0.0,1.0,,0,29720,0,2242,True,True,True,-0.980075
3,0.929854,0.0,1.0,,0,29720,0,2242,True,True,True,-0.975113
4,0.929885,0.000446,1.0,1.0,1,29720,0,2241,True,True,True,-0.97015


# Create output files for later analysis

Create an output csv file that contains the tweet_df dataframe with the vader scores for each combination of the data cleaning decisions.

In [None]:
# Output tweet_df dataframe with vader scores calculated using no data cleaning.

# Disabled (wrapped in a string literal) because the csv file has already been saved.
# NOTE(review): unlike the following cells, cleaned_tweet_df is passed without a deep copy — confirm
# whether add_sentiment_score_columns modifies the frame it receives before re-enabling.
'''
tweet_no_data_cleaning_df = add_sentiment_score_columns(cleaned_tweet_df)

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_no_data_cleaning.csv"

tweet_no_data_cleaning_df.to_csv(path_or_buf=filename, index=False)
'''

In [None]:
# Output tweet_df dataframe with vader scores calculated using only twitter handle removal data cleaning.

# Disabled (wrapped in a string literal) because the csv file has already been saved.
# The disabled code works on a deep copy because remove_twitter_handles adds its column to the frame
# it is given; the copy keeps cleaned_tweet_df unchanged.
'''
temp_df = cleaned_tweet_df.copy(deep=True)

tweet_no_handles_df = remove_twitter_handles(temp_df)

tweet_no_handles_df = add_sentiment_score_columns(df=tweet_no_handles_df, text_to_score_column='Tweet_Without_Handles')

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_handles_removed.csv"

tweet_no_handles_df.to_csv(path_or_buf=filename, index=False)

tweet_no_handles_df.head()
'''

In [None]:
# Output tweet_df dataframe with vader scores calculated using only website removal data cleaning.

# Disabled (wrapped in a string literal) because the csv file has already been saved.
# The disabled code works on a deep copy because remove_websites_from_tweets adds its column to the
# frame it is given; the copy keeps cleaned_tweet_df unchanged.
'''
temp_df = cleaned_tweet_df.copy(deep=True)

tweet_no_website_df = remove_websites_from_tweets(temp_df)

tweet_no_website_df = add_sentiment_score_columns(df=tweet_no_website_df, text_to_score_column='Tweet_Without_Websites')

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_websites_removed.csv"

tweet_no_website_df.to_csv(path_or_buf=filename, index=False)

tweet_no_website_df.head()
'''

In [None]:
# Output tweet_df dataframe with vader scores calculated using only sentence level tokenization data cleaning.

# Disabled (wrapped in a string literal) because the csv file has already been saved.
# The disabled code scores a deep copy of cleaned_tweet_df and passes keep_subscores=True to
# add_sentence_level_compound_score_column (defined earlier in the notebook).
'''
temp_df = cleaned_tweet_df.copy(deep=True)

tweet_sentence_level_df = add_sentence_level_compound_score_column(df=temp_df, keep_subscores=True)

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_sentence_level.csv"

tweet_sentence_level_df.to_csv(path_or_buf=filename, index=False)

tweet_sentence_level_df.head()
'''

In [None]:
# Disabled (wrapped in a string literal) because these csv files have already been saved.
# NOTE(review): the disabled code passes cleaned_tweet_df to remove_twitter_handles without a deep copy.
# That helper adds its column to the frame it is given and returns the same frame, so re-enabling this
# cell also modifies cleaned_tweet_df (and presumably the later in-place drop does too) — copy first if
# cleaned_tweet_df must stay intact.
'''
# Output tweet_df dataframe with vader scores calculated using all data cleaning techniques (twitter handle removal, website removal, sentence level tokenization).

tweet_all_preprocessing_df = remove_twitter_handles(cleaned_tweet_df, tweet_text_column='tweet', no_handles_column='Tweet_Without_Handles')

tweet_all_preprocessing_df = remove_websites_from_tweets(tweet_all_preprocessing_df, tweet_text_column='Tweet_Without_Handles',
                                                         no_website_column='No_Websites_Or_Handles')

tweet_all_preprocessing_df = add_sentence_level_compound_score_column(df=tweet_all_preprocessing_df, tweet_text_column='No_Websites_Or_Handles', keep_subscores=True)

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing.csv"

tweet_all_preprocessing_df.to_csv(path_or_buf=filename, index=False)


# Create a simplified version that has only what I need for modeling.
tweet_all_preprocessing_df.drop(columns=['Tweet_Without_Handles', 'No_Websites_Or_Handles', 'Positive_Sentiment_Score',
                                         'Negative_Sentiment_Score', 'Neutral_Sentiment_Score', 'Compound_Sentiment_Score'], inplace=True)

# Shifting the compound score from (-1 to 1) to (0 to 2) because Naive bayes cannot take negative inputs.
tweet_all_preprocessing_df['Sentence_Level_compound_Score'] = tweet_all_preprocessing_df['Sentence_Level_compound_Score'] + 1

filename="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model.csv"

tweet_all_preprocessing_df.to_csv(path_or_buf=filename, index=False)

tweet_all_preprocessing_df.head()
'''