### Cleaning and preprocessing the data for NLP analysis
Dataset: Tweets_2020_11-March-7-June_workfromhome_all.csv => 29153 records

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re
import contractions
from nltk.tokenize import word_tokenize
import os
from sentistrength import PySentiStr

##### Import the raw dataset

In [2]:
raw_dataset = pd.read_csv('Tweets_2020_11-March-7-June_workfromhome_all.csv')
print(raw_dataset.shape)
raw_dataset.head()

(29147, 18)


Unnamed: 0,Category,Keyword,Web_Page_URL,Tweet_Website,Author_Name,Author_Web_Page_URL,Tweet_Timestamp,Tweet_Time,Tweet_Content,Tweet_Image_URL,Tweet_Number_of_Likes,Tweet_Number_of_Retweets,Tweet_Number_of_Reviews,Retweet_or_not,Retweet_Original_Tweet_Content,Retweet_Original_Tweet_Poster,Retweet_Original_Tweet_Time,Retweet_Original_Tweet_PosterID
0,Post,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/TheWritingFitz/status/1237...,,https://twitter.com/TheWritingFitz,1580000000000.0,11-03-20 00:14,@CWAUnion\n Is the union able to take measures...,,,,,No,,,,
1,Post,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/digitalkg/status/123753446...,,https://twitter.com/digitalkg,1580000000000.0,11-03-20 00:23,I’m looking forward to working from home a lot...,,,,,No,,,,
2,Post,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/LukeZhang5/status/12375345...,,https://twitter.com/LukeZhang5,1580000000000.0,11-03-20 00:23,China team work from home more than 1 month al...,,,,,No,,,,
3,Post,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/Adamhill1212/status/123753...,,https://twitter.com/Adamhill1212,1580000000000.0,11-03-20 00:39,We are about to find out how much work could h...,,363.0,28.0,14.0,No,,,,
4,Post,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/kennethdashen/status/12375...,@kennethdashen,https://twitter.com/kennethdashen,1580000000000.0,11-03-20 00:45,Folks who can't #WorkFromHome probably hate th...,,1.0,,,No,,,,


In [3]:
# Populate the Author_Name column

raw_dataset['Author_Name'] = raw_dataset['Author_Web_Page_URL'].apply(lambda x: '@' + re.findall(r'https://twitter.com/(\w+)', x)[0])

In [4]:
# Check how many unique values are in each column of the dataset
raw_dataset.nunique()

Category                               2
Keyword                               98
Web_Page_URL                         187
Tweet_Website                      29073
Author_Name                        24126
Author_Web_Page_URL                24126
Tweet_Timestamp                        2
Tweet_Time                         23004
Tweet_Content                      29147
Tweet_Image_URL                      248
Tweet_Number_of_Likes                439
Tweet_Number_of_Retweets             181
Tweet_Number_of_Reviews              105
Retweet_or_not                         1
Retweet_Original_Tweet_Content         0
Retweet_Original_Tweet_Poster          0
Retweet_Original_Tweet_Time            0
Retweet_Original_Tweet_PosterID        0
dtype: int64

Check how many NaN values are in each column of the dataset and their percentage

In [5]:
print("Missing values percentage: \n")

# Calculate the maximum length of the column names
coloane = raw_dataset.columns.to_list()
max_col_len = max([len(str(col)) for col in coloane])

# Define the formatting string with a dynamic width for the column name field
fmt_str = '{:<{}}  {:>5}  {:.4f}%'

# Iterate over the columns and print the missing values percentage for each one
for c in coloane:
    if c != 'tconst':
        num_missing = len(raw_dataset[raw_dataset[c].isna()])
        percent_missing = num_missing / len(raw_dataset) * 100
        print(fmt_str.format(c, max_col_len, num_missing, percent_missing))
    else:
        num_missing = len(raw_dataset[raw_dataset[c].isna()])
        percent_missing = num_missing / len(raw_dataset) * 100
        print(fmt_str.format(c, max_col_len + 8, num_missing, percent_missing))

Missing values percentage: 

Category                             0  0.0000%
Keyword                              0  0.0000%
Web_Page_URL                         0  0.0000%
Tweet_Website                        0  0.0000%
Author_Name                          0  0.0000%
Author_Web_Page_URL                  0  0.0000%
Tweet_Timestamp                      0  0.0000%
Tweet_Time                           0  0.0000%
Tweet_Content                        0  0.0000%
Tweet_Image_URL                  28882  99.0908%
Tweet_Number_of_Likes            10222  35.0705%
Tweet_Number_of_Retweets         20278  69.5715%
Tweet_Number_of_Reviews          18689  64.1198%
Retweet_or_not                       0  0.0000%
Retweet_Original_Tweet_Content   29147  100.0000%
Retweet_Original_Tweet_Poster    29147  100.0000%
Retweet_Original_Tweet_Time      29147  100.0000%
Retweet_Original_Tweet_PosterID  29147  100.0000%


In [6]:
# Drop the unnecessary columns from the dataset
raw_dataset.drop(['Category','Tweet_Timestamp','Tweet_Image_URL','Retweet_or_not','Retweet_Original_Tweet_Content', 'Retweet_Original_Tweet_Poster', 'Retweet_Original_Tweet_Time', 'Retweet_Original_Tweet_PosterID'], 
                     axis=1, inplace=True)

In [7]:
# Replace NaN values with zeros for Tweet Number of Likes, Retweets and Reviews
raw_dataset[["Tweet_Number_of_Likes","Tweet_Number_of_Retweets", "Tweet_Number_of_Reviews"]] = raw_dataset[["Tweet_Number_of_Likes","Tweet_Number_of_Retweets", "Tweet_Number_of_Reviews"]].fillna(0)
raw_dataset.head()

Unnamed: 0,Keyword,Web_Page_URL,Tweet_Website,Author_Name,Author_Web_Page_URL,Tweet_Time,Tweet_Content,Tweet_Number_of_Likes,Tweet_Number_of_Retweets,Tweet_Number_of_Reviews
0,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/TheWritingFitz/status/1237...,@TheWritingFitz,https://twitter.com/TheWritingFitz,11-03-20 00:14,@CWAUnion\n Is the union able to take measures...,0,0,0.0
1,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/digitalkg/status/123753446...,@digitalkg,https://twitter.com/digitalkg,11-03-20 00:23,I’m looking forward to working from home a lot...,0,0,0.0
2,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/LukeZhang5/status/12375345...,@LukeZhang5,https://twitter.com/LukeZhang5,11-03-20 00:23,China team work from home more than 1 month al...,0,0,0.0
3,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/Adamhill1212/status/123753...,@Adamhill1212,https://twitter.com/Adamhill1212,11-03-20 00:39,We are about to find out how much work could h...,363,28,14.0
4,(#workfromhome) lang:en until:2020-03-12 since...,https://twitter.com/search?q=(%23workfromhome)...,https://twitter.com/kennethdashen/status/12375...,@kennethdashen,https://twitter.com/kennethdashen,11-03-20 00:45,Folks who can't #WorkFromHome probably hate th...,1,0,0.0


In [8]:
# Check the datatypes of the columns
raw_dataset.dtypes

Keyword                      object
Web_Page_URL                 object
Tweet_Website                object
Author_Name                  object
Author_Web_Page_URL          object
Tweet_Time                   object
Tweet_Content                object
Tweet_Number_of_Likes        object
Tweet_Number_of_Retweets     object
Tweet_Number_of_Reviews     float64
dtype: object

In [9]:
# Convert the datatype of the columns that contain numeric values from object to Int64 type
raw_dataset['Tweet_Number_of_Likes'] = pd.to_numeric(raw_dataset['Tweet_Number_of_Likes'], errors='coerce').astype('Int64')
raw_dataset['Tweet_Number_of_Retweets'] = pd.to_numeric(raw_dataset['Tweet_Number_of_Retweets'], errors='coerce').astype('Int64')
raw_dataset['Tweet_Number_of_Reviews'] = pd.to_numeric(raw_dataset['Tweet_Number_of_Reviews'], errors='coerce').astype('Int64')

In [10]:
#Todo: Convert Tweet_Time from object type to data type

### Functions to preprocess the Tweets for sentiment analysis

In [10]:
def custom_stop_words(path_to_stopwords):
    """Function to read a .txt file containing (custom) stop words and return a set of these stop words.
    Args:
        path_to_stopwords (str): path to the.txt file containing stop words (e.g. /your/path/to/files/stop_words.txt)
    Returns:
        set: set of stop words
    """    
    stop_words = set()
    with open(path_to_stopwords, 'r') as f:
        for line in f:
            word = line.strip()  # remove whitespace and newline characters
            stop_words.add(word)
    return stop_words


def remove_emoji(text):
    """Function that takes a text string as input and uses a regular expression pattern to match all Unicode characters
    that are classified as emojis. The regular expression includes different ranges of Unicode characters 
    that represent different types of emojis, such as emoticons, symbols, and flags.
    Args:
        text (str): text string to remove emojis from
    Returns:
        str: text string with all emojis removed
    """    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)


def remove_stopwords(text, stop_words):
    """Function that removes stop words from a given text.
    Args:
        text (str): text string
        stop_words (set): set of stop words
    Returns:
        str: text string without stop words
    """    
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text


def clean_text(text, stop_words):
    """Function to clean the raw text, e.g. from a tweet. Performs the following steps:
    1. Lowercase all the words in the text
    2. Replace all new line characters with a white space
    3. Remove tags
    4. Remove URLs
    5. Convert contractions to their full forms
    6. Remove punctuations
    7. Remove emojis (emoticons, symbols, flags, etc.)
    8. Remove stopwords
    Args:
        text (str): text string to be cleaned before passing it to the sentiment analysis model
        stop_words (set): set of stop words to be removed from the text
    Returns:
        str: cleaned text string
    """        
    # 1. Lowercase all words in the text
    text = text.lower()

    # 2. Replace the new line character with empty string
    text = text.replace("\n", "")
    
    # 3. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text, flags=re.MULTILINE)

    # 4. Remove words starting with 'http' - hyperlinks
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)

    # 5. Remove contractions, such as you're => you are
    contractions.fix(text)

    # 6. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    # 7. Remove emojis
    text = remove_emoji(text)

    # 8. Remove stopwords in English
    text = remove_stopwords(text, stop_words)

    return text

#### Populating the set of stop words from the text file

In [11]:
# Getting the path to the root directory of the filesystem
root_dir = os.getcwd()

In [12]:
stopwords = custom_stop_words(os.path.join(root_dir,'stopwords.txt'))
stopwords

{'an',
 'and',
 'at',
 'been',
 'being',
 'had',
 'it',
 'its',
 'o',
 'of',
 'or',
 'so',
 "that'll",
 'the',
 'this',
 'to',
 'which',
 'y'}

### Setup for SentiStrength library

In [13]:
path_to_sentistrength = os.path.join(root_dir, 'SentiStrength')
# Replace with the path to the Java executable file of SentiStrength.
path_to_sentistrength_jar = os.path.join(path_to_sentistrength, 'SentiStrengthCom.jar')
# Replace with the path to the language folder, which is used along with the .jar file to compute sentiment scores.
path_to_sentistrength_language_folder = os.path.join(path_to_sentistrength, 'LanguageFolder')

In [14]:
senti = PySentiStr()
senti.setSentiStrengthPath(path_to_sentistrength_jar)
senti.setSentiStrengthLanguageFolderPath(path_to_sentistrength_language_folder)

In [15]:
# create a new dataframe with the desired columns for sentiment analysis, initializing empty columns for Cleaned_Tweet, Sentiment_Score, and Dual_Score
new_dataset = pd.DataFrame({
    'Tweet_Time': raw_dataset['Tweet_Time'],
    'Tweet_Content': raw_dataset['Tweet_Content'],
    'Cleaned_Tweet': [''] * len(raw_dataset), # initializing empty values for Cleaned_Tweet
    'Sentiment_Score': [0] * len(raw_dataset), # initializing empty values for Sentiment_Score
    'Binary_Score': [0] * len(raw_dataset), # initializing empty values for Binary_Score
    'Tweet_Number_of_Likes': raw_dataset['Tweet_Number_of_Likes'],
    'Tweet_Number_of_Retweets': raw_dataset['Tweet_Number_of_Retweets'],
    'Tweet_Number_of_Reviews': raw_dataset['Tweet_Number_of_Reviews']
})
new_dataset.head()


Unnamed: 0,Tweet_Time,Tweet_Content,Cleaned_Tweet,Sentiment_Score,Binary_Score,Tweet_Number_of_Likes,Tweet_Number_of_Retweets,Tweet_Number_of_Reviews
0,11-03-20 00:14,@CWAUnion\n Is the union able to take measures...,,0,0,0,0,0
1,11-03-20 00:23,I’m looking forward to working from home a lot...,,0,0,0,0,0
2,11-03-20 00:23,China team work from home more than 1 month al...,,0,0,0,0,0
3,11-03-20 00:39,We are about to find out how much work could h...,,0,0,363,28,14
4,11-03-20 00:45,Folks who can't #WorkFromHome probably hate th...,,0,0,1,0,0


In [16]:
new_dataset['Cleaned_Tweet'] = new_dataset['Tweet_Content'].apply(lambda x: clean_text(x, stopwords))
new_dataset.head()

Unnamed: 0,Tweet_Time,Tweet_Content,Cleaned_Tweet,Sentiment_Score,Binary_Score,Tweet_Number_of_Likes,Tweet_Number_of_Retweets,Tweet_Number_of_Reviews
0,11-03-20 00:14,@CWAUnion\n Is the union able to take measures...,is union able take measures in getting deal wi...,0,0,0,0,0
1,11-03-20 00:23,I’m looking forward to working from home a lot...,im looking forward working from home a lot bec...,0,0,0,0,0
2,11-03-20 00:23,China team work from home more than 1 month al...,china team work from home more than 1 month al...,0,0,0,0,0
3,11-03-20 00:39,We are about to find out how much work could h...,we are about find out how much work could have...,0,0,363,28,14
4,11-03-20 00:45,Folks who can't #WorkFromHome probably hate th...,folks who cant workfromhome probably hate i fe...,0,0,1,0,0


In [67]:
# Define a function to compute the sentiment score for a given tweet text
def compute_sentiment_score(tweet_text):
    result = senti.getSentiment(tweet_text)
    return int(result[0])

# apply the compute_sentiment_score function to each row of the 'Cleaned_Tweet' column and store the results in the 'Sentiment_Score' column
new_dataset['Sentiment_Score'] = new_dataset['Cleaned_Tweet'].apply(compute_sentiment_score)


KeyboardInterrupt: 

In [18]:
scor = new_dataset.loc[3,'Cleaned_Tweet']

In [19]:
scor

'we are about find out how much work could have actually done better home in sweats on couch petting your doga lot answer is going be a lot covid19 coronavirus covid2019 workfromhome'

In [26]:
raw_dataset.loc[81, 'Tweet_Content']

'How many of you are working from home due to #coronavirus ?\n\nOur HR sent us an email today asking to stay home. What a blessing to work for a company who cares so much about their people  #blessed \n\n#WorkFromHome'

In [27]:
cleaned = clean_text(raw_dataset.loc[81, 'Tweet_Content'], stopwords)
cleaned

'how many you are working home due coronavirus hr sent us email today asking stay home what blessing work company who cares much people blessed workfromhome'

In [34]:
scor = senti.getSentiment(cleaned, score='binary')
scor

[1]