# Data Cleaning 

## Resources used for pre-processing 

- https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

* Remove URLs from the tweets
* Tokenize text
* Remove emails
* Remove new lines characters
* Remove distracting single quotes
* Remove all punctuation signs
* Lowercase all text
* Detokenize text
* Convert list of texts to Numpy array

In [1]:
# pip -- install nltk 

# Importing relevant libraries 
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim
from gensim.utils import simple_preprocess
print('Done')

Done


In [3]:
# Load data 

tweet_text = pd.read_csv('/Users/eleanordavies/Desktop/Sample_twitterdata_05Feb2021.csv')
tweet_text

Unnamed: 0,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name
0,1356026121549131779,It’s National Storytelling Week -&amp; we all ...,0,,2021-01-31T23:46:35.000Z,2021-01-31,00486f39ae8bd30d,"Ilford, London",Ilford
1,1356015933110673412,@blue_notorious That's a really clever way to ...,0,,2021-01-31T23:06:06.000Z,2021-01-31,519618c33762168f,"South Shore, England",South Shore
2,1355998497649078275,Personal view. Banning leafletting is a nonsen...,0,,2021-01-31T21:56:49.000Z,2021-01-31,4b6c0ea1297b258a,"Leyland, England",Leyland
3,1355985193128112134,Trophy Hunters: this is one.of the very rare t...,0,,2021-01-31T21:03:57.000Z,2021-01-31,22611f9e4155fb8c,"Crewkerne, England",Crewkerne
4,1355979822623825920,@LycanEclipse Its not stupid. But I know what ...,0,,2021-01-31T20:42:37.000Z,2021-01-31,42d0cf7d49d27c95,"Hillingdon, London",Hillingdon
...,...,...,...,...,...,...,...,...,...
107002,1349332515488722945,@angiebUK @LTHlondon @JuliaHB1 Why? He's perfe...,0,,2021-01-13T12:28:35.000Z,2021-01-13,2d41dd150edf488a,"Weston-super-Mare, England",Weston-super-Mare
107003,1349321013478232065,"Shut, and I cannot stress this next part enoug...",0,,2021-01-13T11:42:53.000Z,2021-01-13,573ede7f6c450804,"Brighton, England",Brighton
107004,1349284431111057409,Eavesdropping on school assembly my daughter’s...,1,MentalHealthAwareness,2021-01-13T09:17:31.000Z,2021-01-13,493d127fe5461353,"Upper Quinton, England",Upper Quinton
107005,1349282649383161856,Likely stress point is Russia - Western relati...,0,,2021-01-13T09:10:26.000Z,2021-01-13,4393349f368f67a1,"Lambeth, London",Lambeth


In [4]:
# Creating a function to remove characters 

def depure_data(data):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

    # Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub("\'", "", data)
        
    return data

In [5]:
temp = []
#Splitting twtter data text column to list
data_to_list = tweet_text['text'].values.tolist()

#applying the above function 
for i in range(len(data_to_list)):
    temp.append(depure_data(data_to_list[i]))

# tempdf = pd.DataFrame(temp)
# tempdf.head()

In [6]:
# Tokenize and removal all punctation, emojis and puts text into lowercase 

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
data_words = list(sent_to_words(temp))

print(data_words[:10])

[['it', 'national', 'storytelling', 'week', 'amp', 'we', 'all', 'have', 'story', 'to', 'tell', 'story', 'to', 'share', 'story', 'to', 'hear', 'let', 'use', 'our', 'stories', 'to', 'connect', 'with', 'each', 'other', 'to', 'enhance', 'listening', 'language', 'amp', 'communication', 'to', 'release', 'stress', 'to', 'empathise', 'amp', 'understand', 'each', 'other', 'it', 'all', 'about', 'our', 'story'], ['thats', 'really', 'clever', 'way', 'to', 'get', 'look', 'at', 'the', 'characters', 'hopefully', 'its', 'helped', 'with', 'the', 'anxiety'], ['personal', 'view', 'banning', 'leafletting', 'is', 'nonsensical', 'over', 'reaction', 'could', 'be', 'part', 'of', 'my', 'daily', 'exercise', 'and', 'wouldnt', 'be', 'harming', 'anyone'], ['trophy', 'hunters', 'this', 'is', 'one', 'of', 'the', 'very', 'rare', 'times', 'wish', 'we', 'had', 'capital', 'punishment', 'in', 'gb', 'people', 'who', 'exercise', 'this', 'fun', 'sport', 'do', 'not', 'deserve', 'to', 'exist', 'on', 'this', 'planet'], ['its',

In [7]:
# detokenise returns them back into a sentence 

def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [8]:
data_detoken = []
for i in range(len(data_words)):
    data_detoken.append(detokenize(data_words[i]))
data_detoken = np.array(data_detoken)
data_detoken_df = pd.DataFrame(data_detoken)
data_detoken_df

Unnamed: 0,0
0,it national storytelling week amp we all have ...
1,thats really clever way to get look at the cha...
2,personal view banning leafletting is nonsensic...
3,trophy hunters this is one of the very rare ti...
4,its not stupid but know what you mean thoughs ...
...,...
107002,why hes perfectly correct gov advice is only e...
107003,shut and cannot stress this next part enough t...
107004,eavesdropping on school assembly my daughter s...
107005,likely stress point is russia western relation...


In [None]:
# data_detoken_df.to_csv('/Users/eleanordavies/Desktop/df.csv', index = False)

In [9]:
# add cleaned tweet text as column 'selected_tweets' to original dataframe 
frames = [data_detoken_df,tweet_text]
demo_tweets_clean = pd.concat(frames, axis=1)

demo_tweets_clean = demo_tweets_clean.rename(columns={0: "selected_tweets"})

In [10]:
# Exporting the cleaned tweets to local directory 

# demo_tweets_clean.to_csv('/Users/eleanordavies/Desktop/demo_tweets_clean.csv', index = False)
print("Tweets Clean!")

Tweets Clean!


In [11]:
demo_tweets_clean.head()

Unnamed: 0,selected_tweets,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name
0,it national storytelling week amp we all have ...,1356026121549131779,It’s National Storytelling Week -&amp; we all ...,0,,2021-01-31T23:46:35.000Z,2021-01-31,00486f39ae8bd30d,"Ilford, London",Ilford
1,thats really clever way to get look at the cha...,1356015933110673412,@blue_notorious That's a really clever way to ...,0,,2021-01-31T23:06:06.000Z,2021-01-31,519618c33762168f,"South Shore, England",South Shore
2,personal view banning leafletting is nonsensic...,1355998497649078275,Personal view. Banning leafletting is a nonsen...,0,,2021-01-31T21:56:49.000Z,2021-01-31,4b6c0ea1297b258a,"Leyland, England",Leyland
3,trophy hunters this is one of the very rare ti...,1355985193128112134,Trophy Hunters: this is one.of the very rare t...,0,,2021-01-31T21:03:57.000Z,2021-01-31,22611f9e4155fb8c,"Crewkerne, England",Crewkerne
4,its not stupid but know what you mean thoughs ...,1355979822623825920,@LycanEclipse Its not stupid. But I know what ...,0,,2021-01-31T20:42:37.000Z,2021-01-31,42d0cf7d49d27c95,"Hillingdon, London",Hillingdon


# Empath 

## Empath Resources 

- https://github.com/Ejhfast/empath-client
- https://www.tandfonline.com/doi/abs/10.1080/09638237.2020.1739251

In [12]:
import empath
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from empath import Empath

In [13]:
# creating the lexicon object 
# list of categories within empath 
lexicon = Empath()
list_cat = ['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebration', 'restaurant', 'violence', 'programming', 'dominant_heirarchical', 'military', 'neglect', 'swimming', 'exotic', 'love', 'hiking', 'communication', 'hearing', 'order', 'sympathy', 'hygiene', 'weather', 'anonymity', 'trust', 'ancient', 'deception', 'fabric', 'air_travel', 'fight', 'dominant_personality', 'music', 'vehicle', 'politeness', 'toy', 'farming', 'meeting', 'war', 'speaking', 'listen', 'urban', 'shopping', 'disgust', 'fire', 'tool', 'phone', 'gain', 'sound', 'injury', 'sailing', 'rage', 'science', 'work', 'appearance', 'valuable', 'warmth', 'youth', 'sadness', 'fun', 'emotional', 'joy', 'affection', 'traveling', 'fashion', 'ugliness', 'lust', 'shame', 'torment', 'economics', 'anger', 'politics', 'ship', 'clothing', 'car', 'strength', 'technology', 'breaking', 'shape_and_size', 'power', 'white_collar_job', 'animal', 'party', 'terrorism', 'smell', 'disappointment', 'poor', 'plant', 'pain', 'beauty', 'timidity', 'philosophy', 'negotiate', 'negative_emotion', 'cleaning', 'messaging', 'competing', 'law', 'friends', 'payment', 'achievement', 'alcohol', 'liquid', 'feminine', 'weapon', 'children', 'monster', 'ocean', 'giving', 'contentment', 'writing', 'rural', 'positive_emotion', 'musical', 'colors', 'id', 'injury and death', 'demo', 'what']

pos_cat = ["worship", "masculine", "death", "weakness", "divine", "religion", "sleep", "swearing_terms", "injury", "envy"]
neg_cat = ["gain", "reading", "banking", "independence", "programming", "payment", "technology", "tourism", "air_travel", "negotiate"]
tot_cat = pos_cat + neg_cat 

In [14]:
# defining function to analyse sentences 
# can be altered to add category and normalise values 
# may need to specifiy categories 

def empath_analyse(sentence):
    x = lexicon.analyze(sentence, categories=tot_cat)
    return x

## Running tweets through empath 
- Create an array of the 'cleaned' text from the tweets 
- Run a for loop to analyse tweets usinh emapth_analyse function 
- Convert data dictionary into dataframe  
- Append data dictionary dataframe to the exsiting tweet dataframe 

In [15]:
data_to_list = demo_tweets_clean['selected_tweets'].values.tolist()
data_to_list = np.array(data_to_list)

In [16]:
data =[]
for i in range(len(data_to_list)):
    data.append(empath_analyse(data_to_list[i])),
data_dict = pd.DataFrame.from_dict(data)
data_dict.head()

Unnamed: 0,worship,masculine,death,weakness,divine,religion,sleep,swearing_terms,injury,envy,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
result = pd.concat([demo_tweets_clean, data_dict], axis=1, join="inner")
print("Emapth Analysis Complete!")

Emapth Analysis Complete!


In [24]:
# result["id"] = result.index
# saving non aggregated dataset of empath analysed tweets 
result.to_csv('/Users/eleanordavies/Desktop/results.csv', index = False) 
result.head()

Unnamed: 0,selected_tweets,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name,...,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,it national storytelling week amp we all have ...,1356026121549131779,It’s National Storytelling Week -&amp; we all ...,0,,2021-01-31T23:46:35.000Z,2021-01-31,00486f39ae8bd30d,"Ilford, London",Ilford,...,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,thats really clever way to get look at the cha...,1356015933110673412,@blue_notorious That's a really clever way to ...,0,,2021-01-31T23:06:06.000Z,2021-01-31,519618c33762168f,"South Shore, England",South Shore,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,personal view banning leafletting is nonsensic...,1355998497649078275,Personal view. Banning leafletting is a nonsen...,0,,2021-01-31T21:56:49.000Z,2021-01-31,4b6c0ea1297b258a,"Leyland, England",Leyland,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,trophy hunters this is one of the very rare ti...,1355985193128112134,Trophy Hunters: this is one.of the very rare t...,0,,2021-01-31T21:03:57.000Z,2021-01-31,22611f9e4155fb8c,"Crewkerne, England",Crewkerne,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,its not stupid but know what you mean thoughs ...,1355979822623825920,@LycanEclipse Its not stupid. But I know what ...,0,,2021-01-31T20:42:37.000Z,2021-01-31,42d0cf7d49d27c95,"Hillingdon, London",Hillingdon,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Counting total number of tweets per date 

pivot_table = result.pivot_table(
     index='tw_date',
     values='id',
     aggfunc= len).reset_index()

pivot_table.columns = ['tw_date','count_of_tweets']

pivot_table.head()

Unnamed: 0,tw_date,count_of_tweets
0,2019-01-01,18
1,2019-01-02,20
2,2019-01-03,13
3,2019-01-04,30
4,2019-01-05,17


In [20]:
# Aggregating data by date and get mean of lexical categories 

mean_pivot_table = result.pivot_table(
    index=[ "tw_date"], # "name"], 
    values=tot_cat,
    aggfunc=np.mean).reset_index()
mean_pivot_table

Unnamed: 0,tw_date,air_travel,banking,death,divine,envy,gain,independence,injury,masculine,...,payment,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship
0,2019-01-01,0.111111,0.000000,0.333333,0.000000,0.000000,0.055556,0.111111,0.111111,0.000000,...,0.000000,0.111111,0.055556,0.000000,0.055556,0.000000,0.111111,0.000000,0.000000,0.111111
1,2019-01-02,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,0.000000,0.300000,0.000000,...,0.000000,0.100000,0.150000,0.000000,0.100000,0.000000,0.300000,0.000000,0.000000,0.000000
2,2019-01-03,0.000000,0.076923,0.230769,0.000000,0.076923,0.076923,0.000000,0.384615,0.000000,...,0.153846,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.000000
3,2019-01-04,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.000000,0.066667,0.066667,...,0.000000,0.000000,0.100000,0.033333,0.166667,0.000000,0.000000,0.033333,0.000000,0.000000
4,2019-01-05,0.000000,0.000000,0.294118,0.117647,0.058824,0.000000,0.000000,0.470588,0.000000,...,0.000000,0.000000,0.058824,0.117647,0.411765,0.000000,0.000000,0.058824,0.294118,0.117647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,2021-01-27,0.044199,0.066298,0.237569,0.022099,0.022099,0.016575,0.038674,0.193370,0.022099,...,0.088398,0.044199,0.088398,0.060773,0.138122,0.022099,0.071823,0.033149,0.049724,0.033149
758,2021-01-28,0.028986,0.086957,0.202899,0.014493,0.007246,0.057971,0.021739,0.289855,0.036232,...,0.101449,0.014493,0.137681,0.028986,0.050725,0.043478,0.065217,0.007246,0.043478,0.014493
759,2021-01-29,0.030864,0.012346,0.154321,0.018519,0.006173,0.061728,0.012346,0.216049,0.012346,...,0.018519,0.061728,0.086420,0.012346,0.240741,0.012346,0.080247,0.018519,0.074074,0.006173
760,2021-01-30,0.048951,0.027972,0.125874,0.013986,0.006993,0.034965,0.013986,0.174825,0.020979,...,0.006993,0.034965,0.076923,0.013986,0.216783,0.020979,0.027972,0.034965,0.027972,0.020979


In [21]:
mean_pivot_table['count'] = mean_pivot_table.tw_date.map(
   pivot_table.set_index('tw_date').count_of_tweets)

In [22]:
mean_pivot_table

Unnamed: 0,tw_date,air_travel,banking,death,divine,envy,gain,independence,injury,masculine,...,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship,count
0,2019-01-01,0.111111,0.000000,0.333333,0.000000,0.000000,0.055556,0.111111,0.111111,0.000000,...,0.111111,0.055556,0.000000,0.055556,0.000000,0.111111,0.000000,0.000000,0.111111,18
1,2019-01-02,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,0.000000,0.300000,0.000000,...,0.100000,0.150000,0.000000,0.100000,0.000000,0.300000,0.000000,0.000000,0.000000,20
2,2019-01-03,0.000000,0.076923,0.230769,0.000000,0.076923,0.076923,0.000000,0.384615,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.000000,13
3,2019-01-04,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.000000,0.066667,0.066667,...,0.000000,0.100000,0.033333,0.166667,0.000000,0.000000,0.033333,0.000000,0.000000,30
4,2019-01-05,0.000000,0.000000,0.294118,0.117647,0.058824,0.000000,0.000000,0.470588,0.000000,...,0.000000,0.058824,0.117647,0.411765,0.000000,0.000000,0.058824,0.294118,0.117647,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,2021-01-27,0.044199,0.066298,0.237569,0.022099,0.022099,0.016575,0.038674,0.193370,0.022099,...,0.044199,0.088398,0.060773,0.138122,0.022099,0.071823,0.033149,0.049724,0.033149,181
758,2021-01-28,0.028986,0.086957,0.202899,0.014493,0.007246,0.057971,0.021739,0.289855,0.036232,...,0.014493,0.137681,0.028986,0.050725,0.043478,0.065217,0.007246,0.043478,0.014493,138
759,2021-01-29,0.030864,0.012346,0.154321,0.018519,0.006173,0.061728,0.012346,0.216049,0.012346,...,0.061728,0.086420,0.012346,0.240741,0.012346,0.080247,0.018519,0.074074,0.006173,162
760,2021-01-30,0.048951,0.027972,0.125874,0.013986,0.006993,0.034965,0.013986,0.174825,0.020979,...,0.034965,0.076923,0.013986,0.216783,0.020979,0.027972,0.034965,0.027972,0.020979,143


In [23]:
mean_pivot_table.to_csv('/Users/eleanordavies/Desktop/mean_pivot_table.csv', index = False) 