# Data Cleaning 

## Resources used for pre-processing 

- https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

* Remove URLs from the tweets
* Tokenize text
* Remove emails
* Remove new lines characters
* Remove distracting single quotes
* Remove all punctuation signs
* Lowercase all text
* Detokenize text
* Convert list of texts to Numpy array

In [1]:
# pip -- install nltk 

# Importing relevant libraries 
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim
from gensim.utils import simple_preprocess
print('Done')

Done


In [4]:
# Load data 

tweet_text = pd.read_csv('/Users/eleanordavies/Desktop/Twiiter_data5.csv')
tweet_text.head()

Unnamed: 0,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name,coordinates,coord_type
0,1200923972742848512,Huge thank you to @Empowerment_ #peer4U projec...,2,"peer4U,codesign",2019-11-30T23:46:01.000Z,2019-11-30,13b1318251fcfc7c,"Bilsborrow, England",Bilsborrow,,
1,1200922864142241793,Woohoo! First new friend since moving to Liver...,0,,2019-11-30T23:41:37.000Z,2019-11-30,151b9e91272233d1,"Liverpool, England",Liverpool,,
2,1200918723449606146,I had anxiety about conceiving for a couple of...,0,,2019-11-30T23:25:10.000Z,2019-11-30,38eabeb176f29378,"East Kilbride, Scotland",East Kilbride,,
3,1200914479661359104,@JBattye I wouldn’t even know where to look! I...,0,,2019-11-30T23:08:18.000Z,2019-11-30,778909dfad43f3d6,"Huddersfield, England",Huddersfield,,
4,1200914017679814661,Bloody hell Asda @AsdaServiceTeam shopping due...,0,,2019-11-30T23:06:28.000Z,2019-11-30,6da127116b06cece,"Upchurch, England",Upchurch,,


In [5]:
# Creating a function to remove characters 

def depure_data(data):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

    # Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub("\'", "", data)
        
    return data

In [6]:
temp = []
#Splitting twtter data text column to list
data_to_list = tweet_text['text'].values.tolist()

#applying the above function 
for i in range(len(data_to_list)):
    temp.append(depure_data(data_to_list[i]))

# tempdf = pd.DataFrame(temp)
# tempdf.head()

Unnamed: 0,0
0,Huge thank you to #peer4U project for allowing...
1,Woohoo! First new friend since moving to Liver...
2,I had anxiety about conceiving for a couple of...
3,I wouldn’t even know where to look! IMO there ...
4,Bloody hell Asda shopping due between 8-9pm. I...


In [7]:
# Tokenize and removal all punctation, emojis and puts text into lowercase 

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
data_words = list(sent_to_words(temp))

print(data_words[:10])

[['huge', 'thank', 'you', 'to', 'peer', 'project', 'for', 'allowing', 'me', 'to', 'design', 'my', 'ma', 'research', 'exploring', 'the', 'impact', 'of', 'peer', 'mentor', 'model', 'on', 'reducing', 'loneliness', 'thank', 'you', 'and', 'the', 'team', 'of', 'mentors', 'and', 'mentees', 'codesign'], ['woohoo', 'first', 'new', 'friend', 'since', 'moving', 'to', 'liverpool', 'and', 'its', 'only', 'been', 'few', 'weeks', 'for', 'an', 'introvert', 'with', 'social', 'anxiety', 'im', 'really', 'getting', 'better', 'at', 'this'], ['had', 'anxiety', 'about', 'conceiving', 'for', 'couple', 'of', 'reasons', 'and', 'then', 'this', 'wee', 'miracle', 'enter', 'our', 'lives', 'for', 'me', 'this', 'is', 'the', 'best', 'xmas', 'gift', 'and', 'cant', 'even', 'describe', 'how', 'privileged', 'feel', 'to', 'be', 'her', 'mum', 'excuse', 'the', 'messy', 'paint', 'work', 'and', 'feel', 'free', 'to', 'admire', 'the', 'buffy', 'funko', 'pop'], ['wouldn', 'even', 'know', 'where', 'to', 'look', 'imo', 'there', 'is'

In [9]:
# detokenise returns them back into a sentence 

def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [10]:
data_detoken = []
for i in range(len(data_words)):
    data_detoken.append(detokenize(data_words[i]))
data_detoken = np.array(data_detoken)
data_detoken_df = pd.DataFrame(data_detoken)
data_detoken_df

Unnamed: 0,0
0,huge thank you to peer project for allowing me...
1,woohoo first new friend since moving to liverp...
2,had anxiety about conceiving for couple of rea...
3,wouldn even know where to look imo there is fa...
4,bloody hell asda shopping due between pm it no...
...,...
49025,well weird watching bits of virtual fashion sh...
49026,depression mentalhealth addiction recovery anx...
49027,tried to adapt worksheet to allow pupils to di...
49028,waltz away dreaming would be my favourite tiff


In [None]:
# data_detoken_df.to_csv('/Users/eleanordavies/Desktop/df.csv', index = False)

In [11]:
# add cleaned tweet text as column 'selected_tweets' to original dataframe 
frames = [data_detoken_df,tweet_text]
demo_tweets_clean = pd.concat(frames, axis=1)

demo_tweets_clean = demo_tweets_clean.rename(columns={0: "selected_tweets"})

In [12]:
# Exporting the cleaned tweets to local directory 

# demo_tweets_clean.to_csv('/Users/eleanordavies/Desktop/demo_tweets_clean.csv', index = False)
print("Tweets Clean!")

Tweets Clean!


In [13]:
demo_tweets_clean.head()

Unnamed: 0,selected_tweets,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name,coordinates,coord_type
0,huge thank you to peer project for allowing me...,1200923972742848512,Huge thank you to @Empowerment_ #peer4U projec...,2,"peer4U,codesign",2019-11-30T23:46:01.000Z,2019-11-30,13b1318251fcfc7c,"Bilsborrow, England",Bilsborrow,,
1,woohoo first new friend since moving to liverp...,1200922864142241793,Woohoo! First new friend since moving to Liver...,0,,2019-11-30T23:41:37.000Z,2019-11-30,151b9e91272233d1,"Liverpool, England",Liverpool,,
2,had anxiety about conceiving for couple of rea...,1200918723449606146,I had anxiety about conceiving for a couple of...,0,,2019-11-30T23:25:10.000Z,2019-11-30,38eabeb176f29378,"East Kilbride, Scotland",East Kilbride,,
3,wouldn even know where to look imo there is fa...,1200914479661359104,@JBattye I wouldn’t even know where to look! I...,0,,2019-11-30T23:08:18.000Z,2019-11-30,778909dfad43f3d6,"Huddersfield, England",Huddersfield,,
4,bloody hell asda shopping due between pm it no...,1200914017679814661,Bloody hell Asda @AsdaServiceTeam shopping due...,0,,2019-11-30T23:06:28.000Z,2019-11-30,6da127116b06cece,"Upchurch, England",Upchurch,,


# Empath 

## Empath Resources 

- https://github.com/Ejhfast/empath-client
- https://www.tandfonline.com/doi/abs/10.1080/09638237.2020.1739251

In [14]:
import empath
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from empath import Empath

In [15]:
# creating the lexicon object 
# list of categories within empath 
lexicon = Empath()
list_cat = ['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebration', 'restaurant', 'violence', 'programming', 'dominant_heirarchical', 'military', 'neglect', 'swimming', 'exotic', 'love', 'hiking', 'communication', 'hearing', 'order', 'sympathy', 'hygiene', 'weather', 'anonymity', 'trust', 'ancient', 'deception', 'fabric', 'air_travel', 'fight', 'dominant_personality', 'music', 'vehicle', 'politeness', 'toy', 'farming', 'meeting', 'war', 'speaking', 'listen', 'urban', 'shopping', 'disgust', 'fire', 'tool', 'phone', 'gain', 'sound', 'injury', 'sailing', 'rage', 'science', 'work', 'appearance', 'valuable', 'warmth', 'youth', 'sadness', 'fun', 'emotional', 'joy', 'affection', 'traveling', 'fashion', 'ugliness', 'lust', 'shame', 'torment', 'economics', 'anger', 'politics', 'ship', 'clothing', 'car', 'strength', 'technology', 'breaking', 'shape_and_size', 'power', 'white_collar_job', 'animal', 'party', 'terrorism', 'smell', 'disappointment', 'poor', 'plant', 'pain', 'beauty', 'timidity', 'philosophy', 'negotiate', 'negative_emotion', 'cleaning', 'messaging', 'competing', 'law', 'friends', 'payment', 'achievement', 'alcohol', 'liquid', 'feminine', 'weapon', 'children', 'monster', 'ocean', 'giving', 'contentment', 'writing', 'rural', 'positive_emotion', 'musical', 'colors', 'id', 'injury and death', 'demo', 'what']

pos_cat = ["worship", "masculine", "death", "weakness", "divine", "religion", "sleep", "swearing_terms", "injury", "envy"]
neg_cat = ["gain", "reading", "banking", "independence", "programming", "payment", "technology", "tourism", "air_travel", "negotiate"]
tot_cat = pos_cat + neg_cat 

In [16]:
# defining function to analyse sentences 
# can be altered to add category and normalise values 
# may need to specifiy categories 

def empath_analyse(sentence):
    x = lexicon.analyze(sentence, categories=tot_cat)
    return x

## Running tweets through empath 
- Create an array of the 'cleaned' text from the tweets 
- Run a for loop to analyse tweets usinh emapth_analyse function 
- Convert data dictionary into dataframe  
- Append data dictionary dataframe to the exsiting tweet dataframe 

In [22]:
data_to_list = demo_tweets_clean['selected_tweets'].values.tolist()
data_to_list = np.array(data_to_list)

In [23]:
data =[]
for i in range(len(data_to_list)):
    data.append(empath_analyse(data_to_list[i])),
data_dict = pd.DataFrame.from_dict(data)
data_dict.head()

Unnamed: 0,worship,masculine,death,weakness,divine,religion,sleep,swearing_terms,injury,envy,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
result = pd.concat([demo_tweets_clean, data_dict], axis=1, join="inner")
print("Emapth Analysis Complete!")

Emapth Analysis Complete!


In [25]:
# result["id"] = result.index
# saving non aggregated dataset of empath analysed tweets 
result.to_csv('/Users/eleanordavies/Desktop/mean_pivot_table.csv', index = False) 
result.head()

Unnamed: 0,selected_tweets,id,text,hashtags_cnt,hashtags,created_at,tw_date,place_id,full_name,name,...,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,huge thank you to peer project for allowing me...,1200923972742848512,Huge thank you to @Empowerment_ #peer4U projec...,2,"peer4U,codesign",2019-11-30T23:46:01.000Z,2019-11-30,13b1318251fcfc7c,"Bilsborrow, England",Bilsborrow,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,woohoo first new friend since moving to liverp...,1200922864142241793,Woohoo! First new friend since moving to Liver...,0,,2019-11-30T23:41:37.000Z,2019-11-30,151b9e91272233d1,"Liverpool, England",Liverpool,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,had anxiety about conceiving for couple of rea...,1200918723449606146,I had anxiety about conceiving for a couple of...,0,,2019-11-30T23:25:10.000Z,2019-11-30,38eabeb176f29378,"East Kilbride, Scotland",East Kilbride,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wouldn even know where to look imo there is fa...,1200914479661359104,@JBattye I wouldn’t even know where to look! I...,0,,2019-11-30T23:08:18.000Z,2019-11-30,778909dfad43f3d6,"Huddersfield, England",Huddersfield,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bloody hell asda shopping due between pm it no...,1200914017679814661,Bloody hell Asda @AsdaServiceTeam shopping due...,0,,2019-11-30T23:06:28.000Z,2019-11-30,6da127116b06cece,"Upchurch, England",Upchurch,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Counting total number of tweets per date 

pivot_table = result.pivot_table(
     index='tw_date',
     values='id',
     aggfunc= len).reset_index()

pivot_table.columns = ['tw_date','count_of_tweets']

pivot_table.head()

Unnamed: 0,tw_date,count_of_tweets
0,2019-11-27,18
1,2019-11-28,199
2,2019-11-29,162
3,2019-11-30,112
4,2019-12-16,835


In [35]:
# Aggregating data by date and get mean of lexical categories 

mean_pivot_table = result.pivot_table(
    index=[ "tw_date"], # "name"], 
    values=tot_cat,
    aggfunc=np.mean).reset_index()
mean_pivot_table

Unnamed: 0,tw_date,air_travel,banking,death,divine,envy,gain,independence,injury,masculine,...,payment,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship
0,2019-11-27,0.000000,0.111111,0.722222,0.000000,0.000000,0.000000,0.000000,0.444444,0.000000,...,0.277778,0.000000,0.166667,0.000000,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000
1,2019-11-28,0.035176,0.030151,0.175879,0.030151,0.030151,0.035176,0.030151,0.226131,0.050251,...,0.050251,0.005025,0.105528,0.050251,0.140704,0.060302,0.050251,0.025126,0.035176,0.025126
2,2019-11-29,0.055556,0.104938,0.209877,0.067901,0.000000,0.061728,0.043210,0.197531,0.012346,...,0.080247,0.061728,0.098765,0.037037,0.141975,0.080247,0.030864,0.012346,0.030864,0.049383
3,2019-11-30,0.044643,0.053571,0.080357,0.044643,0.008929,0.035714,0.026786,0.232143,0.053571,...,0.062500,0.026786,0.133929,0.017857,0.098214,0.071429,0.044643,0.008929,0.053571,0.008929
4,2019-12-16,0.023952,0.035928,0.287425,0.023952,0.035928,0.065868,0.035928,0.221557,0.035928,...,0.059880,0.017964,0.131737,0.053892,0.155689,0.053892,0.059880,0.011976,0.035928,0.011976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2021-01-10,0.005988,0.041916,0.359281,0.023952,0.023952,0.011976,0.023952,0.317365,0.023952,...,0.035928,0.041916,0.083832,0.035928,0.125749,0.035928,0.089820,0.005988,0.047904,0.011976
60,2021-01-11,0.012121,0.036364,0.218182,0.054545,0.012121,0.042424,0.006061,0.230303,0.036364,...,0.048485,0.024242,0.096970,0.018182,0.133333,0.054545,0.048485,0.030303,0.030303,0.024242
61,2021-01-12,0.031746,0.032880,0.142857,0.020408,0.013605,0.032880,0.013605,0.207483,0.026077,...,0.032880,0.066893,0.123583,0.006803,0.171202,0.045351,0.112245,0.032880,0.019274,0.006803
62,2021-01-13,0.033149,0.055249,0.270718,0.044199,0.011050,0.049724,0.027624,0.309392,0.049724,...,0.060773,0.071823,0.060773,0.033149,0.121547,0.038674,0.088398,0.011050,0.033149,0.027624


In [36]:
mean_pivot_table['count'] = mean_pivot_table.tw_date.map(
   pivot_table.set_index('tw_date').count_of_tweets)

In [37]:
mean_pivot_table

Unnamed: 0,tw_date,air_travel,banking,death,divine,envy,gain,independence,injury,masculine,...,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship,count
0,2019-11-27,0.000000,0.111111,0.722222,0.000000,0.000000,0.000000,0.000000,0.444444,0.000000,...,0.000000,0.166667,0.000000,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000,18
1,2019-11-28,0.035176,0.030151,0.175879,0.030151,0.030151,0.035176,0.030151,0.226131,0.050251,...,0.005025,0.105528,0.050251,0.140704,0.060302,0.050251,0.025126,0.035176,0.025126,199
2,2019-11-29,0.055556,0.104938,0.209877,0.067901,0.000000,0.061728,0.043210,0.197531,0.012346,...,0.061728,0.098765,0.037037,0.141975,0.080247,0.030864,0.012346,0.030864,0.049383,162
3,2019-11-30,0.044643,0.053571,0.080357,0.044643,0.008929,0.035714,0.026786,0.232143,0.053571,...,0.026786,0.133929,0.017857,0.098214,0.071429,0.044643,0.008929,0.053571,0.008929,112
4,2019-12-16,0.023952,0.035928,0.287425,0.023952,0.035928,0.065868,0.035928,0.221557,0.035928,...,0.017964,0.131737,0.053892,0.155689,0.053892,0.059880,0.011976,0.035928,0.011976,835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2021-01-10,0.005988,0.041916,0.359281,0.023952,0.023952,0.011976,0.023952,0.317365,0.023952,...,0.041916,0.083832,0.035928,0.125749,0.035928,0.089820,0.005988,0.047904,0.011976,835
60,2021-01-11,0.012121,0.036364,0.218182,0.054545,0.012121,0.042424,0.006061,0.230303,0.036364,...,0.024242,0.096970,0.018182,0.133333,0.054545,0.048485,0.030303,0.030303,0.024242,825
61,2021-01-12,0.031746,0.032880,0.142857,0.020408,0.013605,0.032880,0.013605,0.207483,0.026077,...,0.066893,0.123583,0.006803,0.171202,0.045351,0.112245,0.032880,0.019274,0.006803,882
62,2021-01-13,0.033149,0.055249,0.270718,0.044199,0.011050,0.049724,0.027624,0.309392,0.049724,...,0.071823,0.060773,0.033149,0.121547,0.038674,0.088398,0.011050,0.033149,0.027624,1086


In [38]:
mean_pivot_table.to_csv('/Users/eleanordavies/Desktop/mean_pivot_table.csv', index = False) 