# Data Cleaning 

## Resources used for pre-processing 

- https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

* Check and convert datatypes as appropraite 
* Remove Scotland, Wales and Ireland (we just want to explore England)
* Remove URLs from the tweets
* Tokenize text
* Remove emails
* Remove new lines characters
* Remove distracting single quotes
* Remove all punctuation signs
* Lowercase all text
* Detokenize text
* Convert list of texts to Numpy array

In [1]:
# pip -- install nltk 

# Importing relevant libraries 
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim
from gensim.utils import simple_preprocess
print('Done')

Done


In [4]:
file = '/Users/eleanordavies/Desktop/HDS_Content/Term_2/Data_Challenge/Tweet_Data/Meltwater/Anxiety/meltwater_anxiety.csv'

In [5]:
# Load data 
tweet_raw = pd.read_csv(file)
tweet_raw

Unnamed: 0.1,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,Subregion,...,Twitter Screen Name,User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City,Document Tags
0,1,19-Feb-2020 11:58PM,,http://twitter.com/Ephi_BL/statuses/1230280629...,,So I want to talk about an element of masking ...,Twitter,@ephi_bl,United Kingdom,,...,Ephi??,https://twitter.com/Ephi_BL,Twitch: http://twitch.tv/ephibl \nYoutube: ht...,1999.0,430.0,"Feb 19, 2020",23:58:00,England,London,
1,2,19-Feb-2020 11:58PM,,http://twitter.com/jaestalgia/statuses/1230280...,,i’m talking with an english boy on ig and can ...,Twitter,@jaestalgia,United Kingdom,,...,bennie misses day6 🖇 REVEAL,https://twitter.com/jaestalgia,#박제형 ;; i talk to the 𝙢𝙤𝙤𝙣 about my 𝙨𝙪𝙣 . . . ...,547.0,763.0,"Feb 19, 2020",23:58:00,England,,
2,3,19-Feb-2020 11:57PM,,http://twitter.com/unfitmotherblog/statuses/12...,,#CarolineFlack has truly rocked me. The slight...,Twitter,@unfitmotherblog,United Kingdom,,...,The Unfit Mother,https://twitter.com/unfitmotherblog,"One of those Pinot drinking, blogging, foul-mo...",1009.0,976.0,"Feb 19, 2020",23:57:00,England,,
3,4,19-Feb-2020 11:53PM,,http://twitter.com/Hails33251204/statuses/1230...,,Great 1/2 term so far & no work done. This is ...,Twitter,@hails33251204,United Kingdom,,...,Hails,https://twitter.com/Hails33251204,"Mum of 3 demanding boys, wife, teacher, travel...",92.0,104.0,"Feb 19, 2020",23:53:00,England,,
4,5,19-Feb-2020 11:52PM,,http://twitter.com/RockingMisfit/statuses/1230...,,"I redrew this with a bit of my own spin on it,...",Twitter,@rockingmisfit,United Kingdom,,...,alex,https://twitter.com/RockingMisfit,Living life 🔥🤘🏼,150.0,540.0,"Feb 19, 2020",23:52:00,England,Reading,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31944,31945,03-Feb-2021 12:04AM,,https://twitter.com/cleavagequeenx/statuses/13...,,I’ve got up 3000 followers in one night on Tik...,Twitter,@cleavagequeenx,United Kingdom,,...,Gemma 💎,https://twitter.com/cleavagequeenx,"Boobs, Booty & Bunnies 🐰 • Glamour Model | FIN...",55767.0,992.0,"Feb 3, 2021",00:04:00,England,London,
31945,31946,03-Feb-2021 12:03AM,,https://twitter.com/_130cortes/statuses/135675...,,anxiety social or anxiety in general is the wo...,Twitter,@_130cortes,United Kingdom,,...,🐉,https://twitter.com/_130cortes,minor rt and sometimes vent with sh //// sorry...,58.0,138.0,"Feb 3, 2021",00:03:00,England,,
31946,31947,03-Feb-2021 12:02AM,,http://twitter.com/blogmhad/statuses/135675478...,,New blog online #mentalhealth #anxiety #bpd #d...,Twitter,@blogmhad,United Kingdom,,...,Mental Health and Dyslexia Blog 💙,http://www.twitter.com/blogmhad,Blog about a service users experience with men...,273.0,133.0,"Feb 3, 2021",00:02:00,England,Southwick,
31947,31948,03-Feb-2021 12:01AM,,https://twitter.com/jxckallen_/statuses/135675...,,i applied for a job earlier and i think it’s c...,Twitter,@jxckallen_,United Kingdom,,...,jack :),https://twitter.com/jxckallen_,yuck |~| he/him |-| 16,221.0,524.0,"Feb 3, 2021",00:01:00,England,Birmingham,


In [6]:
# Checking datatypes for each col
tweet_raw.dtypes

Unnamed: 0                 int64
Date                      object
Headline                 float64
URL                       object
Opening Text             float64
Hit Sentence              object
Source                    object
Influencer                object
Country                   object
Subregion                float64
Language                  object
Reach                      int64
Desktop Reach              int64
Mobile Reach               int64
Twitter Social Echo      float64
Facebook Social Echo     float64
Reddit Social Echo       float64
National Viewership        int64
Engagement               float64
AVE                      float64
Sentiment                 object
Key Phrases               object
Input Name                object
Keywords                  object
Twitter Authority        float64
Tweet Id                  object
Twitter Id               float64
Twitter Client            object
Twitter Screen Name       object
User Profile Url          object
Twitter Bi

In [44]:
# converting datatypes to more reable format 
tweet_raw['Date'] = pd.to_datetime(tweet_raw['Date']).dt.normalize()
tweet_raw

Unnamed: 0.1,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,Subregion,...,Twitter Screen Name,User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City,Document Tags
0,1,2020-02-19,,http://twitter.com/Ephi_BL/statuses/1230280629...,,So I want to talk about an element of masking ...,Twitter,@ephi_bl,United Kingdom,,...,Ephi??,https://twitter.com/Ephi_BL,Twitch: http://twitch.tv/ephibl \nYoutube: ht...,1999.0,430.0,"Feb 19, 2020",23:58:00,England,London,
1,2,2020-02-19,,http://twitter.com/jaestalgia/statuses/1230280...,,i’m talking with an english boy on ig and can ...,Twitter,@jaestalgia,United Kingdom,,...,bennie misses day6 🖇 REVEAL,https://twitter.com/jaestalgia,#박제형 ;; i talk to the 𝙢𝙤𝙤𝙣 about my 𝙨𝙪𝙣 . . . ...,547.0,763.0,"Feb 19, 2020",23:58:00,England,,
2,3,2020-02-19,,http://twitter.com/unfitmotherblog/statuses/12...,,#CarolineFlack has truly rocked me. The slight...,Twitter,@unfitmotherblog,United Kingdom,,...,The Unfit Mother,https://twitter.com/unfitmotherblog,"One of those Pinot drinking, blogging, foul-mo...",1009.0,976.0,"Feb 19, 2020",23:57:00,England,,
3,4,2020-02-19,,http://twitter.com/Hails33251204/statuses/1230...,,Great 1/2 term so far & no work done. This is ...,Twitter,@hails33251204,United Kingdom,,...,Hails,https://twitter.com/Hails33251204,"Mum of 3 demanding boys, wife, teacher, travel...",92.0,104.0,"Feb 19, 2020",23:53:00,England,,
4,5,2020-02-19,,http://twitter.com/RockingMisfit/statuses/1230...,,"I redrew this with a bit of my own spin on it,...",Twitter,@rockingmisfit,United Kingdom,,...,alex,https://twitter.com/RockingMisfit,Living life 🔥🤘🏼,150.0,540.0,"Feb 19, 2020",23:52:00,England,Reading,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31944,31945,2021-02-03,,https://twitter.com/cleavagequeenx/statuses/13...,,I’ve got up 3000 followers in one night on Tik...,Twitter,@cleavagequeenx,United Kingdom,,...,Gemma 💎,https://twitter.com/cleavagequeenx,"Boobs, Booty & Bunnies 🐰 • Glamour Model | FIN...",55767.0,992.0,"Feb 3, 2021",00:04:00,England,London,
31945,31946,2021-02-03,,https://twitter.com/_130cortes/statuses/135675...,,anxiety social or anxiety in general is the wo...,Twitter,@_130cortes,United Kingdom,,...,🐉,https://twitter.com/_130cortes,minor rt and sometimes vent with sh //// sorry...,58.0,138.0,"Feb 3, 2021",00:03:00,England,,
31946,31947,2021-02-03,,http://twitter.com/blogmhad/statuses/135675478...,,New blog online #mentalhealth #anxiety #bpd #d...,Twitter,@blogmhad,United Kingdom,,...,Mental Health and Dyslexia Blog 💙,http://www.twitter.com/blogmhad,Blog about a service users experience with men...,273.0,133.0,"Feb 3, 2021",00:02:00,England,Southwick,
31947,31948,2021-02-03,,https://twitter.com/jxckallen_/statuses/135675...,,i applied for a job earlier and i think it’s c...,Twitter,@jxckallen_,United Kingdom,,...,jack :),https://twitter.com/jxckallen_,yuck |~| he/him |-| 16,221.0,524.0,"Feb 3, 2021",00:01:00,England,Birmingham,


In [45]:
# Creating a function to remove characters 

def depure_data(data):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

    # Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub("\'", "", data)
        
    return data

In [46]:
temp = []
#Splitting twtter data text column to list
data_to_list = tweet_raw['Hit Sentence'].values.tolist()

#applying the above function 
for i in range(len(data_to_list)):
    temp.append(depure_data(data_to_list[i]))

tempdf = pd.DataFrame(temp)
len(data_to_list)

31949

In [11]:
# Tokenize and removal all punctation, emojis and puts text into lowercase 

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
data_words = list(sent_to_words(temp))

print(data_words[:10])

[['so', 'want', 'to', 'talk', 'about', 'an', 'element', 'of', 'masking', 'for', 'the', 'late', 'discovered', 'diagnosed', 'just', 'successfully', 'masked', 'my', 'way', 'through', 'some', 'small', 'talk', 'the', 'anxiety', 'was', 'there', 'beforehand', 'played', 'my', 'part', 'and', 'in', 'the', 'end', 'we', 'had', 'sum', 'positive', 'interaction'], ['talking', 'with', 'an', 'english', 'boy', 'on', 'ig', 'and', 'can', 'believe', 'sweating', 'for', 'anxiety', 'cause', 'scared', 'will', 'make', 'some', 'english', 'mistakes'], ['carolineflack', 'has', 'truly', 'rocked', 'me', 'the', 'slightest', 'negative', 'comment', 'and', 'feel', 'anxious', 'but', 'maybe', 'this', 'isnt', 'bad', 'thing', 'cant', 'help', 'feel', 'though', 'that', 'anyone', 'championing', 'caroline', 'by', 'slagging', 'off', 'or', 'is', 'ignorantly', 'adding', 'to', 'the', 'problem'], ['great', 'term', 'so', 'far', 'no', 'work', 'done', 'this', 'is', 'causing', 'anxiety', 'but', 'im', 'determined', 'to', 'put', 'family',

In [47]:
# detokenise returns them back into a sentence 

def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [48]:
data_detoken = []
for i in range(len(data_words)):
    data_detoken.append(detokenize(data_words[i]))
data_detoken = np.array(data_detoken)
data_detoken_df = pd.DataFrame(data_detoken)
data_detoken_df

Unnamed: 0,0
0,so want to talk about an element of masking fo...
1,talking with an english boy on ig and can beli...
2,carolineflack has truly rocked me the slightes...
3,great term so far no work done this is causing...
4,redrew this with bit of my own spin on it the ...
...,...
31944,ve got up followers in one night on tiktok fee...
31945,anxiety social or anxiety in general is the wo...
31946,new blog online mentalhealth anxiety bpd depre...
31947,applied for job earlier and think it confirmed...


In [49]:
# add cleaned tweet text as column 'selected_tweets' to original dataframe 
frames = [data_detoken_df,tweet_raw]
demo_tweets_clean = pd.concat(frames, axis=1, join='inner')
demo_tweets_clean = demo_tweets_clean.rename(columns={0: "selected_tweets"})
print("Tweets Clean!")
demo_tweets_clean

Tweets Clean!


Unnamed: 0.1,selected_tweets,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,...,Twitter Screen Name,User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City,Document Tags
0,so want to talk about an element of masking fo...,1,2020-02-19,,http://twitter.com/Ephi_BL/statuses/1230280629...,,So I want to talk about an element of masking ...,Twitter,@ephi_bl,United Kingdom,...,Ephi??,https://twitter.com/Ephi_BL,Twitch: http://twitch.tv/ephibl \nYoutube: ht...,1999.0,430.0,"Feb 19, 2020",23:58:00,England,London,
1,talking with an english boy on ig and can beli...,2,2020-02-19,,http://twitter.com/jaestalgia/statuses/1230280...,,i’m talking with an english boy on ig and can ...,Twitter,@jaestalgia,United Kingdom,...,bennie misses day6 🖇 REVEAL,https://twitter.com/jaestalgia,#박제형 ;; i talk to the 𝙢𝙤𝙤𝙣 about my 𝙨𝙪𝙣 . . . ...,547.0,763.0,"Feb 19, 2020",23:58:00,England,,
2,carolineflack has truly rocked me the slightes...,3,2020-02-19,,http://twitter.com/unfitmotherblog/statuses/12...,,#CarolineFlack has truly rocked me. The slight...,Twitter,@unfitmotherblog,United Kingdom,...,The Unfit Mother,https://twitter.com/unfitmotherblog,"One of those Pinot drinking, blogging, foul-mo...",1009.0,976.0,"Feb 19, 2020",23:57:00,England,,
3,great term so far no work done this is causing...,4,2020-02-19,,http://twitter.com/Hails33251204/statuses/1230...,,Great 1/2 term so far & no work done. This is ...,Twitter,@hails33251204,United Kingdom,...,Hails,https://twitter.com/Hails33251204,"Mum of 3 demanding boys, wife, teacher, travel...",92.0,104.0,"Feb 19, 2020",23:53:00,England,,
4,redrew this with bit of my own spin on it the ...,5,2020-02-19,,http://twitter.com/RockingMisfit/statuses/1230...,,"I redrew this with a bit of my own spin on it,...",Twitter,@rockingmisfit,United Kingdom,...,alex,https://twitter.com/RockingMisfit,Living life 🔥🤘🏼,150.0,540.0,"Feb 19, 2020",23:52:00,England,Reading,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31944,ve got up followers in one night on tiktok fee...,31945,2021-02-03,,https://twitter.com/cleavagequeenx/statuses/13...,,I’ve got up 3000 followers in one night on Tik...,Twitter,@cleavagequeenx,United Kingdom,...,Gemma 💎,https://twitter.com/cleavagequeenx,"Boobs, Booty & Bunnies 🐰 • Glamour Model | FIN...",55767.0,992.0,"Feb 3, 2021",00:04:00,England,London,
31945,anxiety social or anxiety in general is the wo...,31946,2021-02-03,,https://twitter.com/_130cortes/statuses/135675...,,anxiety social or anxiety in general is the wo...,Twitter,@_130cortes,United Kingdom,...,🐉,https://twitter.com/_130cortes,minor rt and sometimes vent with sh //// sorry...,58.0,138.0,"Feb 3, 2021",00:03:00,England,,
31946,new blog online mentalhealth anxiety bpd depre...,31947,2021-02-03,,http://twitter.com/blogmhad/statuses/135675478...,,New blog online #mentalhealth #anxiety #bpd #d...,Twitter,@blogmhad,United Kingdom,...,Mental Health and Dyslexia Blog 💙,http://www.twitter.com/blogmhad,Blog about a service users experience with men...,273.0,133.0,"Feb 3, 2021",00:02:00,England,Southwick,
31947,applied for job earlier and think it confirmed...,31948,2021-02-03,,https://twitter.com/jxckallen_/statuses/135675...,,i applied for a job earlier and i think it’s c...,Twitter,@jxckallen_,United Kingdom,...,jack :),https://twitter.com/jxckallen_,yuck |~| he/him |-| 16,221.0,524.0,"Feb 3, 2021",00:01:00,England,Birmingham,


# Empath 

## Empath Resources 

- https://github.com/Ejhfast/empath-client
- https://www.tandfonline.com/doi/abs/10.1080/09638237.2020.1739251

In [50]:
import empath
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from empath import Empath

In [51]:
# creating the lexicon object 
# list of categories within empath 
lexicon = Empath()
# list_cat = ['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebration', 'restaurant', 'violence', 'programming', 'dominant_heirarchical', 'military', 'neglect', 'swimming', 'exotic', 'love', 'hiking', 'communication', 'hearing', 'order', 'sympathy', 'hygiene', 'weather', 'anonymity', 'trust', 'ancient', 'deception', 'fabric', 'air_travel', 'fight', 'dominant_personality', 'music', 'vehicle', 'politeness', 'toy', 'farming', 'meeting', 'war', 'speaking', 'listen', 'urban', 'shopping', 'disgust', 'fire', 'tool', 'phone', 'gain', 'sound', 'injury', 'sailing', 'rage', 'science', 'work', 'appearance', 'valuable', 'warmth', 'youth', 'sadness', 'fun', 'emotional', 'joy', 'affection', 'traveling', 'fashion', 'ugliness', 'lust', 'shame', 'torment', 'economics', 'anger', 'politics', 'ship', 'clothing', 'car', 'strength', 'technology', 'breaking', 'shape_and_size', 'power', 'white_collar_job', 'animal', 'party', 'terrorism', 'smell', 'disappointment', 'poor', 'plant', 'pain', 'beauty', 'timidity', 'philosophy', 'negotiate', 'negative_emotion', 'cleaning', 'messaging', 'competing', 'law', 'friends', 'payment', 'achievement', 'alcohol', 'liquid', 'feminine', 'weapon', 'children', 'monster', 'ocean', 'giving', 'contentment', 'writing', 'rural', 'positive_emotion', 'musical', 'colors', 'id', 'injury and death', 'demo', 'what']
pos_cat = ["worship", "masculine", "death", "weakness", "divine", "religion", "sleep", "swearing_terms", "injury", "envy"]
neg_cat = ["gain", "reading", "banking", "independence", "programming", "payment", "technology", "tourism", "air_travel", "negotiate"]
tot_cat = pos_cat + neg_cat 

In [52]:
# defining function to analyse sentences 
# can be altered to add category and normalise values 
# may need to specifiy categories 

def empath_analyse(sentence):
    x = lexicon.analyze(sentence, categories=tot_cat)
    return x

## Running tweets through empath 
- Create an array of the 'cleaned' text from the tweets 
- Run a for loop to analyse tweets usinh emapth_analyse function 
- Convert data dictionary into dataframe  
- Append data dictionary dataframe to the exsiting tweet dataframe 

In [53]:
data_to_list = demo_tweets_clean['selected_tweets'].values.tolist()
data_to_list = np.array(data_to_list)

In [54]:
tweets =[]
for i in range(len(data_to_list)):
    tweets.append(empath_analyse(data_to_list[i])),
empath_dict = pd.DataFrame.from_dict(tweets)
empath_dict

Unnamed: 0,worship,masculine,death,weakness,divine,religion,sleep,swearing_terms,injury,envy,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31945,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
tweet_empath_df = pd.concat([demo_tweets_clean, empath_dict], axis=1, join="inner")
print("Emapth Analysis Complete!")

Emapth Analysis Complete!


In [56]:
# result["id"] = result.index
# saving non aggregated dataset of empath analysed tweets 
tweet_empath_df.to_csv('/Users/eleanordavies/Desktop/HDS_Content/Term_2/Data_Challenge/Tweet_Data/Meltwater/Anxiety/melt_anxiety_empath.csv', index = False) 
tweet_empath_df

Unnamed: 0.1,selected_tweets,Unnamed: 0,Date,Headline,URL,Opening Text,Hit Sentence,Source,Influencer,Country,...,gain,reading,banking,independence,programming,payment,technology,tourism,air_travel,negotiate
0,so want to talk about an element of masking fo...,1,2020-02-19,,http://twitter.com/Ephi_BL/statuses/1230280629...,,So I want to talk about an element of masking ...,Twitter,@ephi_bl,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,talking with an english boy on ig and can beli...,2,2020-02-19,,http://twitter.com/jaestalgia/statuses/1230280...,,i’m talking with an english boy on ig and can ...,Twitter,@jaestalgia,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,carolineflack has truly rocked me the slightes...,3,2020-02-19,,http://twitter.com/unfitmotherblog/statuses/12...,,#CarolineFlack has truly rocked me. The slight...,Twitter,@unfitmotherblog,United Kingdom,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,great term so far no work done this is causing...,4,2020-02-19,,http://twitter.com/Hails33251204/statuses/1230...,,Great 1/2 term so far & no work done. This is ...,Twitter,@hails33251204,United Kingdom,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,redrew this with bit of my own spin on it the ...,5,2020-02-19,,http://twitter.com/RockingMisfit/statuses/1230...,,"I redrew this with a bit of my own spin on it,...",Twitter,@rockingmisfit,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31944,ve got up followers in one night on tiktok fee...,31945,2021-02-03,,https://twitter.com/cleavagequeenx/statuses/13...,,I’ve got up 3000 followers in one night on Tik...,Twitter,@cleavagequeenx,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31945,anxiety social or anxiety in general is the wo...,31946,2021-02-03,,https://twitter.com/_130cortes/statuses/135675...,,anxiety social or anxiety in general is the wo...,Twitter,@_130cortes,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31946,new blog online mentalhealth anxiety bpd depre...,31947,2021-02-03,,http://twitter.com/blogmhad/statuses/135675478...,,New blog online #mentalhealth #anxiety #bpd #d...,Twitter,@blogmhad,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31947,applied for job earlier and think it confirmed...,31948,2021-02-03,,https://twitter.com/jxckallen_/statuses/135675...,,i applied for a job earlier and i think it’s c...,Twitter,@jxckallen_,United Kingdom,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Counting total number of tweets per date 
pivot_table = tweet_empath_df.pivot_table(
     index='Date',
     values='selected_tweets',
     aggfunc= len).reset_index()

pivot_table.columns = ['Date','count_of_tweets']

pivot_table.head()

Unnamed: 0,Date,count_of_tweets
0,2020-02-19,1024
1,2020-03-04,25
2,2020-03-18,3784
3,2020-04-01,1946
4,2020-04-15,1409


In [62]:
# Aggregating data by date and get mean of lexical categories 

mean_pivot_table = tweet_empath_df.pivot_table(
    index=[ "Date", "City"], 
    values=tot_cat,
    aggfunc=np.mean).reset_index()
mean_pivot_table

Unnamed: 0,Date,City,air_travel,banking,death,divine,envy,gain,independence,injury,...,payment,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship
0,2020-02-19,Accrington,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,0.0000,0.0,1.0000,0.0,0.000,0.0,1.0,0.0
1,2020-02-19,Acton,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,1.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0
2,2020-02-19,Altrincham,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0
3,2020-02-19,Ashbourne,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,1.0000,0.0,1.0000,0.0,0.000,0.0,0.0,0.0
4,2020-02-19,Ashford,0.0,0.000,2.0000,0.0,0.0,0.0000,0.0000,1.0000,...,0.000,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5344,2021-02-03,Wokingham,0.0,0.000,0.5000,0.0,0.0,0.0000,0.0000,0.5000,...,0.000,0.0000,0.0000,0.0,0.5000,0.0,0.500,0.0,0.0,0.0
5345,2021-02-03,Woodbridge,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,0.0000,1.0,0.0000,0.0,0.000,0.0,0.0,0.0
5346,2021-02-03,Worcester,0.0,0.000,1.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0
5347,2021-02-03,Yate,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.000,0.0000,0.0000,1.0,0.0000,0.0,0.000,0.0,1.0,0.0


In [63]:
mean_pivot_table['count'] = mean_pivot_table.Date.map(
   pivot_table.set_index('Date').count_of_tweets)

In [64]:
mean_pivot_table

Unnamed: 0,Date,City,air_travel,banking,death,divine,envy,gain,independence,injury,...,programming,reading,religion,sleep,swearing_terms,technology,tourism,weakness,worship,count
0,2020-02-19,Accrington,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0,1.0000,0.0,0.000,0.0,1.0,0.0,1024
1,2020-02-19,Acton,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,1.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0,1024
2,2020-02-19,Altrincham,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0,1024
3,2020-02-19,Ashbourne,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,1.0000,0.0,1.0000,0.0,0.000,0.0,0.0,0.0,1024
4,2020-02-19,Ashford,0.0,0.000,2.0000,0.0,0.0,0.0000,0.0000,1.0000,...,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0,1024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5344,2021-02-03,Wokingham,0.0,0.000,0.5000,0.0,0.0,0.0000,0.0000,0.5000,...,0.0000,0.0000,0.0,0.5000,0.0,0.500,0.0,0.0,0.0,1095
5345,2021-02-03,Woodbridge,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,1.0,0.0000,0.0,0.000,0.0,0.0,0.0,1095
5346,2021-02-03,Worcester,0.0,0.000,1.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0,0.0000,0.0,0.000,0.0,0.0,0.0,1095
5347,2021-02-03,Yate,0.0,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,1.0,0.0000,0.0,0.000,0.0,1.0,0.0,1095


In [65]:
mean_pivot_table.to_csv('/Users/eleanordavies/Desktop/HDS_Content/Term_2/Data_Challenge/Tweet_Data/Meltwater/Anxiety/mean_pivot_table_melt_anxietyB.csv', index = False) 