In [1]:
import os
import pandas as pd
from numpy import random

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import tweepy
import re

import matplotlib.pyplot as plt
import seaborn as sns

from langdetect import detect
from langid import classify
from spellchecker import SpellChecker

from textblob.translate import Translator
from textblob import TextBlob
from textblob_nl import PatternTagger, PatternAnalyzer

import time as t

import spacy

twitter_c = [0.11, 0.63, 0.95]  # colour triple (presumably RGB for plots -- confirm)

# Show full cell contents instead of truncating at the default of 50 characters.
# pandas >= 1.0 expects None for "no limit" (-1 was deprecated and is rejected by
# current releases); older releases only accept an integer, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

data = 'DATA/'        # folder for the intermediate and final csv files
pickles = 'pickles/'  # folder for the pickle checkpoints

# Twitter Data


## Functions

In [2]:
def replace_url(txt, sub = '||U||'):
    """Replace URL-like tokens in a text string and normalise whitespace.

    The pattern makes the ``http://`` / ``https://`` scheme optional, so it also
    matches scheme-less tokens that contain a dot followed by 2-6 letters
    (e.g. ``www.site.nl``).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls.
    sub : string, optional
        Replacement for every match. Defaults to '||U||'; pass '' to drop the
        URL entirely.

    Returns
    -------
    The same txt string with url's replaced and every run of whitespace
    collapsed to a single space.
    """
    # Raw string: the pattern is unchanged, but the backslashes are no longer
    # (invalid) Python string escapes.
    url_pattern = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
    return " ".join(re.sub(url_pattern, sub, txt).split())

def replace_tags(txt, sub = "||T||"):
    """Replace @-mentions in ``txt`` with ``sub`` and collapse whitespace.

    A mention is '@' followed by ASCII letters/digits only, so anything after
    an underscore in a handle is left in place.
    """
    substituted = re.sub('(@[a-zA-Z0-9]+)', sub, txt)
    tokens = substituted.split()
    return " ".join(tokens)

def check_inst(txt, inst):
    """Return 1 when at least one whitespace-separated token of ``txt`` is in ``inst``, else 0.

    Tokens are compared exactly as they appear (case-sensitive, punctuation included).
    """
    return 1 if any(token in inst for token in txt.split()) else 0


def nl_sent(text):
    '''
    text: string in Dutch
    returns: the first element of the sentiment that TextBlob computes with the
             PatternAnalyzer from textblob_nl (presumably the polarity -- confirm)
    '''
    blob = TextBlob(text, analyzer = PatternAnalyzer())
    return blob.sentiment[0]

def en_sent(text):
    '''
    text: string in English
    returns: the polarity of the sentiment TextBlob computes for the given text.
             The text is analysed as given; this function does not translate it.
    '''
    blob = TextBlob(text)
    return blob.sentiment.polarity

def remove_punct(text):
    '''
    removes all punctuation from a string

    Letters and digits of any script are kept, so accented characters in Dutch
    words (e.g. "café") survive; whitespace is kept as well. Underscores are
    removed like other punctuation.
    '''
    # [^\w\s] drops everything that is neither a word character nor whitespace;
    # "_" counts as a word character in regex terms, so it is removed explicitly.
    return re.sub(r'[^\w\s]|_', '', text)

def remove_small(text):
    '''
    removes words of 1 or 2 word-characters, together with any non-word
    characters (e.g. the space) directly in front of them
    '''
    short_word = '(\\W*\\b\\w{1,2}\\b)'
    return re.sub(short_word, '', text)

# Dutch stop words from NLTK, stored as a set; used as the default for remove_stopwords.
stopwords_l = set(stopwords.words('dutch'))

def remove_stopwords(text, stopwords = stopwords_l):
    '''
    text: string
    stopwords: (optional) collection of words to drop. default is the Dutch set above.
    returns the text with every whitespace-separated word found in stopwords removed;
    the remaining words are joined with single spaces
    '''
    kept = filter(lambda token: token not in stopwords, text.split())
    return ' '.join(kept)

def small_tweet(text):
    '''
    returns False when the given text has fewer than 3 whitespace-separated
    words and True otherwise (note: True means the tweet is NOT small)
    '''
    return len(text.split()) >= 3
    
def contains_text(text):
    '''
    returns True if the text contains at least one ASCII letter or digit,
    False otherwise'''
    return re.search('([a-zA-Z0-9]+)', text) is not None
    
def detect_lang(x):
    '''
    x: a string of text
    returns the language reported by langdetect's detect(), or None when
    detection fails for this input'''
    try:
        return detect(x)
    except Exception:
        # Best effort: a failed detection must not abort the whole run. A bare
        # "except:" would also swallow KeyboardInterrupt/SystemExit, which makes
        # a long-running cell impossible to stop.
        return None
    
def sentence_checker(sentence, speller):
    '''
    sentence: a string
    speller: a SpellChecker object
    returns the sentence with every whitespace-separated word passed through
    word_checker; the words are joined again with single spaces'''
    corrected = [word_checker(token, speller) for token in sentence.split()]
    return ' '.join(corrected)

def word_checker(word, speller):
    '''
    word: a string containing a single word
    speller: a SpellChecker object
    returns: the word itself when the speller knows it, otherwise the speller's
             correction; when no correction is available the word is returned
             unchanged'''
    if word in speller:
        return word
    cor = speller.correction(word)
    # Newer pyspellchecker releases return None when there is no candidate;
    # falling back to the word keeps the result a string (callers join the words).
    return word if cor is None else cor

def sent_cat(sent, threshold = 0.05):
    '''
    Maps a sentiment score to a category: 1 if above threshold, -1 if below
    -threshold and 0 otherwise (neutral)
    threshold (float) default 0.05, applied symmetrically on the negative side
    '''
    if sent > threshold:
        return 1
    if sent < -threshold:
        return -1
    return 0

def is_pos(x, threshold = 0.05):
    '''returns 1 when x is strictly above threshold, 0 otherwise'''
    return 1 if x > threshold else 0

def is_neg(x, threshold = 0.05):
    '''returns 1 when x is strictly below -threshold, 0 otherwise'''
    return 1 if x < -threshold else 0
    
def margins(freq, margin = 0.05):
    '''
    freq: the frequency of a word occurring in the text
    margin: relative band around freq in which other frequencies count as
            similar, default 0.05
    returns a tuple (lower, upper) = (freq*(1-margin), freq*(1+margin))'''
    return (freq*(1-margin), freq*(margin+1))

# First analysis of the twitter data

## Check spelling checkers
The spelling checker is tested on a sample of tweets to see which words it changes and whether it introduces unwanted corrections.

In [7]:
# years of data
years = ['2014', '2015', '2016', '2017', '2018', '2019']

path = os.getcwd()
data_folder = '15 km radius'

# Column layout of the raw export; only used to build an empty frame when a
# year has no files.
raw_columns = ['date', 'username', 'to', 'replies', 'retweets', 'favorites', 'text',
               'geo', 'mentions', 'hashtags', 'id', 'permalink']

year_data = {}

for year in years:
    print('year: '+year)
    # pay attention where your data is stored
    files = [file for file in os.listdir(os.path.join(path, data_folder)) if year in file]

    # Collect the frames and concatenate once: DataFrame.append copied the whole
    # frame for every file and no longer exists in pandas 2.x.
    frames = []
    n_rows = 0
    for file in files:
        print(file)
        frame = pd.read_csv(os.path.join(path, data_folder, file))
        frames.append(frame)
        n_rows += len(frame)
        print((n_rows, frame.shape[1]))  # cumulative rows read so far

    print('select data')
    if frames:
        tweets = pd.concat(frames, ignore_index = True, sort = False)
    else:
        tweets = pd.DataFrame(data = None, columns = raw_columns)
    tweets = tweets.drop_duplicates()
    print(tweets.shape)
    year_data[year] = tweets

year: 2014
2014 01-01 tot 01-11.csv
(14200, 12)
2014 01-11 tot 01-22.csv
(28500, 12)
2014 01-22 tot 01-25.csv
(33000, 12)
2014 01-25 tot 02-08.csv
(47300, 12)
2014 02-09 tot 02-23.csv
(61600, 12)
2014 02-23 tot 03-14.csv
(75900, 12)
2014 03-13 tot 03-31.csv
(92700, 12)
2014 04-02 tot 04-19.csv
(107000, 12)
2014 04-19 tot 04-20.csv
(108007, 12)
2014 04-20 tot 05-19.csv
(129007, 12)
2014 05-19 tot 05-28.csv
(135143, 12)
2014 05-28 tot 06-14.csv
(149443, 12)
2014 06-15 tot 06-29.csv
(163743, 12)
2014 06-29 tot 07-13.csv
(178043, 12)
2014 07-13 tot 08-02.csv
(192343, 12)
2014 08-01 tot 08-25.csv
(206643, 12)
2014 08-25 tot 09-16.csv
(220943, 12)
2014 09-16 tot 10-15.csv
(235243, 12)
2014 10-15 tot 11-14.csv
(249643, 12)
2014 11-14 tot 11-26.csv
(254843, 12)
2014 11-26 tot 12-31.csv
(269143, 12)
select data
(264710, 12)
year: 2015
2015 01-01 tot 01-29.csv
(9117, 12)
2015 01-29 tot 02-28.csv
(23517, 12)
2015 03-01 tot 03-27.csv
(37817, 12)
2015 03-27 tot 04-26.csv
(52117, 12)
2015 04-25 tot 

In [8]:
# getting the total number of tweets in each year before pre-processing
for year in years:
    print(year+' ', year_data[year].shape)

# One concat instead of repeated DataFrame.append (removed in pandas 2.x).
# The per-year row labels are kept, as before.
tweets = pd.concat([year_data[year] for year in years], sort = False)

2014  (264710, 12)
2015  (79893, 12)
2016  (28821, 12)
2017  (28251, 12)
2018  (25501, 12)
2019  (113505, 12)


In [9]:
random_state = 10
# Reproducible sample of 5000 tweets, with the detected language of each text.
temp = tweets.sample(n = 5000, random_state= random_state)
temp.loc[:,'langdetect'] = temp['text'].apply(detect_lang)

In [10]:
distance = 1 # sets the Levenshtein distance for the spellingchecker

# Dutch checker: no built-in language, the word frequencies come from nl_NL.json
nl_spell = SpellChecker(language = None, distance = distance)
nl_spell.word_frequency.load_dictionary('nl_NL.json')

# extra tokens that should count as known words
nl_ext = ['ff', "'s", 'A2', 'file', 'cda', 'IT', 'CDA', 'mengen', 'kraan', 'NL']
nl_spell.word_frequency.load_words(nl_ext)

# 100 Dutch tweets from the sample, corrected and compared with the original text
test_nl = temp.loc[temp['langdetect'] == 'nl', :].sample(n = 100, random_state = random_state)
test_nl.loc[:, 'corrected'] = test_nl['text'].apply(lambda x: sentence_checker(x, nl_spell))

# 'gelijk' is True when the correction left the text unchanged
test_nl.loc[:,'gelijk'] = test_nl['text'] == test_nl['corrected']
test_nl.loc[~test_nl['gelijk'], ['text', 'corrected']].head(20)

Unnamed: 0,text,corrected
23332,"GOIRLE - SPRAAKBERICHT: https://is.gd/A9fO0W , Details: ...https://is.gd/9qHRTo","GOIRLE - SPRAAKBERICHT: https://is.gd/A9fO0W , details ...https://is.gd/9qHRTo"
105696,@Rickk_013 komt omdat er ook weinig opzit,@Rickk_013 komt omdat er ook weinig opzij
34292,Klopt. En de volgende ochtend weer om 6:30 uur gaat de wekker weer.,klopt En de volgende ochtend weer om 6:30 uur gaat de lekker weer
59521,SBS 6 live uitzending van Maarten. #11Stedenzwemtocht,sms 6 live uitzending van Maarten. #11Stedenzwemtocht
258562,"""@Chris_dfwy: @DweezyKid @Sabajo_ maar alle grappen op een stok, Sherwin lai deng thoo""dat mi no sabie , ik weet van mij","""@Chris_dfwy: @DweezyKid @Sabajo_ maar alle grappen op een stok Sherwin lag denk thoo""dat me nog sabie , ik weet van mij"
178310,Pff wat een gezeik hier. Does ff normaal tegen de #joden. #jodenhaat of #moslimhaat lijdt nergens toe. Gewoon geen haat!,ff wat een gezeik hier doen ff normaal tegen de #joden. #jodenhaat of #moslimhaat lijdt nergens toen Gewoon geen haat
186704,"Bijkomen van een super gezellige kermisavond! Beetje liggen, een kopje koffie en genieten... http://4sq.com/UjPZ1v pic.twitter.com/X7vSfhVNHl",Bijkomen van een super gezellig kermisavond! Beetje liggen een kopje koffie en genieten... http://4sq.com/UjPZ1v pic.twitter.com/X7vSfhVNHl
96200,Vandaag al 14 maandjes met babe @PaulPPF jammer dat ik vandaag niet met je kan zijn schat heel veel plezier i love you,Vandaag al 14 maandjes met baby @PaulPPF jammer dat ik vandaag niet met je kan zijn schat heel veel plezier ik love you
5905,My favorite! Toral! #zweertsschoenen #myfavorites #loafers #shoes #studs #trendy #fashionable #spanish #luxury #panterprint @Zweerts Schoenen Oisterwijk https://www.instagram.com/p/BnMbYd0BAaX/?utm_source=ig_twitter_share&igshid=1d8iqkl9fei0y …,me favorite! Toral! #zweertsschoenen #myfavorites #loafers #shoes #studs #trendy #fashionable #spanish #luxury #panterprint @Zweerts Schoenen Oisterwijk https://www.instagram.com/p/BnMbYd0BAaX/?utm_source=ig_twitter_share&igshid=1d8iqkl9fei0y …
18321,(Directe inzet: A2 5041DN 12 F : Stedekestraat 12 F Tilburg 20109 http://watiserloos.in/melding/14632409/ … #p2000,(Directe inzet A2 5041DN 12 of : Stedekestraat 12 of Tilburg 20109 http://watiserloos.in/melding/14632409/ … #p2000


In [11]:
# setting up the english spellingchecker (default language of SpellChecker)
en_spell = SpellChecker(distance = distance)

# contractions that should count as known words
en_ext = ["i'm", "we're", "I'll", "i'd", "you're"]
en_spell.word_frequency.load_words(en_ext)

# 100 English tweets from the sample, corrected and compared with the original text
test_en = temp.loc[temp['langdetect'] == 'en', :].sample(n = 100, random_state = random_state)
test_en.loc[:, 'corrected'] = test_en['text'].apply(lambda x: sentence_checker(x, en_spell))

# 'gelijk' is True when the correction left the text unchanged
test_en.loc[:,'gelijk'] = test_en['text'] == test_en['corrected']
test_en.loc[~test_en['gelijk'], ['text', 'corrected']].tail(20)

Unnamed: 0,text,corrected
79729,Morning view from the officepic.twitter.com/dO9ZUk2HPd – at DMG Holding B.V.,Morning view from the officepic.twitter.com/dO9ZUk2HPd a at dog Holding q.v.
100805,facking tasty,facing tasty
19693,Enjoy the beautiful trip and sharing with other enthousiasts. Consider my contribution as a reference to over 50% of EV drivers in NL (together about 70% of elec. miles) that have these Tesla travel benefits on a daily base. Building standards. Will be ok.,Enjoy the beautiful trip and sharing with other enthousiasts. Consider my contribution as a reference to over 50% of EV drivers in NL together about 70% of elect miles that have these tessa travel benefits on a daily based Building standards Will be ok
29261,#FlashPoll Tonights movie? #Aquaman Or #MortalEngines,#FlashPoll Tonights movie #Aquaman Or #MortalEngines
2761,With good music comes good food. Best catering on a festival i ever had #goeiepannenkoek… https://www.instagram.com/p/BG2RR1GmfM8/,With good music comes good food Best catering on a festival i ever had #goeiepannenkoek… https://www.instagram.com/p/BG2RR1GmfM8/
23424,"After a great weekend with lots of cake, party and qualitytime with the three of us, it's time… https://www.instagram.com/p/BLYnPA8BB7j/",After a great weekend with lots of cake party and qualitytime with the three of us its time https://www.instagram.com/p/BLYnPA8BB7j/
83567,Met de Eftelfans naar @pathe #it #itchapter2 #movies #film #bioscoop @Pathé https://www.instagram.com/p/B2Zu0e8ie2m/?igshid=14zvg5w34kltj …,Met de Eftelfans near pathe it #itchapter2 movies film #bioscoop @Pathé https://www.instagram.com/p/B2Zu0e8ie2m/?igshid=14zvg5w34kltj a
9586,DEATH ALLEY. Raaaawk @Roadburn Festival https://www.instagram.com/p/BEOoHv_KOlh/,DEATH alley Raaaawk @Roadburn Festival https://www.instagram.com/p/BEOoHv_KOlh/
68225,Damnnn I would have loved to see fancy photo's of your sushi and cocktails ^^ But guess you really enjoyed it so thats good too ;),Damnnn I would have loved to see fancy photos of your sushi and cocktails ^^ But guess you really enjoyed it so thats good too ;)
86561,you didn't even see it coming that fast!,you didn't even see it coming that fast


# Pre processing the Twitter data for emotions and sentiment
This was originally used to load all data and apply multiple steps at once. Once this was run, the data would be stored in 6 pre-processed files, one for each year.

The first cell only does the pre-processing. After this step the files are saved as "tweets pre-processed <year>.csv" (e.g. "tweets pre-processed 2014.csv"). The next cell then runs the spelling checker and the sentiment analysis, and saves the results as "tweets <year>.csv".
The work was split this way so that not all steps have to be repeated if something goes wrong.

In [12]:
### SpellingChecker objects
## may take one hour to run
distance = 1 # sets the Levenshtein distance for the spellingchecker
# setting up the Dutch spellingchecker
nl_spell = SpellChecker(language = None, distance = distance)
nl_spell.word_frequency.load_dictionary('nl_NL.json')

nl_ext = ['ff', "'s", 'A2', 'file', 'cda', 'IT', 'CDA', 'mengen', 'kraan', 'NL']
nl_spell.word_frequency.load_words(nl_ext)

# setting up the english spellingchecker
en_spell = SpellChecker(distance = distance)

en_ext = ["i'm", "we're", "I'll", "i'd", "you're"]
en_spell.word_frequency.load_words(en_ext)

# years of data
years = ['2014', '2015', '2016', '2017', '2018', '2019']

### Pipeline for selection, pre-processing and language detection.
### (This cell does not run the spelling checker or the sentiment analysis.)
path = os.getcwd()
data_folder = '15 km radius'

# Column layout of the raw export; only used for the empty frame of a year without files.
raw_columns = ['date', 'username', 'to', 'replies', 'retweets', 'favorites', 'text',
               'geo', 'mentions', 'hashtags', 'id', 'permalink']

for year in years:
    print('year: '+year)
    files = [file for file in os.listdir(os.path.join(path, data_folder)) if year in file]

    # Read all files, then concatenate once: DataFrame.append copied the growing
    # frame for every file and no longer exists in pandas 2.x.
    frames = [pd.read_csv(os.path.join(path, data_folder, file)) for file in files]
    if frames:
        temp = pd.concat(frames, ignore_index = True, sort = False)
    else:
        temp = pd.DataFrame(data = None, columns = raw_columns)
    print('shape: ',temp.shape)

    ### Select data
    start = t.time()
    print('select data')
    tweets = temp.drop_duplicates()

    # Removing this account, since it only posts emergency call reports
    p2000 = temp.loc[temp['username'] == 'P2000013', 'id']
    tweets = tweets.loc[~tweets['id'].isin(p2000), :]

    ### Pre-processing
    print('Pre-processing')
    # remove nans in column text; copy so the assignment below writes to an
    # independent frame instead of a slice of the previous one
    tweets = tweets.dropna(axis = 0, subset = ['text']).copy()

    # remove urls
    tweets['text'] = tweets['text'].apply(lambda x: replace_url(x, sub = ''))

    # removes tweets without text
    tweets = tweets.loc[tweets['text'].apply(contains_text).astype(bool), :]

    # removes tweets that have less than 3 words (small_tweet is True for >= 3 words)
    tweets = tweets.loc[tweets['text'].apply(small_tweet).astype(bool), :]

    ### language detection
    print('Language detection')
    tweets = tweets.assign(langdetect = tweets['text'].apply(detect_lang))
    # only Dutch and English tweets
    tweets = tweets.loc[tweets['langdetect'].isin(['nl', 'en']), :]
    print('Final shape: ', tweets.shape)
    print('Finished\nTime: {}'.format(t.time() - start))

    tweets.to_csv(data+'tweets pre-processed '+year+'.csv', index = False)

year: 2014
shape:  (269143, 12)
select data
Pre-processing
Language detection
Final shape:  (157227, 13)
Finished
Time: 2102.5618481636047
year: 2015
shape:  (80317, 12)
select data
Pre-processing
Language detection
Final shape:  (48739, 13)
Finished
Time: 623.4078094959259
year: 2016
shape:  (28846, 12)
select data
Pre-processing
Language detection
Final shape:  (14528, 13)
Finished
Time: 193.7040729522705
year: 2017
shape:  (28251, 12)
select data
Pre-processing
Language detection
Final shape:  (11680, 13)
Finished
Time: 152.8936104774475
year: 2018
shape:  (25559, 12)
select data
Pre-processing
Language detection
Final shape:  (18434, 13)
Finished
Time: 186.30706906318665
year: 2019
shape:  (114195, 12)
select data
Pre-processing
Language detection
Final shape:  (74959, 13)
Finished
Time: 739.2866082191467


In [13]:
# Manually compiled list of words, hashtags and handles; a tweet that contains one of
# them is tagged as talking about institutions.
# Matching is done by check_inst: exact, case-sensitive comparison against the
# whitespace-separated tokens of the tweet.
# Note: '@IFVtweet' and 'debat' appear twice in the list.
institutions = ['#FNV', 'Stadskantoor', 'MWB', 'Burgemeester', 'Burgernet', 'TBV','#MWB', 'beleid', 'gemeentehuis', 
                'provinciehuis', 'verkiezingen', 'raadslid', '@MVO_NL', 'campagneleider', '@raadtilburg', '@gemeentetilburg', 
                'minister', 'campagne','bestuursakkoord', '@D66Brabant', 'SP', 'LST', '@GLTilburg', '@SPTilburg', 'VVD', 
                '@CDATilburg', '@PvdATilburg', '@50pluspartij', '@LokaalTilburg', '#pvv', '@VoorTilburg', '#CDA', 
                '@Onderzoeksraad', '@IFVtweet', 'debat', '#tilburginbeeld', 'europa', '@D66Tilburg', '#D66Tilburg', 
                'coalitieakkoord', '@fontys', '@stationTilburg', 'Theresialyceum', '@Brabant', '#Spoorzone', '#lochal', 
                '#locomotiefhal', '#TilburgU', '#tilburguniversity', '#tiu', '#uvt', '@TilburgUniversity', '#starterslift', 
                '#topinkomens', '@NatuurmuseumBra', 'cultuurbudget', '@uvt_tilburg', '@BerkelEnschot', '@Avanshogeschool', 
                'Midpoint', 'BOM', '@BOMBrabant', '@TiwosTilburg', '@WonenBreburg', '@MidpointBrabant', '#midpointbrabant', 
                '@starterslift', '@nvrewin', '@IFVtweet','regelgeving', '#onderwijs', 'onderwijs', 'debat',
                'Europa','#CDA7','#demonstratie', '@minlnv']

years = ['2014', '2015', '2016', '2017', '2018', '2019']

for year in years:
    start = t.time()
    tweets = pd.read_csv(data+'tweets pre-processed '+year+'.csv')
    tweets.loc[:,'institutions'] = tweets['text'].apply(lambda x: check_inst(x, institutions))

    # Row selectors for institution tweets per language. They depend only on the
    # 'langdetect' and 'institutions' columns, which are not modified below.
    flagged = tweets['institutions'] == 1
    nl_rows = (tweets['langdetect'] == 'nl') & flagged
    en_rows = (tweets['langdetect'] == 'en') & flagged

    ### spellingchecker
    print('spellingchecker\nTime: {}'.format(t.time() - start))
    tweets.loc[nl_rows, 'text'] = tweets.loc[nl_rows, 'text'].apply(lambda x: sentence_checker(x, nl_spell))
    tweets.loc[en_rows, 'text'] = tweets.loc[en_rows, 'text'].apply(lambda x: sentence_checker(x, en_spell))

    ### sentiment analysis (on the corrected text)
    print('sentiment analysis\nTime: {}'.format(t.time() - start))
    tweets.loc[nl_rows, 'sentiment'] = tweets.loc[nl_rows, 'text'].apply(nl_sent)
    tweets.loc[en_rows, 'sentiment'] = tweets.loc[en_rows, 'text'].apply(en_sent)
    tweets.to_csv(data+'tweets '+year+'.csv', index = False)

spellingchecker
Time: 2.3746187686920166
sentiment analysis
Time: 11.151150941848755
spellingchecker
Time: 0.8268132209777832
sentiment analysis
Time: 6.787846803665161
spellingchecker
Time: 0.23735737800598145
sentiment analysis
Time: 4.135928153991699
spellingchecker
Time: 0.22440361976623535
sentiment analysis
Time: 1.2496581077575684
spellingchecker
Time: 0.38795995712280273
sentiment analysis
Time: 1.1160128116607666
spellingchecker
Time: 1.7632853984832764
sentiment analysis
Time: 7.433154582977295


## Read tweets pre-processed for sentiment analysis

In [14]:
years = ['2014', '2015', '2016', '2017', '2018', '2019']

# Read every yearly file and concatenate once. DataFrame.append copied the
# growing frame on each call and no longer exists in pandas 2.x.
frames = []
n_rows = 0
for year in years:
    frame = pd.read_csv(data+'tweets '+year+'.csv', dtype = {'id':'object'})
    frames.append(frame)
    n_rows += len(frame)
    print((n_rows, frame.shape[1]))  # cumulative rows read so far
# The per-file row labels are kept (ignore_index = False), as before.
tweets = pd.concat(frames, ignore_index = False, sort = False)
print('read files')

# Replace the column instead of writing through .loc[:, 'date']: recent pandas
# sets .loc[:, col] in place and would keep the object dtype, after which the
# .dt accessor used later fails.
tweets['date'] = pd.to_datetime(tweets['date'], dayfirst = True, infer_datetime_format = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


(157227, 15)
(205966, 15)
(220494, 15)
(232174, 15)
(250608, 15)
(325567, 15)
read files


# Robustness check of Random wordlist
## Sentiment analysis
Correlate the monthly sentiment with the entrepreneurial output. 
Correlate the sentiment in tweets with randomly chosen words. 

In [15]:
# counts the occurrence of all words in all tweets so that we can take similar words from this list compared to our own list
# of words.
# takes about 8 secs # Get the frequency of all words (lower-cased, basic punctuation stripped)
texts_df = pd.DataFrame(tweets.loc[:,'text'].apply(lambda x: re.sub('[\\\'!?,.:";]', '', x.lower())))
word_count = texts_df.loc[:,'text'].str.split(expand=True).stack().value_counts()
# to_frame names the column explicitly; DataFrame(series, columns=[...]) selects
# by the series name and yields an all-NaN column when that name differs.
word_count_df = word_count.to_frame(name = 'frequency')

# get the frequency of my own words; words that never occur are reported with '-'
institutions_freq = {}
for key in institutions:
    try:
        institutions_freq[key] = word_count[key.lower()]
    except KeyError:
        print(key, '-')

# Get the random words based on my own list of words with a margin of 5%
# The institution words themselves always fall inside their own frequency band,
# so they are excluded from the candidates; a word is also never chosen twice.
excluded = {key.lower() for key in institutions}
random.seed(seed = 10)
rand_bow = []
for word, freq in institutions_freq.items():
    lower, upper = margins(freq)
    in_band = (word_count_df.loc[:,'frequency'] >= lower) & (word_count_df.loc[:,'frequency'] <= upper)
    potential = [candidate for candidate in word_count_df.loc[in_band, :].index
                 if candidate not in excluded and candidate not in rand_bow]
    while potential:
        chosen_word = potential.pop(random.randint(len(potential)))
        # skip candidates containing digits or any of these symbols
        if re.search(r'[0-9-()$%&?!<>/\\.]', chosen_word):
            continue
        rand_bow.append(chosen_word)
        break
    else:
        # pool exhausted without an acceptable candidate
        print(word, 'no random word available')

# put the word list in this list so we have 2 columns which indicates where the tweet belongs to.
tweets.loc[:,'random'] = tweets.loc[:,'text'].apply(lambda x: check_inst(x, rand_bow))
tweets.loc[:, 'institutions'] = tweets.loc[:,'text'].apply(lambda x: check_inst(x, institutions))

In [16]:
# Keep these columns, in this order.
tweets = tweets.loc[:, [
    'date', 'favorites', 'geo', 'hashtags', 'id', 'institutions', 'langdetect', 'mentions',
    'permalink', 'replies', 'retweets', 'sentiment', 'text', 'to', 'username', 'random',
]]

In [17]:
# Sentiment for every tweet, scored with the analyser of its detected language
# (Dutch first, then English).
for lang_code, scorer in (('nl', nl_sent), ('en', en_sent)):
    rows = tweets['langdetect'] == lang_code
    tweets.loc[rows, 'sentiment'] = tweets.loc[rows, 'text'].apply(scorer)

# Wordcount of new word list

## Quarterly and monthly results

In [18]:
# calendar periods of each tweet
dates = tweets['date'].dt
tweets.loc[:,'quarter'] = dates.to_period('Q')
tweets.loc[:,'month'] = dates.to_period('M')
tweets.loc[:,'year'] = dates.to_period('Y')

threshold = 0.05

# 0/1 flags for clearly positive / clearly negative sentiment
tweets.loc[:,'pos'] = tweets['sentiment'].apply(lambda score: is_pos(score, threshold = threshold))
tweets.loc[:,'neg'] = tweets['sentiment'].apply(lambda score: is_neg(score, threshold = threshold))

#### A checkpoint for the data to avoid running previous analysis

In [16]:
#################################################################################################################################

tweets.to_pickle(pickles+'tweets 12-6-2020.pkl')

#################################################################################################################################
# Checkpoint: dataframe with the sentiment of all tweets plus the indicator columns for the random words and the
# institutions word list. Ready to be aggregated per quarter and month.
# TODO: the original note ended mid-sentence ("... and correlate with"); state what the aggregates are correlated with.

In [8]:
# Reload the checkpoint so the cells above do not have to be run again.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you created yourself.
tweets = pd.read_pickle(pickles+'tweets 12-6-2020.pkl')

## An analysis of the pre-processed tweets

In [19]:
# number of English tweets per year
tweets.loc[tweets['langdetect'] == 'en', 'year'].value_counts()

2014    34050
2019    15519
2015    12539
2016    5971 
2017    4913 
2018    4600 
Freq: A-DEC, Name: year, dtype: int64

In [20]:
# number of Dutch tweets per year
tweets.loc[tweets['langdetect'] == 'nl', 'year'].value_counts()

2014    123177
2019    59440 
2015    36200 
2018    13834 
2016    8557  
2017    6767  
Freq: A-DEC, Name: year, dtype: int64

In [21]:
def has_sent(pos, neg):
    '''returns 1 when the pos flag or the neg flag equals 1, otherwise 0'''
    return 1 if (pos == 1 or neg == 1) else 0

# 1 when the tweet is flagged positive or negative, else 0. Computed on whole
# columns; a row-wise apply builds a Series for every tweet and is far slower.
tweets.loc[:,'has_sent'] = ((tweets['pos'] == 1) | (tweets['neg'] == 1)).astype(int)

In [22]:
time = 'month'
# per period, institution tweets only: mean sentiment and share of positive / negative tweets
monthly_sent = (
    tweets.loc[tweets['institutions'] == 1, [time, 'sentiment', 'pos', 'neg']]
    .groupby(time)
    .mean()
)
# per period, institution tweets only: number of tweets flagged positive or negative
has_sent_df = (
    tweets.loc[tweets['institutions'] == 1, [time, 'has_sent']]
    .groupby(time)
    .sum()
)

In [26]:
# columns written to the final sentiment file
temp = tweets.loc[:, [
    'date', 'id', 'institutions', 'langdetect', 'sentiment', 'text', 'username',
    'random', 'quarter', 'month', 'pos', 'neg', 'has_sent',
]]
temp.to_csv(data+'Final sentiment.csv')

# Emotions analysis

In [27]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize.casual import TweetTokenizer

import spacy
import string
from collections import Counter

langs = ['en', 'nl']
# You have to install the right spaCy models in order to lemmatize your text.
lemm_dict = dict(nl = spacy.load('nl_core_news_sm'), en = spacy.load("en_core_web_sm"))

# working frame for the emotion analysis; 'old text' keeps the untouched tweet text
text = tweets.loc[:, ['text', 'langdetect', 'date', 'id', 'username']]
text.loc[:,'old text'] = text['text']

In [28]:
# tokenize and lowercase
tknzr = TweetTokenizer(preserve_case=False, reduce_len=False, strip_handles=False)
text.loc[:, "text"] = text["text"].apply(tknzr.tokenize)

# remove punctuation
# punct_to_remove is a string, so "in" is a substring test: a token is dropped
# when it occurs as a contiguous piece of string.punctuation.
punct_to_remove = string.punctuation
text.loc[:, "text"] = text["text"].apply(
    lambda tokens: list(filter(lambda tok: tok not in punct_to_remove, tokens)))

In [29]:
# remove stopwords
stop_dict = {'en':stopwords.words('english'), 'nl': stopwords.words('dutch')}
for lang in langs:
    lang_rows = text.loc[:,'langdetect'] == lang
    # A set gives constant-time membership; the list was scanned for every token.
    stop_set = set(stop_dict[lang])
    text.loc[lang_rows, "text"] = text.loc[lang_rows, 'text'].apply(lambda txt: [x for x in txt if x not in stop_set])

In [30]:
# translating the emoticons to their textual equivalent
# English emotion word -> Dutch word
vert = {
    'happy': 'blij',
    'wink': 'knipoog',
    'sad': 'verdrietig',
    'cheeky': 'ondeugend',
    'crying': 'huilen',
    'annoyed': 'geirriteerd',
}
# emoticon -> English emotion word
emoticons_en = {
    ':)': 'happy', ';)': 'wink', ':(': 'sad', ":p": 'cheeky', ";-)": 'wink',
    ":-)": 'happy', ":D": 'happy', "(:": 'happy', "]:": 'sad', ":')": 'crying',
    ':-/': 'annoyed', ':-p': 'cheeky', ':-(': 'sad', '):': 'sad', ":'(": 'crying',
}

# emoticon -> Dutch word, derived from the English mapping
emoticons_nl = {symbol: vert[label] for symbol, label in emoticons_en.items()}
emoticons = {'nl':emoticons_nl, 'en': emoticons_en}

def trans_emoticon(text, emoticon_dict):
    '''Changes the emoticons in a token list into their word.

    text: list of tokens; it is modified in place
    emoticon_dict: mapping from emoticon to replacement word
    returns: the same list object. When text cannot be processed as a list of
             tokens (e.g. it is None or NaN), it is printed and returned unchanged.
    '''
    try:
        for i, word in enumerate(text):
            if word in emoticon_dict:
                text[i] = emoticon_dict[word]
    except TypeError:
        # not iterable / not item-assignable / unhashable token: report the value
        # and carry on, without hiding unrelated errors or interrupts
        print(text)
    return text
            
# replace emoticons by the word of the tweet's own language
for lang in langs:
    rows = text['langdetect'] == lang
    text.loc[rows, "text"] = text.loc[rows, "text"].apply(lambda txt: trans_emoticon(txt, emoticons[lang]))

In [31]:
# lemmatization --> may take 60 mins
def lemmatize(txt, lemmatizer):
    '''
    txt: a string
    lemmatizer: a callable that turns the string into a sequence of tokens with
                a lemma_ attribute (here: a loaded spaCy pipeline)
    returns: list with the lemma_ of every token
    '''
    lemmas = []
    for token in lemmatizer(txt):
        lemmas.append(token.lemma_)
    return lemmas

# join the tokens back into a string and lemmatize it with the model of the tweet's language
for lang in langs:
    rows = text['langdetect'] == lang
    text.loc[rows, "text"] = text.loc[rows, "text"].apply(lambda txt: lemmatize(' '.join(txt), lemm_dict[lang]))

# Applying the emotion lexicon
derived emotions:
- dominant emotion = 
    - valence approach -> what is the most frequent emotion positive or negative
    - cognitive appraisal -> what is the most frequent high or low controllability and certainty
- conflicting emotion 
    - valence approach -> what is the least frequent emotion positive or negative
    - cognitive appraisal -> what is the least frequent high or low controllability and certainty
- mixed emotions (C --> Conflicting, D --> Dominant)
    - $$5\cdot(C+1)^p - (D+1)^{1/C}$$, where $p < 1$ (here $p = 0.5$)
    
| emotion| Valence| Controllability  | Certainty|
| - |-| -|-|
| Joy (Happiness) | positive | High | High |
| Fear (fear) | Negative | Low | Low |
| Anticipation (hope) | Positive | Low | Low |
| Anger (anger) | Negative | High | High |

## Checkpoint in between again

In [32]:
# Checkpoint: store the tokenised and lemmatised tweets so the slow steps above can be skipped.
text.to_pickle(pickles+'pre-processed.p')

In [33]:
# Reload the pre-processed tweets from the checkpoint.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you created yourself.
text = pd.read_pickle(pickles+'pre-processed.p')

In [34]:
# Load the NRC emotion lexicon and build, per language and emotion, the list of
# words flagged (== 1) for that emotion.
# NOTE(review): the markdown table in this notebook lists Anticipation, while the
# code extracts Surprise -- confirm which emotion is intended.
emo_lex = pd.read_csv('NRC/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations - corrected.csv', encoding = 'iso8859_15')

# Dutch word lists (column 'dutch new')
fear_nl = emo_lex.loc[emo_lex['Fear'] == 1, 'dutch new'].tolist()
anger_nl = emo_lex.loc[emo_lex['Anger'] == 1, 'dutch new'].tolist()
Surprise_nl = emo_lex.loc[emo_lex['Surprise'] == 1, 'dutch new'].tolist()
joy_nl = emo_lex.loc[emo_lex['Joy'] == 1, 'dutch new'].tolist()

# English word lists (column 'English (en)')
fear_en = emo_lex.loc[emo_lex['Fear'] == 1, 'English (en)'].tolist()
anger_en = emo_lex.loc[emo_lex['Anger'] == 1, 'English (en)'].tolist()
Surprise_en = emo_lex.loc[emo_lex['Surprise'] == 1, 'English (en)'].tolist()
joy_en = emo_lex.loc[emo_lex['Joy'] == 1, 'English (en)'].tolist()

emo_lex_dict = {
    'nl': {'Fear': fear_nl, 'Anger': anger_nl, 'Surprise': Surprise_nl, 'Joy': joy_nl},
    'en': {'Fear': fear_en, 'Anger': anger_en, 'Surprise': Surprise_en, 'Joy': joy_en},
}

In [35]:
# Number of non-missing English entries flagged as Surprise. Only the last
# expression of a cell is displayed, so the two discarded duplicates were dropped.
emo_lex.loc[emo_lex['Surprise'] == 1, 'English (en)'].count()

532

In [36]:
def find_emo(txt, emo_lookup):
    '''
    counts how many words of the text occur in the emotion word list
    txt: a list of words
    emo_lookup: a list with the words belonging to one emotion
    returns: an int, the number of words in txt found in emo_lookup
             (a word that occurs several times is counted each time)
    raises: TypeError when emo_lookup is not a list
    '''
    if not isinstance(emo_lookup, list):
        raise TypeError('"emo_lookup" should be type: list')
    return sum(1 for word in txt if word in emo_lookup)

langs = ['nl', 'en']

# one count column per emotion, filled per language with that language's word list
for lang in langs:
    print(lang)
    rows = text['langdetect'] == lang
    for emo in emo_lex_dict[lang].keys():
        print(emo)
        text.loc[rows, emo] = text.loc[rows, 'text'].apply(lambda txt: find_emo(txt, emo_lex_dict[lang][emo]))

nl
Fear
Anger
Surprise
Joy
en
Fear
Anger
Surprise
Joy


In [37]:
# columns kept for the final emotion file
text = text.loc[:, [
    'text', 'langdetect', 'date', 'username',
    'Fear', 'Anger', 'Joy', 'Surprise',
]]

In [38]:
# Write the emotion counts per tweet to the data folder (row labels are written as the first column).
text.to_csv(data+'Final emotion.csv')