In [1]:
import pandas as pd
import numpy as np
import operator
import re
from collections import Counter
from difflib import SequenceMatcher
from urllib.parse import urlencode
import webbrowser
import gensim
import bz2
import logging
from gensim import corpora, models, similarities
from gensim.corpora import WikiCorpus
from gensim.models.ldamodel import LdaModel
from gensim.models.word2vec import LineSentence, Word2Vec

doc = pd.read_csv('sample_dataset.csv', sep=';')

search_terms = [re.sub(r'(src:\w+)', '', word).strip() for word in doc['zoekopdracht'].unique()]
print(search_terms)

sickness_terms = [
    'ziek', 
    'griep', 
    'verkouden', 
    'verkoudheid',
    'koorts', 
    'hoofdpijn',
]

doc.head()

['ziek']


Unnamed: 0,zoekopdracht,datum,url,sentiment,type,discussielengte,views,auteur,volgers,invloed,GPS breedtegraad,GPS lengtegraad,bericht tekst,type bron,titel
0,ziek src:twitter,2017-09-20 10:11,https://twitter.com/HubertDeMeulder/status/910...,-,post,,14671.0,HubertDeMeulder,14671,4.8,,,#SabineHagedoren: “Op het werk wist niemand da...,twitter,
1,ziek src:twitter,2017-09-20 10:11,https://twitter.com/michielvdbroeck/status/910...,+,comment,4.0,,michielvdbroeck,1492,1.7,,,@heloisesell Haha wtf ziek nice,twitter,
2,ziek src:twitter,2017-09-20 10:10,https://twitter.com/petervalk1/status/91041589...,-,comment,2.0,1088.0,petervalk1,1088,1.8,,,"RT @AcrisiuS322: Van der Laan is ziek gemeld, ...",twitter,
3,ziek src:twitter,2017-09-20 10:09,https://twitter.com/Korneel_Evers/status/91041...,+,comment,644.0,,Korneel_Evers,4411,21.4,,,@_jazzybelle @zoekpostadres @real_Raffie @olaf...,twitter,
4,ziek src:twitter,2017-09-20 10:07,https://twitter.com/sariehimpe/status/91041518...,-,post,,332.0,sariehimpe,332,0.4,,,Iedereen gaat al een week de tijd gehad hebben...,twitter,


# Cleaning data
Here we do several things:
* remove retweets
* convert text to lowercase

In the context of our research a retweet does not add any relevant data. It actually skews the results since a specific phrasing get's repeated more often than it naturally would.

In [2]:
doc['bericht tekst'] = doc['bericht tekst'].fillna('')

In [3]:
doc['bericht tekst'] = doc['bericht tekst'].str.lower()

In [4]:
doc  = doc[~doc['bericht tekst'].str.contains('rt')]
doc  = doc[~doc['bericht tekst'].str.contains('http')]
doc  = doc[~doc['auteur'].str.contains('grieptweets')]
doc  = doc[~doc['auteur'].str.contains('kleenex_helpt')]
# doc.head()

In [5]:
doc = doc.drop_duplicates()
len(doc)

6530

# Defenition of helper function


In [6]:
re_clean = re.compile(r'(https?://\S+|@\S+)')
re_words = re.compile(r'(\w+-?\w*)')

def clean_text(text: str):
    words = []
    if text:
        text = re_clean.sub(' ', text)
        words = re_words.findall(text)
    return words

In [7]:
all_tweets = pd.read_csv('sample_alltweets.csv', sep=';')
all_tweets.head()

Unnamed: 0,zoekopdracht,datum,url,sentiment,type,discussielengte,views,auteur,volgers,invloed,GPS breedtegraad,GPS lengtegraad,bericht tekst,type bron,titel
0,src:twitter,2017-07-23 14:18,https://twitter.com/ESPNChiCubs/status/8890974...,,post,,102128.0,ESPNChiCubs,102128,2.1,,,Is Willson Contreras becoming the most valuabl...,twitter,
1,src:twitter,2017-01-19 16:58,https://twitter.com/A_Belgian_Lion/status/8221...,-,post,2.0,,A_Belgian_Lion,33,0.9,,,Dit systeem is door en door rot en moet DRINGE...,twitter,
2,src:twitter,2017-01-09 17:47,https://twitter.com/Worshpme/status/8184994120...,+,comment,,,Worshpme,426,0.3,,,@RagerrCOD @Twizzuki ha hahahahahaha get Rekt ...,twitter,
3,src:twitter,2017-04-27 18:04,https://twitter.com/klyne_nel/status/857626390...,+,comment,2.0,,klyne_nel,82,0.4,,,@iamsofiaandres sofiegos😄 woot woot,twitter,
4,src:twitter,2017-07-27 21:26,https://twitter.com/dxefne/status/890654544522...,,comment,,,dxefne,831,0.7,,,@starlgth teen wolf,twitter,


# Cleaning data
Here we do several things:
* convert messages with no content with to the message `''`
* remove retweets
* convert to lowercase

The conversion is needed since the `clean_text()` function expects a string.

In [8]:
all_tweets['bericht tekst'] = all_tweets['bericht tekst'].fillna('')

In [9]:
all_tweets['bericht tekst'] = all_tweets['bericht tekst'].str.lower()

In [10]:
all_tweets = all_tweets[~all_tweets['bericht tekst'].str.contains('rt')]
all_tweets.head()

Unnamed: 0,zoekopdracht,datum,url,sentiment,type,discussielengte,views,auteur,volgers,invloed,GPS breedtegraad,GPS lengtegraad,bericht tekst,type bron,titel
0,src:twitter,2017-07-23 14:18,https://twitter.com/ESPNChiCubs/status/8890974...,,post,,102128.0,ESPNChiCubs,102128,2.1,,,is willson contreras becoming the most valuabl...,twitter,
1,src:twitter,2017-01-19 16:58,https://twitter.com/A_Belgian_Lion/status/8221...,-,post,2.0,,A_Belgian_Lion,33,0.9,,,dit systeem is door en door rot en moet dringe...,twitter,
2,src:twitter,2017-01-09 17:47,https://twitter.com/Worshpme/status/8184994120...,+,comment,,,Worshpme,426,0.3,,,@ragerrcod @twizzuki ha hahahahahaha get rekt ...,twitter,
3,src:twitter,2017-04-27 18:04,https://twitter.com/klyne_nel/status/857626390...,+,comment,2.0,,klyne_nel,82,0.4,,,@iamsofiaandres sofiegos😄 woot woot,twitter,
4,src:twitter,2017-07-27 21:26,https://twitter.com/dxefne/status/890654544522...,,comment,,,dxefne,831,0.7,,,@starlgth teen wolf,twitter,


In [11]:
all_tweets['bericht woorden'] = all_tweets['bericht tekst'].map(clean_text)

In [12]:
counter = Counter()

for words in all_tweets['bericht woorden']:
    counter.update(words)
counter.most_common(300)

[('de', 2732),
 ('in', 2150),
 ('een', 1692),
 ('is', 1667),
 ('ik', 1552),
 ('en', 1531),
 ('van', 1478),
 ('het', 1362),
 ('the', 1161),
 ('je', 1120),
 ('op', 1068),
 ('i', 975),
 ('a', 947),
 ('voor', 923),
 ('to', 916),
 ('dat', 867),
 ('met', 864),
 ('niet', 830),
 ('of', 775),
 ('you', 612),
 ('s', 609),
 ('and', 601),
 ('te', 584),
 ('maar', 541),
 ('die', 510),
 ('zijn', 485),
 ('ook', 485),
 ('via', 482),
 ('for', 475),
 ('me', 473),
 ('er', 445),
 ('2', 440),
 ('on', 433),
 ('bij', 420),
 ('1', 405),
 ('we', 404),
 ('it', 401),
 ('nog', 396),
 ('dan', 393),
 ('wat', 389),
 ('aan', 388),
 ('als', 361),
 ('over', 360),
 ('om', 350),
 ('nu', 341),
 ('t', 332),
 ('heb', 330),
 ('naar', 327),
 ('this', 325),
 ('zo', 321),
 ('wel', 307),
 ('was', 306),
 ('al', 283),
 ('0', 281),
 ('my', 278),
 ('weer', 278),
 ('dit', 277),
 ('kan', 271),
 ('3', 267),
 ('door', 265),
 ('m', 261),
 ('mijn', 261),
 ('ben', 250),
 ('video', 250),
 ('geen', 242),
 ('uit', 240),
 ('with', 236),
 ('that'

In [13]:
common_words = set([word[0] for word in counter.most_common(300)])

In [14]:
blacklisted_words = set(common_words)
blacklisted_words.update(set(search_terms))

In [15]:
doc['bericht woorden'] = doc['bericht tekst'].map(clean_text)
counter = Counter()

for words in doc['bericht woorden']:
    words = set(words)
    filtered_words = words - blacklisted_words
    counter.update(filtered_words)
    
related_words = counter.most_common(25)
related_words

[('word', 224),
 ('thuis', 203),
 ('omdat', 164),
 ('chronisch', 153),
 ('eigen', 149),
 ('zorg', 129),
 ('voel', 126),
 ('erg', 122),
 ('bed', 115),
 ('risico', 108),
 ('beter', 103),
 ('steeds', 103),
 ('werken', 102),
 ('iemand', 102),
 ('zelf', 99),
 ('maakt', 92),
 ('geweest', 88),
 ('hele', 86),
 ('beetje', 86),
 ('geworden', 85),
 ('hebt', 84),
 ('iedereen', 81),
 ('helemaal', 80),
 ('kind', 78),
 ('mn', 77)]

In [16]:
model = Word2Vec.load('word2vec.model')

In [17]:
def scorer(row, words):
    if not 'score' in row:
        score = 0
        words = [word.replace('#', '') for word in row['bericht woorden'] if word in model.wv.vocab]
        if words:
            score = model.wv.n_similarity(sickness_terms, words)
        if row['type'] == 'comment':
            score /= 2
        row['score'] = score
    return row

# words = [word[0] for word in related_words]
doc = doc.apply(scorer, axis=1, words=words)

In [18]:
doc = doc.sort_values('score', ascending=False)

In [19]:
pd.set_option('display.max_colwidth', 250)
pd.options.display.max_rows = 999

In [20]:
doc.filter(items=['bericht tekst', 'auteur', 'type', 'score']).head(250)

Unnamed: 0,bericht tekst,auteur,type,score
3213,tyfus ziek,MilouAmann,post,0.797106
4668,ziek erge hoofdpijn.. iemand tips?,mikexyn,post,0.791987
9679,voor 2e dag ziek thuis. maag/darmen teveel prikkels. heerst er weer iets? #buikgriep #griep #ziek #ziekthuis #zzm #ziekzwakmisselijk #gatver,johnkapjr,post,0.787678
2185,ziek thuis griep en verkouden 😷,sabihaaakdeniz,post,0.762325
13391,"hoofdpijn, buikpijn, moe tekenen van ziek komen??😞",FireflyChar90,post,0.761952
12865,ziek veel stress,cedric_vandun,post,0.759097
4767,"hij is misschien psychisch ziek ""ingebeelde ziekten""",merckxchristin2,post,0.758268
17373,ziek volkje.,ronald_brok,post,0.744404
17507,tering ziek..,sel0reos,post,0.741515
17929,smerig ziek klotevolk,swimmingsigs,post,0.73824
