# Python-Skript zum Digitalreport 2018
Das folgende Skript kann zum Sammeln von Daten von Facebook-Seiten (Pages) genutzt werden.
Weitere Infos: https://digitalreport.at

In [None]:
import pandas as pd
import facepy
from facepy import GraphAPI
from urllib.parse import urlparse

In [None]:
# Paste a Facebook Graph API access token here.
# One can be generated at https://developers.facebook.com/tools/explorer/
access_token = 'TOKEN_HIER_EINFÜGEN'

# Usernames (taken from the page URLs) of the pages whose data is collected.
polpages = [
    'christian.kern.spoe',
    'matthias.strolz',
    'hcstrache',
    'sebastiankurz.at',
    'peterpilz',
    'diegruenen',
    'NeosDasNeueOesterreich',
    'Sozialdemokratie',
    'Volkspartei',
    'listepilz.at',
    'fpoe',
    'wernerkogler',
]

# Graph API client shared by the collection functions below.
graph = GraphAPI(access_token)

In [None]:
# Collect the posts of one page together with their comment, share and reaction counts
def get_posts_as_df(fbpage, since_date='2000-01-01', until_date='2018-06-01'):
    """Download the posts of one Facebook page into a DataFrame.

    Uses the module-level ``graph`` client: resolves ``fbpage`` to its id,
    then pages through ``v3.0/<id>/posts`` 100 posts at a time.

    Args:
        fbpage: Page username or id accepted by ``graph.get``.
        since_date: Value sent as the ``since`` query parameter.
        until_date: Value sent as the ``until`` query parameter.

    Returns:
        A DataFrame with one row per post, indexed by ``created_time``
        (parsed with ``pd.to_datetime``). The nested ``comments`` and
        reaction objects are replaced by their ``total_count``, ``from`` by
        the author's name and ``shares`` by its ``count``. ``hostname``
        holds the host of ``link`` when a link is present, and ``post`` is
        the constant 1 so posts can be counted by summing. An empty
        DataFrame is returned when the page yields no posts.
    """
    reaction_types = ('like', 'love', 'haha', 'wow', 'sad', 'angry')
    fields = [
        'message', 'message_tags', 'link', 'created_time', 'type', 'name', 'id',
        'comments.filter(stream).summary(true).limit(0)',
        'shares', 'from', 'source', 'status_type', 'updated_time',
    ]
    # One aliased summary-only sub-request per reaction type.
    fields.extend(
        'reactions.type({}).limit(0).summary(1).as({})'.format(reaction.upper(), reaction)
        for reaction in reaction_types
    )

    fbpageid = graph.get(fbpage)['id']
    query = 'v3.0/{}/posts?fields={}&since={}&until={}&limit=100'.format(
        fbpageid, ','.join(fields), since_date, until_date)
    pages = graph.get(query, page=True)

    posts = []
    for page in pages:
        for post in page['data']:
            # Flatten the nested summary objects down to plain counts.
            post['comments'] = post['comments']['summary']['total_count']
            for reaction in reaction_types:
                post[reaction] = post[reaction]['summary']['total_count']
            post['from'] = post['from']['name']
            # 'shares' and 'link' are not present on every post; posts
            # without them simply get no value in that column.
            if 'shares' in post:
                post['shares'] = post['shares']['count']
            if 'link' in post:
                try:
                    post['hostname'] = urlparse(post['link']).hostname
                except ValueError:
                    # urlparse rejects some malformed URLs; keep the post
                    # but leave its hostname unset.
                    pass
            posts.append(post)

    print(fbpage + ': ' + str(len(posts)))
    if not posts:
        # Without rows there is no 'created_time' column to index on.
        return pd.DataFrame()

    df = pd.DataFrame(posts)
    df['created_time'] = pd.to_datetime(df['created_time'])
    df.set_index('created_time', inplace=True)
    df['post'] = 1
    return df

In [None]:
# Collect the posts of every configured page into one DataFrame.
# pd.concat is used instead of repeated DataFrame.append calls: append was
# removed in pandas 2.0 and copied the growing frame on every iteration.
posts = pd.concat([get_posts_as_df(seite) for seite in polpages])

# The comment-collection and Gephi export cells read the combined frame under
# the name `allposts`, which no other visible cell assigns. Bind it to the same
# object so columns added to `posts` later are visible through both names.
allposts = posts

In [None]:
# Hostnames aufräumen
import re
# Leading host labels that carry no information about the linked site.
_STRIPPED_PREFIXES = ('www.', 'ww1.', 'mobil.', 'm.', 'cms.')

# Sites whose subdomains are all reported under the parent domain.
_COLLAPSED_DOMAINS = (
    'orf.at', 'tt.com', 'spoe.at', 'gruene.at', 'neos.eu',
    'sebastian-kurz.at', 'oe24.at',
)

# Social networks, link shorteners and search/media hosts that are blanked
# out instead of being reported as a linked website.
_DROPPED_DOMAINS = (
    'facebook.com', 'instagram.com', 'bit.ly', 'kurz-link.at', 't.co',
    'yt2fb.com', 'youtube.com', 'youtu.be', 'yumpu.com', 'giphy.com',
    'gph.is', 'goo.gl', 'twibbon.com', 'buff.ly', 'tinyurl.com',
    'google.at', 'lsh.re', 'google.de', 'snip.ly', 'google.com',
)


def _matches_domain(hostname, domain):
    """Return True if hostname is `domain` itself or one of its subdomains."""
    return hostname == domain or hostname.endswith('.' + domain)


def hostname_cleanup(hostname):
    """Normalise a link hostname for grouping.

    Args:
        hostname: Hostname string, or a missing value (None/NaN).

    Returns:
        Missing values unchanged; '' for hosts in the dropped list; the
        parent domain for hosts in the collapsed list; otherwise the
        hostname with the known prefixes (www., m., ...) removed.

    Domains are compared on whole labels, so 'about.co' is not mistaken
    for 't.co' and 'meindorf.at' is not folded into 'orf.at'.
    """
    if not pd.notnull(hostname):
        return hostname
    # Prefixes are removed one after another in the listed order, so
    # 'www.m.example.at' ends up as 'example.at'.
    for prefix in _STRIPPED_PREFIXES:
        if hostname.startswith(prefix):
            hostname = hostname[len(prefix):]
    for domain in _DROPPED_DOMAINS:
        if _matches_domain(hostname, domain):
            return ''
    for domain in _COLLAPSED_DOMAINS:
        if _matches_domain(hostname, domain):
            return domain
    return hostname
# Store the cleaned-up version of each post's link hostname in 'hostname_clean'
posts['hostname_clean'] = posts['hostname'].apply(hostname_cleanup)

In [None]:
# Show reaction, comment, share and post totals per page.
# GroupBy.sum replaces aggregate(<builtin sum>), which newer pandas versions
# deprecate; numeric_only=True keeps text columns out of the totals.
posts.groupby('from').sum(numeric_only=True)

In [None]:
# Show how many posts of each page link to each cleaned hostname.
# GroupBy.sum replaces aggregate(<builtin sum>), which newer pandas versions
# deprecate.
posts.groupby(['from', 'hostname_clean'])['post'].sum()

In [None]:
# Save the collected posts as an .xlsx workbook for Excel
posts.to_excel('digitalreport.xlsx')

In [None]:
# Wordcloud erstellen
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from stop_words import get_stop_words
import random
from colormath.color_objects import CMYKColor, HSLColor
from colormath.color_conversions import convert_color

from wordcloud import WordCloud, STOPWORDS

def wordy(page):
    """Build a word cloud and a word-frequency file for one page's posts.

    Reads the module-level ``posts`` DataFrame, joins all non-empty messages
    whose ``from`` value equals ``page`` and feeds them to ``WordCloud``.

    Args:
        page: Author name as stored in the ``from`` column of ``posts``.

    Side effects:
        Writes ``freq<page>.txt`` (one "word count" line per word, most
        frequent first, CRLF line endings), shows the recoloured cloud via
        matplotlib and writes ``wordcloud_<page>.png``.
    """
    # Join every message of this page into one text blob; join avoids the
    # quadratic cost of repeated string concatenation.
    messages = posts.loc[posts['from'] == page, 'message'].dropna()
    text = ''.join(' ' + message for message in messages)

    def grey_color_func(word, font_size, position, orientation, random_state=None,
                        **kwargs):
        """Pick one of five intensity steps of a fixed cyan/magenta mix."""
        abstufung = random.choice([0.2, 0.4, 0.6, 0.8, 1])
        c = 90 * abstufung
        m = 100 * abstufung
        cmyk = CMYKColor(c, m, 0, 0)
        hsl = convert_color(cmyk, HSLColor)
        # NOTE(review): presumably colormath works with CMYK components and
        # HSL lightness on a 0-1 scale; if so, `hsl_l + 100` always yields a
        # lightness of at least 100% - TODO confirm the intended scaling.
        return "hsl({}, 100%, {}%)".format(int(hsl.hsl_h), int(hsl.hsl_l + 100))

    # Copy before adding the extra words so the list returned by
    # get_stop_words is not modified.
    stopwords = list(get_stop_words('de'))
    stopwords.extend([
        "created_time", "NaN", "RT", "https", "co", "http", "goo", "gl", "ly",
        "bit", "be", "www", "artikel", "fpoe", "at", "gibt", "geht", "wurde",
        "to", "the", "and", "of", "daher", "Mehr", "beim", "link", "mehr",
        "Video", "Unsere", "heute", "Infos", "is", "by", "for", "it",
    ])

    wc = WordCloud(background_color="white", max_words=1000, stopwords=stopwords,
                   width=1200, height=584, collocations=False, scale=1,
                   min_font_size=4, max_font_size=200)
    wc.generate(text)

    # Write the word frequencies to a text file for use with other tools.
    # newline='' keeps the explicit '\r\n' from turning into '\r\r\n' on
    # platforms that translate '\n' on write.
    with open('freq' + page + '.txt', 'w', encoding='UTF-8', newline='') as f:
        frequencies = sorted(
            ((value, key) for (key, value) in wc.process_text(text).items()),
            reverse=True)
        for freq, word in frequencies:
            f.write(str(word) + ' ' + str(freq * 4) + '\r\n')

    # Show the recoloured cloud.
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3), interpolation='bilinear')

    # Store the cloud as an image file.
    wc.to_file("wordcloud_" + page + ".png")

In [None]:
# Author names to build a word cloud for, one wordy() call each.
pages = [
    "LISTE PILZ",
    "Peter Pilz",
    "Die Grünen",
    "NEOS",
    "Matthias Strolz",
    "SPÖ",
    "Christian Kern",
    "Volkspartei",
    "Sebastian Kurz",
    "FPÖ",
    "HC Strache",
]
for p in pages:
    wordy(p)

In [None]:
# Collect the comments of one post
def get_comments(postid, post_author):
    """Download all comments of one post into a DataFrame.

    Uses the module-level ``graph`` client and pages through
    ``<postid>/comments`` 100 comments at a time with ``filter=stream``.

    Args:
        postid: Id of the post whose comments are fetched.
        post_author: Value stored in the ``post_author`` column of each row.

    Returns:
        A DataFrame with one row per comment, indexed by ``created_time``
        (parsed with ``pd.to_datetime``). ``post_id`` and ``post_author``
        identify the parent post, ``comment`` is the constant 1, and for
        comments with an attachment ``attachment`` holds its url and
        ``attachment_type`` its type. An empty DataFrame is returned when
        the post has no comments.
    """
    pages = graph.get(
        postid + '/comments?fields=attachment,message,message_tags,created_time&limit=100&filter=stream',
        page=True)
    comments = []
    for page in pages:
        for comment in page['data']:
            comment['post_id'] = postid
            comment['post_author'] = post_author
            attachment = comment.get('attachment')
            if attachment is not None:
                # Flatten the attachment object to its type and url; .get
                # keeps the comment when one of the two keys is absent.
                comment['attachment_type'] = attachment.get('type')
                comment['attachment'] = attachment.get('url')
            comments.append(comment)

    if not comments:
        # Without rows there is no 'created_time' column to index on.
        return pd.DataFrame()

    df = pd.DataFrame(comments)
    df['created_time'] = pd.to_datetime(df['created_time'])
    df.set_index('created_time', inplace=True)
    df['comment'] = 1
    return df

In [None]:
# Collect the comments of every post in `allposts` (the combined posts frame,
# which must exist before this cell runs).
comment_frames = []
for index, post in allposts.iterrows():
    try:
        post_comments = get_comments(post['id'], post['from'])
    except Exception as exc:
        # Best effort: report the failing post and carry on with the rest
        # instead of silently dropping it.
        print('{}: {}'.format(post['id'], exc))
        continue
    if not post_comments.empty:
        comment_frames.append(post_comments)

# A single concat replaces repeated DataFrame.append calls, which pandas 2.0
# removed; the guard covers the case where no post has any comment.
all_comments = pd.concat(comment_frames) if comment_frames else pd.DataFrame()
all_comments

In [None]:
# GDF export for Gephi: one directed edge page -> linked website per post.
# `allposts` is the combined posts frame and must exist before this cell runs.
# The file is written as UTF-8 explicitly so page names with umlauts do not
# depend on the platform's default encoding.
with open('page-websites.gdf', 'w', encoding='utf-8') as f:
    f.write('nodedef>name VARCHAR,label VARCHAR\n')
    f.write('edgedef>node1 VARCHAR,node2 VARCHAR,directed BOOLEAN\n')
    for i, row in allposts.iterrows():
        # Posts without a usable link have no string hostname; skip them.
        if not isinstance(row.get('hostname'), str):
            continue
        f.write(row['from'] + ',' + row['hostname'] + ',TRUE\n')

In [None]:
# GDF export for Gephi: one directed edge page -> shared Facebook page/user
# for every post in the first half of 2018 that links to www.facebook.com.
# `allposts` is the combined posts frame and must exist before this cell runs.
with open('page-shares.gdf', 'w', encoding='utf-8') as f:
    usernames = {}  # cache: URL username -> display name from the Graph API
    f.write('nodedef>name VARCHAR,label VARCHAR\n')
    f.write('edgedef>node1 VARCHAR,node2 VARCHAR,directed BOOLEAN\n')
    # Sort first: slicing by date labels needs a monotonic DatetimeIndex,
    # which the concatenated per-page frames do not provide.
    for i, row in allposts.sort_index()["2018-1-1":"2018-6-1"].iterrows():
        if row['hostname'] != 'www.facebook.com':
            continue
        try:
            # The first path segment of the link is the linked username.
            username = urlparse(row['link']).path.split('/')[1]
            if username not in usernames:
                usernames[username] = graph.get(username)['name']
            f.write(row['from'] + ',' + usernames[username] + ',TRUE\n')
        except Exception as exc:
            # Best effort: links that cannot be resolved to a name are
            # reported and skipped.
            print(row['hostname'], exc)

In [None]:
# GDF export for Gephi: one directed edge page -> mentioned name for every
# tag in posts from 2017-01-01 to 2018-06-01.
# `allposts` is the combined posts frame and must exist before this cell runs.
with open('page-mentions.gdf', 'w', encoding='utf-8') as f:
    f.write('nodedef>name VARCHAR,label VARCHAR\n')
    f.write('edgedef>node1 VARCHAR,node2 VARCHAR,directed BOOLEAN\n')
    # Sort first: slicing by date labels needs a monotonic DatetimeIndex,
    # which the concatenated per-page frames do not provide.
    for i, row in allposts.sort_index()["2017-1-1":"2018-6-1"].iterrows():
        # Posts without tags hold a missing value here instead of a list.
        if not isinstance(row.get('message_tags'), list):
            continue
        for mention in row['message_tags']:
            # Skip tag entries that carry no name instead of failing the row.
            if isinstance(mention, dict) and isinstance(mention.get('name'), str):
                f.write(row['from'] + ',' + mention['name'] + ',TRUE\n')