In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import date, time, timedelta
from datetime import datetime as dt

import requests
from bs4 import BeautifulSoup as bs
from random import randint
from time import sleep
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import glob
from urllib.request import urlopen

%matplotlib inline

# Get article data from Sueddeutsche Zeitung (sz.de)

- Since newsticker website of newspaper Sueddeutsche Zeitung has only a lookback window of 5000 articles (approx. 6 days), data has to be scraped every approx. 5 days in order to build a large enough dataset
- Relevant for analysis: Compare published daily traffic of two main newspapers in Germany
- Hence: Feature engineering necessary for creating daily-based features from scraped articles 

In [2]:
def get_szarticles(pages):
    base_url = 'https://www.sueddeutsche.de/news/page/'
    
    df = pd.DataFrame({
    'links':[],
    'source':[],
    'title': [],
    'heading':[],
    'publishtime':[],
    'labelpaid':[],
    'author': [],
    'labelcategory': [],
    'image': []
    })
    
    for i in range(pages, 0, -1):
        url = base_url + str(i)
        response = requests.get(url)
        page = response.text
        soup = bs(page)
        
        links = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='entrylist__link') != None:
                links.append(i.find(class_='entrylist__link')['href'])
            else:
                links.append('no_link')
                  
        source = []     
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='breadcrumb-list__item') != None:
                source.append(i.find(class_='breadcrumb-list__item').text.strip())
            else:
                source.append('no_source')
        source = [x for x in source if ((x == 'dpa') or (x == 'SZ') or (x == 'SZMAGAZIN') or (x == 'JETZTDE'))]
                
        title = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='entrylist__title') != None:
                title.append(i.find(class_='entrylist__title').text)
            else:
                title.append('no_title')
            
        heading = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='entrylist__overline') != None:
                heading.append(i.find(class_='entrylist__overline').text)
            else:
                heading.append('no_heading')
            
        publishtime = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='entrylist__time') != None:
                publishtime.append(i.find(class_='entrylist__time').text.strip())
            else:
                publishtime.append('no_time')
            
        labelpaid = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find_all(class_='entrylist__detail detailed-information') != None:
                if i.find(class_='entrylist__payicon') != None:
                    labelpaid.append('paid')
                else:
                    labelpaid.append('not_paid')
            else:
                labelpaid.append('not_paid')                
                
                
        author = []
        for i in soup.find_all(class_='entrylist__detail detailed-information'):
            if i.find(class_='entrylist__author') != None:
                author.append(i.find(class_='entrylist__author').text)
            else:
                author.append('no_author')
                
        labelcategory = []
        for i in soup.find_all(class_='entrylist__content'):
            if i.find(class_='entrylist__icon') != None:
                labelcategory.append(i.find(class_='entrylist__icon').text)
            else:
                labelcategory.append('no_label')
                
        image = []
        for i in soup.find_all(class_='entrylist__entry'):
            if i.find(class_='entrylist__imagecontainer detailed-information') != None:
                image.append(i.find('img')['src'])
            else:
                image.append('no_image')
        
        df_temp = pd.DataFrame({
            'links': links,
            'source': source,
            'title': title,
            'heading': heading,
            'publishtime': publishtime,
            'labelpaid':labelpaid,
            'author': author,
            'labelcategory': labelcategory,
            'image': image
        })
        
        df = pd.concat([df, df_temp])
        df.reset_index(drop=True, inplace=True)
    
    return(df)

In [3]:
df = get_szarticles(100)
df

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image
0,https://www.sueddeutsche.de/sport/handball-met...,dpa,Halle verliert trotz starker Vorstellung 30:33...,Handball - Metzingen,07.05.2022 | 19:07,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
1,https://www.sueddeutsche.de/politik/wahlen-nor...,dpa,Nordirland-Wahl: Sinn Fein erstmals stärkste P...,Wahlen,07.05.2022 | 19:02,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
2,https://www.sueddeutsche.de/sport/sc-freiburg-...,SZ,"""Halten Sie dem SC die Treue!""",SC Freiburg,07.05.2022 | 19:00,not_paid,"Von Ron Ulrich, Freiburg",no_label,https://media-cdn.sueddeutsche.de/image/sz.1.5...
3,https://www.sueddeutsche.de/sport/wasserball-k...,dpa,Spandaus Wasserball-Teams mit Frauen-Sieg und ...,Wasserball - Krefeld,07.05.2022 | 18:58,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
4,https://www.sueddeutsche.de/sport/handball-ede...,dpa,Souveräner BSV festigt Platz drei: Niederlage ...,Handball - Edertal,07.05.2022 | 18:53,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
...,...,...,...,...,...,...,...,...,...
4995,https://www.sueddeutsche.de/bayern/fussball-mu...,dpa,Wirbel um Lewandowski: Fürth verabschiedet sich,Fußball - München,00:30,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
4996,https://www.sueddeutsche.de/bayern/parteien-mu...,dpa,Habeck bei Parteitag der bayerischen Grünen zu...,Parteien - München,00:30,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
4997,https://www.sueddeutsche.de/bayern/unfaelle-di...,dpa,Autos stoßen beim Abbiegen zusammen: 83-Jährig...,Unfälle - Dietramszell,00:27,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
4998,https://www.sueddeutsche.de/wirtschaft/bahn-be...,dpa,Bahn will mit Zentralrat beim Thema Antizigani...,Bahn - Berlin,00:25,not_paid,no_author,no_label,no_image


In [4]:
df.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image
0,https://www.sueddeutsche.de/sport/handball-met...,dpa,Halle verliert trotz starker Vorstellung 30:33...,Handball - Metzingen,07.05.2022 | 19:07,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
1,https://www.sueddeutsche.de/politik/wahlen-nor...,dpa,Nordirland-Wahl: Sinn Fein erstmals stärkste P...,Wahlen,07.05.2022 | 19:02,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
2,https://www.sueddeutsche.de/sport/sc-freiburg-...,SZ,"""Halten Sie dem SC die Treue!""",SC Freiburg,07.05.2022 | 19:00,not_paid,"Von Ron Ulrich, Freiburg",no_label,https://media-cdn.sueddeutsche.de/image/sz.1.5...
3,https://www.sueddeutsche.de/sport/wasserball-k...,dpa,Spandaus Wasserball-Teams mit Frauen-Sieg und ...,Wasserball - Krefeld,07.05.2022 | 18:58,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...
4,https://www.sueddeutsche.de/sport/handball-ede...,dpa,Souveräner BSV festigt Platz drei: Niederlage ...,Handball - Edertal,07.05.2022 | 18:53,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...


In [5]:
df_meta_empty = pd.DataFrame({
    'keywords_meta':[],
    'labelpaid_meta':[],
    'opinion_meta': [],
    'articletype_meta':[],
    'loc_meta':[],
    'imagewidth_meta':[],
    'imageheight_meta': [],
    'author_meta': [],
    'readtime_meta': []
    })

def get_szmeta(szarticles_df):
    
    df_meta_empty = pd.DataFrame({
    'keywords_meta':[],
    'labelpaid_meta':[],
    'opinion_meta': [],
    'articletype_meta':[],
    'loc_meta':[],
    'imagewidth_meta':[],
    'imageheight_meta': [],
    'author_meta': [],
    'readtime_meta': []
    })
    
    keywords_meta = []
    labelpaid_meta = []
    opinion_meta = []
    articletype_meta = []
    loc_meta = []
    imagewidth_meta = []
    imageheight_meta = []
    author_meta = []
    readtime_meta = []
    
    for i in range(len(szarticles_df['links'])):
        url = szarticles_df['links'][i]
        response = requests.get(url)
        page = response.text
        soup = bs(page)
    
        try:
            keywords_meta.append(soup.find('meta', {'name': 'keywords'})['content'])
        except:
            keywords_meta.append('no_info')
        
        try:
            labelpaid_meta.append(soup.find('meta', {'property': 'article:content_tier'})['content'])
        except:
            labelpaid_meta.append('no_info')
        
        try:
            opinion_meta.append(soup.find('meta', {'property': 'article:opinion'})['content'])
        except:
            opinion_meta.append('no_info')
        
        try:
            articletype_meta.append(soup.find('meta', {'property': 'og:type'})['content'])
        except:
            articletype_meta.append('no_info')
        
        try: 
            loc_meta.append(soup.find('meta', {'property': 'og:locale'})['content'])
        except:
            loc_meta.append('no_info')
        
        try:
            imagewidth_meta.append(soup.find('meta', {'property': 'og:image:width'})['content'])
        except:
            imagewidth_meta.append('no_info')
        
        try:
            imageheight_meta.append(soup.find('meta', {'property': 'og:image:height'})['content'])
        except:
            imageheight_meta.append('no_info')
        
        try:
            author_meta.append(soup.find('meta', {'name': 'author'})['content'])
        except:
            author_meta.append('no_info')
        
        try:
            readtime_meta.append(soup.find(class_="css-dgxek7").text)
        except:
            readtime_meta.append('no_readtime_shown')

        print(f'Article number {i} of 5000 scraping completed.')
            
    df_meta_temp = pd.DataFrame({
        'keywords_meta': keywords_meta,
        'labelpaid_meta': labelpaid_meta,
        'opinion_meta': opinion_meta,
        'articletype_meta': articletype_meta,
        'loc_meta': loc_meta,
        'imagewidth_meta': imagewidth_meta,
        'imageheight_meta': imageheight_meta,
        'author_meta': author_meta,
        'readtime_meta': readtime_meta
    })
            
    return(df_meta_temp)

In [6]:
df_sz_articles = get_szmeta(df)

Article number 0 of 5000 scraping completed.
Article number 1 of 5000 scraping completed.
Article number 2 of 5000 scraping completed.
Article number 3 of 5000 scraping completed.
Article number 4 of 5000 scraping completed.
Article number 5 of 5000 scraping completed.
Article number 6 of 5000 scraping completed.
Article number 7 of 5000 scraping completed.
Article number 8 of 5000 scraping completed.
Article number 9 of 5000 scraping completed.
Article number 10 of 5000 scraping completed.
Article number 11 of 5000 scraping completed.
Article number 12 of 5000 scraping completed.
Article number 13 of 5000 scraping completed.
Article number 14 of 5000 scraping completed.
Article number 15 of 5000 scraping completed.
Article number 16 of 5000 scraping completed.
Article number 17 of 5000 scraping completed.
Article number 18 of 5000 scraping completed.
Article number 19 of 5000 scraping completed.
Article number 20 of 5000 scraping completed.
Article number 21 of 5000 scraping completed

Article number 177 of 5000 scraping completed.
Article number 178 of 5000 scraping completed.
Article number 179 of 5000 scraping completed.
Article number 180 of 5000 scraping completed.
Article number 181 of 5000 scraping completed.
Article number 182 of 5000 scraping completed.
Article number 183 of 5000 scraping completed.
Article number 184 of 5000 scraping completed.
Article number 185 of 5000 scraping completed.
Article number 186 of 5000 scraping completed.
Article number 187 of 5000 scraping completed.
Article number 188 of 5000 scraping completed.
Article number 189 of 5000 scraping completed.
Article number 190 of 5000 scraping completed.
Article number 191 of 5000 scraping completed.
Article number 192 of 5000 scraping completed.
Article number 193 of 5000 scraping completed.
Article number 194 of 5000 scraping completed.
Article number 195 of 5000 scraping completed.
Article number 196 of 5000 scraping completed.
Article number 197 of 5000 scraping completed.
Article numbe

Article number 352 of 5000 scraping completed.
Article number 353 of 5000 scraping completed.
Article number 354 of 5000 scraping completed.
Article number 355 of 5000 scraping completed.
Article number 356 of 5000 scraping completed.
Article number 357 of 5000 scraping completed.
Article number 358 of 5000 scraping completed.
Article number 359 of 5000 scraping completed.
Article number 360 of 5000 scraping completed.
Article number 361 of 5000 scraping completed.
Article number 362 of 5000 scraping completed.
Article number 363 of 5000 scraping completed.
Article number 364 of 5000 scraping completed.
Article number 365 of 5000 scraping completed.
Article number 366 of 5000 scraping completed.
Article number 367 of 5000 scraping completed.
Article number 368 of 5000 scraping completed.
Article number 369 of 5000 scraping completed.
Article number 370 of 5000 scraping completed.
Article number 371 of 5000 scraping completed.
Article number 372 of 5000 scraping completed.
Article numbe

Article number 528 of 5000 scraping completed.
Article number 529 of 5000 scraping completed.
Article number 530 of 5000 scraping completed.
Article number 531 of 5000 scraping completed.
Article number 532 of 5000 scraping completed.
Article number 533 of 5000 scraping completed.
Article number 534 of 5000 scraping completed.
Article number 535 of 5000 scraping completed.
Article number 536 of 5000 scraping completed.
Article number 537 of 5000 scraping completed.
Article number 538 of 5000 scraping completed.
Article number 539 of 5000 scraping completed.
Article number 540 of 5000 scraping completed.
Article number 541 of 5000 scraping completed.
Article number 542 of 5000 scraping completed.
Article number 543 of 5000 scraping completed.
Article number 544 of 5000 scraping completed.
Article number 545 of 5000 scraping completed.
Article number 546 of 5000 scraping completed.
Article number 547 of 5000 scraping completed.
Article number 548 of 5000 scraping completed.
Article numbe

Article number 704 of 5000 scraping completed.
Article number 705 of 5000 scraping completed.
Article number 706 of 5000 scraping completed.
Article number 707 of 5000 scraping completed.
Article number 708 of 5000 scraping completed.
Article number 709 of 5000 scraping completed.
Article number 710 of 5000 scraping completed.
Article number 711 of 5000 scraping completed.
Article number 712 of 5000 scraping completed.
Article number 713 of 5000 scraping completed.
Article number 714 of 5000 scraping completed.
Article number 715 of 5000 scraping completed.
Article number 716 of 5000 scraping completed.
Article number 717 of 5000 scraping completed.
Article number 718 of 5000 scraping completed.
Article number 719 of 5000 scraping completed.
Article number 720 of 5000 scraping completed.
Article number 721 of 5000 scraping completed.
Article number 722 of 5000 scraping completed.
Article number 723 of 5000 scraping completed.
Article number 724 of 5000 scraping completed.
Article numbe

Article number 879 of 5000 scraping completed.
Article number 880 of 5000 scraping completed.
Article number 881 of 5000 scraping completed.
Article number 882 of 5000 scraping completed.
Article number 883 of 5000 scraping completed.
Article number 884 of 5000 scraping completed.
Article number 885 of 5000 scraping completed.
Article number 886 of 5000 scraping completed.
Article number 887 of 5000 scraping completed.
Article number 888 of 5000 scraping completed.
Article number 889 of 5000 scraping completed.
Article number 890 of 5000 scraping completed.
Article number 891 of 5000 scraping completed.
Article number 892 of 5000 scraping completed.
Article number 893 of 5000 scraping completed.
Article number 894 of 5000 scraping completed.
Article number 895 of 5000 scraping completed.
Article number 896 of 5000 scraping completed.
Article number 897 of 5000 scraping completed.
Article number 898 of 5000 scraping completed.
Article number 899 of 5000 scraping completed.
Article numbe

Article number 1054 of 5000 scraping completed.
Article number 1055 of 5000 scraping completed.
Article number 1056 of 5000 scraping completed.
Article number 1057 of 5000 scraping completed.
Article number 1058 of 5000 scraping completed.
Article number 1059 of 5000 scraping completed.
Article number 1060 of 5000 scraping completed.
Article number 1061 of 5000 scraping completed.
Article number 1062 of 5000 scraping completed.
Article number 1063 of 5000 scraping completed.
Article number 1064 of 5000 scraping completed.
Article number 1065 of 5000 scraping completed.
Article number 1066 of 5000 scraping completed.
Article number 1067 of 5000 scraping completed.
Article number 1068 of 5000 scraping completed.
Article number 1069 of 5000 scraping completed.
Article number 1070 of 5000 scraping completed.
Article number 1071 of 5000 scraping completed.
Article number 1072 of 5000 scraping completed.
Article number 1073 of 5000 scraping completed.
Article number 1074 of 5000 scraping com

Article number 1226 of 5000 scraping completed.
Article number 1227 of 5000 scraping completed.
Article number 1228 of 5000 scraping completed.
Article number 1229 of 5000 scraping completed.
Article number 1230 of 5000 scraping completed.
Article number 1231 of 5000 scraping completed.
Article number 1232 of 5000 scraping completed.
Article number 1233 of 5000 scraping completed.
Article number 1234 of 5000 scraping completed.
Article number 1235 of 5000 scraping completed.
Article number 1236 of 5000 scraping completed.
Article number 1237 of 5000 scraping completed.
Article number 1238 of 5000 scraping completed.
Article number 1239 of 5000 scraping completed.
Article number 1240 of 5000 scraping completed.
Article number 1241 of 5000 scraping completed.
Article number 1242 of 5000 scraping completed.
Article number 1243 of 5000 scraping completed.
Article number 1244 of 5000 scraping completed.
Article number 1245 of 5000 scraping completed.
Article number 1246 of 5000 scraping com

Article number 1398 of 5000 scraping completed.
Article number 1399 of 5000 scraping completed.
Article number 1400 of 5000 scraping completed.
Article number 1401 of 5000 scraping completed.
Article number 1402 of 5000 scraping completed.
Article number 1403 of 5000 scraping completed.
Article number 1404 of 5000 scraping completed.
Article number 1405 of 5000 scraping completed.
Article number 1406 of 5000 scraping completed.
Article number 1407 of 5000 scraping completed.
Article number 1408 of 5000 scraping completed.
Article number 1409 of 5000 scraping completed.
Article number 1410 of 5000 scraping completed.
Article number 1411 of 5000 scraping completed.
Article number 1412 of 5000 scraping completed.
Article number 1413 of 5000 scraping completed.
Article number 1414 of 5000 scraping completed.
Article number 1415 of 5000 scraping completed.
Article number 1416 of 5000 scraping completed.
Article number 1417 of 5000 scraping completed.
Article number 1418 of 5000 scraping com

Article number 1570 of 5000 scraping completed.
Article number 1571 of 5000 scraping completed.
Article number 1572 of 5000 scraping completed.
Article number 1573 of 5000 scraping completed.
Article number 1574 of 5000 scraping completed.
Article number 1575 of 5000 scraping completed.
Article number 1576 of 5000 scraping completed.
Article number 1577 of 5000 scraping completed.
Article number 1578 of 5000 scraping completed.
Article number 1579 of 5000 scraping completed.
Article number 1580 of 5000 scraping completed.
Article number 1581 of 5000 scraping completed.
Article number 1582 of 5000 scraping completed.
Article number 1583 of 5000 scraping completed.
Article number 1584 of 5000 scraping completed.
Article number 1585 of 5000 scraping completed.
Article number 1586 of 5000 scraping completed.
Article number 1587 of 5000 scraping completed.
Article number 1588 of 5000 scraping completed.
Article number 1589 of 5000 scraping completed.
Article number 1590 of 5000 scraping com

Article number 1741 of 5000 scraping completed.
Article number 1742 of 5000 scraping completed.
Article number 1743 of 5000 scraping completed.
Article number 1744 of 5000 scraping completed.
Article number 1745 of 5000 scraping completed.
Article number 1746 of 5000 scraping completed.
Article number 1747 of 5000 scraping completed.
Article number 1748 of 5000 scraping completed.
Article number 1749 of 5000 scraping completed.
Article number 1750 of 5000 scraping completed.
Article number 1751 of 5000 scraping completed.
Article number 1752 of 5000 scraping completed.
Article number 1753 of 5000 scraping completed.
Article number 1754 of 5000 scraping completed.
Article number 1755 of 5000 scraping completed.
Article number 1756 of 5000 scraping completed.
Article number 1757 of 5000 scraping completed.
Article number 1758 of 5000 scraping completed.
Article number 1759 of 5000 scraping completed.
Article number 1760 of 5000 scraping completed.
Article number 1761 of 5000 scraping com

Article number 1912 of 5000 scraping completed.
Article number 1913 of 5000 scraping completed.
Article number 1914 of 5000 scraping completed.
Article number 1915 of 5000 scraping completed.
Article number 1916 of 5000 scraping completed.
Article number 1917 of 5000 scraping completed.
Article number 1918 of 5000 scraping completed.
Article number 1919 of 5000 scraping completed.
Article number 1920 of 5000 scraping completed.
Article number 1921 of 5000 scraping completed.
Article number 1922 of 5000 scraping completed.
Article number 1923 of 5000 scraping completed.
Article number 1924 of 5000 scraping completed.
Article number 1925 of 5000 scraping completed.
Article number 1926 of 5000 scraping completed.
Article number 1927 of 5000 scraping completed.
Article number 1928 of 5000 scraping completed.
Article number 1929 of 5000 scraping completed.
Article number 1930 of 5000 scraping completed.
Article number 1931 of 5000 scraping completed.
Article number 1932 of 5000 scraping com

Article number 2084 of 5000 scraping completed.
Article number 2085 of 5000 scraping completed.
Article number 2086 of 5000 scraping completed.
Article number 2087 of 5000 scraping completed.
Article number 2088 of 5000 scraping completed.
Article number 2089 of 5000 scraping completed.
Article number 2090 of 5000 scraping completed.
Article number 2091 of 5000 scraping completed.
Article number 2092 of 5000 scraping completed.
Article number 2093 of 5000 scraping completed.
Article number 2094 of 5000 scraping completed.
Article number 2095 of 5000 scraping completed.
Article number 2096 of 5000 scraping completed.
Article number 2097 of 5000 scraping completed.
Article number 2098 of 5000 scraping completed.
Article number 2099 of 5000 scraping completed.
Article number 2100 of 5000 scraping completed.
Article number 2101 of 5000 scraping completed.
Article number 2102 of 5000 scraping completed.
Article number 2103 of 5000 scraping completed.
Article number 2104 of 5000 scraping com

Article number 2256 of 5000 scraping completed.
Article number 2257 of 5000 scraping completed.
Article number 2258 of 5000 scraping completed.
Article number 2259 of 5000 scraping completed.
Article number 2260 of 5000 scraping completed.
Article number 2261 of 5000 scraping completed.
Article number 2262 of 5000 scraping completed.
Article number 2263 of 5000 scraping completed.
Article number 2264 of 5000 scraping completed.
Article number 2265 of 5000 scraping completed.
Article number 2266 of 5000 scraping completed.
Article number 2267 of 5000 scraping completed.
Article number 2268 of 5000 scraping completed.
Article number 2269 of 5000 scraping completed.
Article number 2270 of 5000 scraping completed.
Article number 2271 of 5000 scraping completed.
Article number 2272 of 5000 scraping completed.
Article number 2273 of 5000 scraping completed.
Article number 2274 of 5000 scraping completed.
Article number 2275 of 5000 scraping completed.
Article number 2276 of 5000 scraping com

Article number 2428 of 5000 scraping completed.
Article number 2429 of 5000 scraping completed.
Article number 2430 of 5000 scraping completed.
Article number 2431 of 5000 scraping completed.
Article number 2432 of 5000 scraping completed.
Article number 2433 of 5000 scraping completed.
Article number 2434 of 5000 scraping completed.
Article number 2435 of 5000 scraping completed.
Article number 2436 of 5000 scraping completed.
Article number 2437 of 5000 scraping completed.
Article number 2438 of 5000 scraping completed.
Article number 2439 of 5000 scraping completed.
Article number 2440 of 5000 scraping completed.
Article number 2441 of 5000 scraping completed.
Article number 2442 of 5000 scraping completed.
Article number 2443 of 5000 scraping completed.
Article number 2444 of 5000 scraping completed.
Article number 2445 of 5000 scraping completed.
Article number 2446 of 5000 scraping completed.
Article number 2447 of 5000 scraping completed.
Article number 2448 of 5000 scraping com

Article number 2599 of 5000 scraping completed.
Article number 2600 of 5000 scraping completed.
Article number 2601 of 5000 scraping completed.
Article number 2602 of 5000 scraping completed.
Article number 2603 of 5000 scraping completed.
Article number 2604 of 5000 scraping completed.
Article number 2605 of 5000 scraping completed.
Article number 2606 of 5000 scraping completed.
Article number 2607 of 5000 scraping completed.
Article number 2608 of 5000 scraping completed.
Article number 2609 of 5000 scraping completed.
Article number 2610 of 5000 scraping completed.
Article number 2611 of 5000 scraping completed.
Article number 2612 of 5000 scraping completed.
Article number 2613 of 5000 scraping completed.
Article number 2614 of 5000 scraping completed.
Article number 2615 of 5000 scraping completed.
Article number 2616 of 5000 scraping completed.
Article number 2617 of 5000 scraping completed.
Article number 2618 of 5000 scraping completed.
Article number 2619 of 5000 scraping com

Article number 2771 of 5000 scraping completed.
Article number 2772 of 5000 scraping completed.
Article number 2773 of 5000 scraping completed.
Article number 2774 of 5000 scraping completed.
Article number 2775 of 5000 scraping completed.
Article number 2776 of 5000 scraping completed.
Article number 2777 of 5000 scraping completed.
Article number 2778 of 5000 scraping completed.
Article number 2779 of 5000 scraping completed.
Article number 2780 of 5000 scraping completed.
Article number 2781 of 5000 scraping completed.
Article number 2782 of 5000 scraping completed.
Article number 2783 of 5000 scraping completed.
Article number 2784 of 5000 scraping completed.
Article number 2785 of 5000 scraping completed.
Article number 2786 of 5000 scraping completed.
Article number 2787 of 5000 scraping completed.
Article number 2788 of 5000 scraping completed.
Article number 2789 of 5000 scraping completed.
Article number 2790 of 5000 scraping completed.
Article number 2791 of 5000 scraping com

Article number 2942 of 5000 scraping completed.
Article number 2943 of 5000 scraping completed.
Article number 2944 of 5000 scraping completed.
Article number 2945 of 5000 scraping completed.
Article number 2946 of 5000 scraping completed.
Article number 2947 of 5000 scraping completed.
Article number 2948 of 5000 scraping completed.
Article number 2949 of 5000 scraping completed.
Article number 2950 of 5000 scraping completed.
Article number 2951 of 5000 scraping completed.
Article number 2952 of 5000 scraping completed.
Article number 2953 of 5000 scraping completed.
Article number 2954 of 5000 scraping completed.
Article number 2955 of 5000 scraping completed.
Article number 2956 of 5000 scraping completed.
Article number 2957 of 5000 scraping completed.
Article number 2958 of 5000 scraping completed.
Article number 2959 of 5000 scraping completed.
Article number 2960 of 5000 scraping completed.
Article number 2961 of 5000 scraping completed.
Article number 2962 of 5000 scraping com

Article number 3113 of 5000 scraping completed.
Article number 3114 of 5000 scraping completed.
Article number 3115 of 5000 scraping completed.
Article number 3116 of 5000 scraping completed.
Article number 3117 of 5000 scraping completed.
Article number 3118 of 5000 scraping completed.
Article number 3119 of 5000 scraping completed.
Article number 3120 of 5000 scraping completed.
Article number 3121 of 5000 scraping completed.
Article number 3122 of 5000 scraping completed.
Article number 3123 of 5000 scraping completed.
Article number 3124 of 5000 scraping completed.
Article number 3125 of 5000 scraping completed.
Article number 3126 of 5000 scraping completed.
Article number 3127 of 5000 scraping completed.
Article number 3128 of 5000 scraping completed.
Article number 3129 of 5000 scraping completed.
Article number 3130 of 5000 scraping completed.
Article number 3131 of 5000 scraping completed.
Article number 3132 of 5000 scraping completed.
Article number 3133 of 5000 scraping com

Article number 3285 of 5000 scraping completed.
Article number 3286 of 5000 scraping completed.
Article number 3287 of 5000 scraping completed.
Article number 3288 of 5000 scraping completed.
Article number 3289 of 5000 scraping completed.
Article number 3290 of 5000 scraping completed.
Article number 3291 of 5000 scraping completed.
Article number 3292 of 5000 scraping completed.
Article number 3293 of 5000 scraping completed.
Article number 3294 of 5000 scraping completed.
Article number 3295 of 5000 scraping completed.
Article number 3296 of 5000 scraping completed.
Article number 3297 of 5000 scraping completed.
Article number 3298 of 5000 scraping completed.
Article number 3299 of 5000 scraping completed.
Article number 3300 of 5000 scraping completed.
Article number 3301 of 5000 scraping completed.
Article number 3302 of 5000 scraping completed.
Article number 3303 of 5000 scraping completed.
Article number 3304 of 5000 scraping completed.
Article number 3305 of 5000 scraping com

Article number 3457 of 5000 scraping completed.
Article number 3458 of 5000 scraping completed.
Article number 3459 of 5000 scraping completed.
Article number 3460 of 5000 scraping completed.
Article number 3461 of 5000 scraping completed.
Article number 3462 of 5000 scraping completed.
Article number 3463 of 5000 scraping completed.
Article number 3464 of 5000 scraping completed.
Article number 3465 of 5000 scraping completed.
Article number 3466 of 5000 scraping completed.
Article number 3467 of 5000 scraping completed.
Article number 3468 of 5000 scraping completed.
Article number 3469 of 5000 scraping completed.
Article number 3470 of 5000 scraping completed.
Article number 3471 of 5000 scraping completed.
Article number 3472 of 5000 scraping completed.
Article number 3473 of 5000 scraping completed.
Article number 3474 of 5000 scraping completed.
Article number 3475 of 5000 scraping completed.
Article number 3476 of 5000 scraping completed.
Article number 3477 of 5000 scraping com

Article number 3628 of 5000 scraping completed.
Article number 3629 of 5000 scraping completed.
Article number 3630 of 5000 scraping completed.
Article number 3631 of 5000 scraping completed.
Article number 3632 of 5000 scraping completed.
Article number 3633 of 5000 scraping completed.
Article number 3634 of 5000 scraping completed.
Article number 3635 of 5000 scraping completed.
Article number 3636 of 5000 scraping completed.
Article number 3637 of 5000 scraping completed.
Article number 3638 of 5000 scraping completed.
Article number 3639 of 5000 scraping completed.
Article number 3640 of 5000 scraping completed.
Article number 3641 of 5000 scraping completed.
Article number 3642 of 5000 scraping completed.
Article number 3643 of 5000 scraping completed.
Article number 3644 of 5000 scraping completed.
Article number 3645 of 5000 scraping completed.
Article number 3646 of 5000 scraping completed.
Article number 3647 of 5000 scraping completed.
Article number 3648 of 5000 scraping com

Article number 3799 of 5000 scraping completed.
Article number 3800 of 5000 scraping completed.
Article number 3801 of 5000 scraping completed.
Article number 3802 of 5000 scraping completed.
Article number 3803 of 5000 scraping completed.
Article number 3804 of 5000 scraping completed.
Article number 3805 of 5000 scraping completed.
Article number 3806 of 5000 scraping completed.
Article number 3807 of 5000 scraping completed.
Article number 3808 of 5000 scraping completed.
Article number 3809 of 5000 scraping completed.
Article number 3810 of 5000 scraping completed.
Article number 3811 of 5000 scraping completed.
Article number 3812 of 5000 scraping completed.
Article number 3813 of 5000 scraping completed.
Article number 3814 of 5000 scraping completed.
Article number 3815 of 5000 scraping completed.
Article number 3816 of 5000 scraping completed.
Article number 3817 of 5000 scraping completed.
Article number 3818 of 5000 scraping completed.
Article number 3819 of 5000 scraping com

Article number 3971 of 5000 scraping completed.
Article number 3972 of 5000 scraping completed.
Article number 3973 of 5000 scraping completed.
Article number 3974 of 5000 scraping completed.
Article number 3975 of 5000 scraping completed.
Article number 3976 of 5000 scraping completed.
Article number 3977 of 5000 scraping completed.
Article number 3978 of 5000 scraping completed.
Article number 3979 of 5000 scraping completed.
Article number 3980 of 5000 scraping completed.
Article number 3981 of 5000 scraping completed.
Article number 3982 of 5000 scraping completed.
Article number 3983 of 5000 scraping completed.
Article number 3984 of 5000 scraping completed.
Article number 3985 of 5000 scraping completed.
Article number 3986 of 5000 scraping completed.
Article number 3987 of 5000 scraping completed.
Article number 3988 of 5000 scraping completed.
Article number 3989 of 5000 scraping completed.
Article number 3990 of 5000 scraping completed.
Article number 3991 of 5000 scraping com

Article number 4143 of 5000 scraping completed.
Article number 4144 of 5000 scraping completed.
Article number 4145 of 5000 scraping completed.
Article number 4146 of 5000 scraping completed.
Article number 4147 of 5000 scraping completed.
Article number 4148 of 5000 scraping completed.
Article number 4149 of 5000 scraping completed.
Article number 4150 of 5000 scraping completed.
Article number 4151 of 5000 scraping completed.
Article number 4152 of 5000 scraping completed.
Article number 4153 of 5000 scraping completed.
Article number 4154 of 5000 scraping completed.
Article number 4155 of 5000 scraping completed.
Article number 4156 of 5000 scraping completed.
Article number 4157 of 5000 scraping completed.
Article number 4158 of 5000 scraping completed.
Article number 4159 of 5000 scraping completed.
Article number 4160 of 5000 scraping completed.
Article number 4161 of 5000 scraping completed.
Article number 4162 of 5000 scraping completed.
Article number 4163 of 5000 scraping com

Article number 4314 of 5000 scraping completed.
Article number 4315 of 5000 scraping completed.
Article number 4316 of 5000 scraping completed.
Article number 4317 of 5000 scraping completed.
Article number 4318 of 5000 scraping completed.
Article number 4319 of 5000 scraping completed.
Article number 4320 of 5000 scraping completed.
Article number 4321 of 5000 scraping completed.
Article number 4322 of 5000 scraping completed.
Article number 4323 of 5000 scraping completed.
Article number 4324 of 5000 scraping completed.
Article number 4325 of 5000 scraping completed.
Article number 4326 of 5000 scraping completed.
Article number 4327 of 5000 scraping completed.
Article number 4328 of 5000 scraping completed.
Article number 4329 of 5000 scraping completed.
Article number 4330 of 5000 scraping completed.
Article number 4331 of 5000 scraping completed.
Article number 4332 of 5000 scraping completed.
Article number 4333 of 5000 scraping completed.
Article number 4334 of 5000 scraping com

Article number 4485 of 5000 scraping completed.
Article number 4486 of 5000 scraping completed.
Article number 4487 of 5000 scraping completed.
Article number 4488 of 5000 scraping completed.
Article number 4489 of 5000 scraping completed.
Article number 4490 of 5000 scraping completed.
Article number 4491 of 5000 scraping completed.
Article number 4492 of 5000 scraping completed.
Article number 4493 of 5000 scraping completed.
Article number 4494 of 5000 scraping completed.
Article number 4495 of 5000 scraping completed.
Article number 4496 of 5000 scraping completed.
Article number 4497 of 5000 scraping completed.
Article number 4498 of 5000 scraping completed.
Article number 4499 of 5000 scraping completed.
Article number 4500 of 5000 scraping completed.
Article number 4501 of 5000 scraping completed.
Article number 4502 of 5000 scraping completed.
Article number 4503 of 5000 scraping completed.
Article number 4504 of 5000 scraping completed.
Article number 4505 of 5000 scraping com

Article number 4656 of 5000 scraping completed.
Article number 4657 of 5000 scraping completed.
Article number 4658 of 5000 scraping completed.
Article number 4659 of 5000 scraping completed.
Article number 4660 of 5000 scraping completed.
Article number 4661 of 5000 scraping completed.
Article number 4662 of 5000 scraping completed.
Article number 4663 of 5000 scraping completed.
Article number 4664 of 5000 scraping completed.
Article number 4665 of 5000 scraping completed.
Article number 4666 of 5000 scraping completed.
Article number 4667 of 5000 scraping completed.
Article number 4668 of 5000 scraping completed.
Article number 4669 of 5000 scraping completed.
Article number 4670 of 5000 scraping completed.
Article number 4671 of 5000 scraping completed.
Article number 4672 of 5000 scraping completed.
Article number 4673 of 5000 scraping completed.
Article number 4674 of 5000 scraping completed.
Article number 4675 of 5000 scraping completed.
Article number 4676 of 5000 scraping com

Article number 4827 of 5000 scraping completed.
Article number 4828 of 5000 scraping completed.
Article number 4829 of 5000 scraping completed.
Article number 4830 of 5000 scraping completed.
Article number 4831 of 5000 scraping completed.
Article number 4832 of 5000 scraping completed.
Article number 4833 of 5000 scraping completed.
Article number 4834 of 5000 scraping completed.
Article number 4835 of 5000 scraping completed.
Article number 4836 of 5000 scraping completed.
Article number 4837 of 5000 scraping completed.
Article number 4838 of 5000 scraping completed.
Article number 4839 of 5000 scraping completed.
Article number 4840 of 5000 scraping completed.
Article number 4841 of 5000 scraping completed.
Article number 4842 of 5000 scraping completed.
Article number 4843 of 5000 scraping completed.
Article number 4844 of 5000 scraping completed.
Article number 4845 of 5000 scraping completed.
Article number 4846 of 5000 scraping completed.
Article number 4847 of 5000 scraping com

Article number 4998 of 5000 scraping completed.
Article number 4999 of 5000 scraping completed.


In [7]:
df_sz_articles

Unnamed: 0,keywords_meta,labelpaid_meta,opinion_meta,articletype_meta,loc_meta,imagewidth_meta,imageheight_meta,author_meta,readtime_meta
0,"Bundesliga,Deutschland,Frauen,Halle,Handball,R...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
1,"Politicker,Wahlen,Parteien,Europäische Union,S...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 2 min
2,"Bundesliga,1. FC Union Berlin,Champions League...",metered,false,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 3 min
3,"Krefeld,Berlin,Deutschland,Duisburg,Frauen,Nor...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4,"Bundesliga,Niedersachsen,Deutschland,Frauen,Ha...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
...,...,...,...,...,...,...,...,...,...
4995,"München,Augsburg,Bayern,Bundesliga,Deutschland...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4996,"München,Bayern,Deutschland,Die Grünen,Parteien...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4997,"Dietramszell,Landkreis Bad Tölz-Wolfratshausen...",free,false,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4998,"Berlin,Baden-Württemberg,Bahn,Deutschland,Mind...",free,false,article,de_DE,1200,630,Süddeutsche Zeitung,no_readtime_shown


In [8]:
df_meta = pd.concat([df, df_sz_articles], axis=1)
df_meta.reset_index(drop=True, inplace=True)
df_meta.tail()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image,keywords_meta,labelpaid_meta,opinion_meta,articletype_meta,loc_meta,imagewidth_meta,imageheight_meta,author_meta,readtime_meta
4995,https://www.sueddeutsche.de/bayern/fussball-mu...,dpa,Wirbel um Lewandowski: Fürth verabschiedet sich,Fußball - München,00:30,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"München,Augsburg,Bayern,Bundesliga,Deutschland...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4996,https://www.sueddeutsche.de/bayern/parteien-mu...,dpa,Habeck bei Parteitag der bayerischen Grünen zu...,Parteien - München,00:30,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"München,Bayern,Deutschland,Die Grünen,Parteien...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4997,https://www.sueddeutsche.de/bayern/unfaelle-di...,dpa,Autos stoßen beim Abbiegen zusammen: 83-Jährig...,Unfälle - Dietramszell,00:27,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Dietramszell,Landkreis Bad Tölz-Wolfratshausen...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4998,https://www.sueddeutsche.de/wirtschaft/bahn-be...,dpa,Bahn will mit Zentralrat beim Thema Antizigani...,Bahn - Berlin,00:25,not_paid,no_author,no_label,no_image,"Berlin,Baden-Württemberg,Bahn,Deutschland,Mind...",free,False,article,de_DE,1200,630,Süddeutsche Zeitung,no_readtime_shown
4999,https://www.sueddeutsche.de/politik/regierung-...,dpa,Ukrainischer Außenminister dankt für Hilfe vom...,Regierung - Lübeck,00:18,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Lübeck,Deutschland,International,Konferenzen,K...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 1 min


In [9]:
df_meta[4989:4995]

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image,keywords_meta,labelpaid_meta,opinion_meta,articletype_meta,loc_meta,imagewidth_meta,imageheight_meta,author_meta,readtime_meta
4989,https://www.sueddeutsche.de/gesundheit/gesundh...,dpa,Lauterbach: Delta-Variante kann wiederkommen,Gesundheit,01:15,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Gesundheit,Krankheiten,Coronavirus,Coronavirus...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4990,https://www.sueddeutsche.de/politik/parteien-n...,dpa,Grüne wollen weibliche Doppelspitze wählen,Parteien - Neukieritzsch,00:50,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Deutschland,Die Grünen,Leipzig,Parteien,Politi...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4991,https://www.sueddeutsche.de/politik/parteien-a...,dpa,Sachsens Linke diskutiert über aktuelle Lage d...,Parteien - Annaberg-Buchholz,00:48,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Deutschland,Die Linke,Parteien,Politicker,Sach...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown
4992,https://www.sueddeutsche.de/leben/freizeit-ros...,dpa,Deutsche Meisterschaften im Strand-Frisbee in ...,Freizeit - Rostock,00:48,not_paid,no_author,no_label,no_image,"Rostock,Breitensport,Deutschland,Freizeit,Meck...",free,False,article,de_DE,1200,630,Süddeutsche Zeitung,no_readtime_shown
4993,https://www.sueddeutsche.de/politik/abgeordnet...,dpa,Grünen-Fraktion berät über Haushalt und Selbst...,Abgeordnetenhaus - Nauen,00:44,not_paid,no_author,no_label,no_image,"Abgeordnetenhaus Berlin,Berlin,Brandenburg,Deu...",free,False,article,de_DE,1200,630,Süddeutsche Zeitung,Lesezeit: 1 min
4994,https://www.sueddeutsche.de/sport/fussball-wol...,dpa,Wolfsburg trifft auf den deutschen Meister Bay...,Fußball - Wolfsburg,00:39,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Wolfsburg,Bayern,Bundesliga,Deutschland,Fußbal...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown


In [10]:
df_meta['scrape_datetime'] = dt(2022,5,14,8,0)
df_meta.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image,keywords_meta,labelpaid_meta,opinion_meta,articletype_meta,loc_meta,imagewidth_meta,imageheight_meta,author_meta,readtime_meta,scrape_datetime
0,https://www.sueddeutsche.de/sport/handball-met...,dpa,Halle verliert trotz starker Vorstellung 30:33...,Handball - Metzingen,07.05.2022 | 19:07,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Bundesliga,Deutschland,Frauen,Halle,Handball,R...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-05-14 08:00:00
1,https://www.sueddeutsche.de/politik/wahlen-nor...,dpa,Nordirland-Wahl: Sinn Fein erstmals stärkste P...,Wahlen,07.05.2022 | 19:02,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Politicker,Wahlen,Parteien,Europäische Union,S...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 2 min,2022-05-14 08:00:00
2,https://www.sueddeutsche.de/sport/sc-freiburg-...,SZ,"""Halten Sie dem SC die Treue!""",SC Freiburg,07.05.2022 | 19:00,not_paid,"Von Ron Ulrich, Freiburg",no_label,https://media-cdn.sueddeutsche.de/image/sz.1.5...,"Bundesliga,1. FC Union Berlin,Champions League...",metered,False,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 3 min,2022-05-14 08:00:00
3,https://www.sueddeutsche.de/sport/wasserball-k...,dpa,Spandaus Wasserball-Teams mit Frauen-Sieg und ...,Wasserball - Krefeld,07.05.2022 | 18:58,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Krefeld,Berlin,Deutschland,Duisburg,Frauen,Nor...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-05-14 08:00:00
4,https://www.sueddeutsche.de/sport/handball-ede...,dpa,Souveräner BSV festigt Platz drei: Niederlage ...,Handball - Edertal,07.05.2022 | 18:53,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Bundesliga,Niedersachsen,Deutschland,Frauen,Ha...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-05-14 08:00:00


In [11]:
# Create data checkpoint for scraped meta-information on each article
df_meta.to_csv('2022-05-14-(08:00)_articles_sz.csv')

# Get article-data from Frankfurter Allgemeine Zeitung (faz.net)

- Since newsticker website of newspaper F.A.Z. provides a lookback window of unlimited days, data is scraped for all articles published within last 4 months (01/01/2022 - 06/05/2022)
- Relevant for analysis: Compare published daily traffic of two main newspapers in Germany
- Hence: Feature engineering necessary for creating daily-based features from scraped articles 

In [12]:
def get_fazarticles(to, fr):
     
    links = []
    source = []
    title = []
    heading = []
    publishtime = []
    labelpaid = []
    author = []

    chromedriver = "/media/fabian/VM_space/Metis/02_Regression/chrome_driver/chromedriver" # path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)

    faz_newsticker_url = 'https://www.faz.net/faz-live-p1'
    driver.get(faz_newsticker_url)

    time.sleep(1)
    driver.switch_to.frame('sp_message_iframe_629848')
    consent_picker = driver.find_element_by_xpath('//*[@id="notice"]/div[4]/div[1]/button[2]')
    consent_picker.click()

    for i in range(int((pd.to_datetime(to) - pd.to_datetime(fr)).days)+1):
        d = pd.to_datetime(to) - timedelta(days=i)
        date_scrape1 = str(pd.to_datetime(d).month)
        date_scrape2 = str(pd.to_datetime(d).day)
        date_scrape3 = str(pd.to_datetime(d).year)


        for i in range(1, 15):
            faz_newsticker_url = 'https://www.faz.net/faz-live-p' + str(i)
            driver.get(faz_newsticker_url)

            date_picker = driver.find_element_by_xpath('//*[@id="from"]')
            date_picker.send_keys(Keys.ARROW_LEFT)
            date_picker.send_keys(Keys.ARROW_LEFT)
            if len(date_scrape1) == 1:
                date_picker.send_keys('0'+date_scrape1)
            else:
                date_picker.send_keys(date_scrape1)
            if len(date_scrape2) == 1:
                date_picker.send_keys('0'+date_scrape2)
            else:
                date_picker.send_keys(date_scrape2)
            date_picker.send_keys(date_scrape3)
            
            date_picker = driver.find_element_by_xpath('//*[@id="till"]')
            date_picker.send_keys(Keys.ARROW_LEFT)
            date_picker.send_keys(Keys.ARROW_LEFT)
            
            if len(date_scrape1) == 1:
                date_picker.send_keys('0'+date_scrape1)
            else:
                date_picker.send_keys(date_scrape1)
            if len(date_scrape2) == 1:
                date_picker.send_keys('0'+date_scrape2)
            else:
                date_picker.send_keys(date_scrape2)
            date_picker.send_keys(date_scrape3)

            apply_button = driver.find_element_by_xpath('//*[@id="contentHeaderFooter"]/div[1]/div/div/form/div[2]/div[3]/button')
            apply_button.click()
            time.sleep(1)

            soup = bs(driver.page_source)

            for i in soup.find_all(class_='ticker-news-item'):
                if i.find('a')['href'] != None:
                    links.append(i.find('a')['href'])
                else:
                    links.append('no_link')

            for i in soup.find_all(class_='ticker-news-item'):
                if i.find(class_='ticker-news-type').find('img')['alt'] != None:
                    source.append(i.find(class_='ticker-news-type').find('img')['alt'])
                else:
                    source.append('no_info')

            for i in soup.find_all(class_='ticker-news-text'):
                if i.find('a').text != None:
                    title.append(i.find('a').text.strip())
                else:
                    title.append('no_info')

            for i in soup.find_all(class_='ticker-news-text'):
                try:
                    heading.append(i.find(class_='ticker-news-super').text.strip().replace(' :', ''))
                except:
                    heading.append('no_info')

            for i in soup.find_all(class_='ticker-news-item'):
                if i.find('time').text != None:
                    publishtime.append(i.find('time').text)
                else:
                    publishtime.append('no_info')

            for i in soup.find_all(class_='ticker-news-item'):
                if i.find(class_='ticker-news-type').find('img')['alt'] == 'FAZ+':
                    labelpaid.append(True)
                else:
                    labelpaid.append(False)

            for i in soup.find_all(class_='ticker-news-item'):
                try:
                    author.append(i.find(class_='ticker-news-author').text)
                except:
                    author.append('no_info')

    df_faz = pd.DataFrame({
        'links': links,
        'source': source,
        'title': title,
        'heading': heading,
        'publishtime': publishtime,
        'labelpaid':labelpaid,
        'author': author
    })

    df_faz.drop_duplicates(inplace=True)
    df_faz.reset_index(drop=True, inplace=True)
    
    driver.quit()

    return(df_faz)

In [13]:
df_faz = get_fazarticles('2022-05-13', '2022-05-07')

  driver = webdriver.Chrome(chromedriver)
  consent_picker = driver.find_element_by_xpath('//*[@id="notice"]/div[4]/div[1]/button[2]')
  date_picker = driver.find_element_by_xpath('//*[@id="from"]')
  date_picker = driver.find_element_by_xpath('//*[@id="till"]')
  apply_button = driver.find_element_by_xpath('//*[@id="contentHeaderFooter"]/div[1]/div/div/form/div[2]/div[3]/button')


In [14]:
df_faz

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author
0,https://www.faz.net/agenturmeldungen/dpa/gruen...,Agenturmeldung,Grüne: Vernetzung rechter Parteien näher unter...,no_info,14.05.2022 01:26 Uhr,False,no_info
1,https://www.faz.net/agenturmeldungen/dpa/thyss...,Agenturmeldung,Thyssenkrupp Marine Systems zeigt Interesse an...,no_info,14.05.2022 01:16 Uhr,False,no_info
2,https://www.faz.net/agenturmeldungen/dpa/meste...,Agenturmeldung,"Mester, Casselly und Ullmann im «Let's Dance»-...",no_info,14.05.2022 01:02 Uhr,False,no_info
3,https://www.faz.net/aktuell/sport/tennis-maste...,FAZ+,"Zverevs Wohl, Nadals Wehe",Tennis-Masters in Rom,13.05.2022 23:29 Uhr,True,no_info
4,https://www.faz.net/aktuell/sport/eishockey-wm...,FAZ+,Missglückter Start für die Deutschen,Eishockey-WM,13.05.2022 23:28 Uhr,True,no_info
...,...,...,...,...,...,...,...
4549,https://www.faz.net/agenturmeldungen/dpa/pudel...,Agenturmeldung,Pudelwohl zurück: Astronaut Maurer wieder in D...,no_info,07.05.2022 04:10 Uhr,False,no_info
4550,https://www.faz.net/agenturmeldungen/dpa/prote...,Agenturmeldung,Proteste zum Jahrestag des blutigen Polizeiein...,no_info,07.05.2022 03:57 Uhr,False,no_info
4551,https://www.faz.net/aktuell/politik/ausland/do...,FAZ,Gericht weist Trumps Klage gegen Twitter-Sperr...,Schlappe für Ex-Präsidenten,07.05.2022 03:34 Uhr,False,no_info
4552,https://www.faz.net/agenturmeldungen/dpa/genua...,Agenturmeldung,Genua um Coach Blessin schlägt Juve und hofft ...,no_info,07.05.2022 03:34 Uhr,False,no_info


In [15]:
# Create data checkpoint for scraped meta-information on each article
df_faz.to_csv('2022-05-13_to_2022-05-07_articles_faz_temp.csv')

In [16]:
def get_fazmeta(fazarticles_df):
    
    image_meta = []
    labelpaid_meta = []
    opinion_meta = []
    author_meta = []
    readtime_meta = []
    source_meta = []
    keywords_meta = []
    
    for i in range(len(fazarticles_df['links'])):
        url = fazarticles_df['links'][i]
        
        try:
            response = requests.get(url)
        except:
            try:
                response = requests.get(url)
            except:
                response = requests.get(url)
        page = response.text
        soup = bs(page)
    
        try:
            image_meta.append(soup.find('meta', {'property': 'og:image'})['content'])
        except:
            image_meta.append('no_info')
        
        try:
            labelpaid_meta.append(str(soup.find_all('script')).find('window.isPaidContent = true') > 1)
        except:
            labelpaid_meta.append('no_info')
        
        try:
            opinion_meta.append(soup.find(class_= 'atc-MetaAuthorText').text.strip())
        except:
            opinion_meta.append('no_info')
        
        try:
            author_meta.append(soup.find(class_= 'atc-MetaAuthor').text.strip())
        except:
            author_meta.append('no_info')
        
        try:
            readtime_meta.append(soup.find(class_= 'atc-ReadTime_Text').text.strip())
        except:
            readtime_meta.append('no_readtime_shown')
        
        try:
            source_meta.append(str(soup.find_all('script'))[str(soup.find_all('script')).find('"source"')+10:str(soup.find_all('script')).find('"source"')+30])
        except:
            source_meta.append('no_info')
            
            
        try:
            keywords_meta.append(soup.find('meta', attrs={'name':"keywords"}).attrs['content'])
        except:
            keywords_meta.append('no_info')
            

        print(f'Article number {i} scraping completed.')
            
    df_meta_temp = pd.DataFrame({
        'image_meta': image_meta,
        'labelpaid_meta': labelpaid_meta,
        'opinion_meta': opinion_meta,
        'author_meta': author_meta,
        'readtime_meta': readtime_meta,
        'source_meta': source_meta,
        'keywords_meta': keywords_meta
    })
            
    return(df_meta_temp)

In [17]:
df_faz_articles = get_fazmeta(df_faz)

Article number 0 scraping completed.
Article number 1 scraping completed.
Article number 2 scraping completed.
Article number 3 scraping completed.
Article number 4 scraping completed.
Article number 5 scraping completed.
Article number 6 scraping completed.
Article number 7 scraping completed.
Article number 8 scraping completed.
Article number 9 scraping completed.
Article number 10 scraping completed.
Article number 11 scraping completed.
Article number 12 scraping completed.
Article number 13 scraping completed.
Article number 14 scraping completed.
Article number 15 scraping completed.
Article number 16 scraping completed.
Article number 17 scraping completed.
Article number 18 scraping completed.
Article number 19 scraping completed.
Article number 20 scraping completed.
Article number 21 scraping completed.
Article number 22 scraping completed.
Article number 23 scraping completed.
Article number 24 scraping completed.
Article number 25 scraping completed.
Article number 26 scra

Article number 213 scraping completed.
Article number 214 scraping completed.
Article number 215 scraping completed.
Article number 216 scraping completed.
Article number 217 scraping completed.
Article number 218 scraping completed.
Article number 219 scraping completed.
Article number 220 scraping completed.
Article number 221 scraping completed.
Article number 222 scraping completed.
Article number 223 scraping completed.
Article number 224 scraping completed.
Article number 225 scraping completed.
Article number 226 scraping completed.
Article number 227 scraping completed.
Article number 228 scraping completed.
Article number 229 scraping completed.
Article number 230 scraping completed.
Article number 231 scraping completed.
Article number 232 scraping completed.
Article number 233 scraping completed.
Article number 234 scraping completed.
Article number 235 scraping completed.
Article number 236 scraping completed.
Article number 237 scraping completed.
Article number 238 scrapi

Article number 424 scraping completed.
Article number 425 scraping completed.
Article number 426 scraping completed.
Article number 427 scraping completed.
Article number 428 scraping completed.
Article number 429 scraping completed.
Article number 430 scraping completed.
Article number 431 scraping completed.
Article number 432 scraping completed.
Article number 433 scraping completed.
Article number 434 scraping completed.
Article number 435 scraping completed.
Article number 436 scraping completed.
Article number 437 scraping completed.
Article number 438 scraping completed.
Article number 439 scraping completed.
Article number 440 scraping completed.
Article number 441 scraping completed.
Article number 442 scraping completed.
Article number 443 scraping completed.
Article number 444 scraping completed.
Article number 445 scraping completed.
Article number 446 scraping completed.
Article number 447 scraping completed.
Article number 448 scraping completed.
Article number 449 scrapi

Article number 635 scraping completed.
Article number 636 scraping completed.
Article number 637 scraping completed.
Article number 638 scraping completed.
Article number 639 scraping completed.
Article number 640 scraping completed.
Article number 641 scraping completed.
Article number 642 scraping completed.
Article number 643 scraping completed.
Article number 644 scraping completed.
Article number 645 scraping completed.
Article number 646 scraping completed.
Article number 647 scraping completed.
Article number 648 scraping completed.
Article number 649 scraping completed.
Article number 650 scraping completed.
Article number 651 scraping completed.
Article number 652 scraping completed.
Article number 653 scraping completed.
Article number 654 scraping completed.
Article number 655 scraping completed.
Article number 656 scraping completed.
Article number 657 scraping completed.
Article number 658 scraping completed.
Article number 659 scraping completed.
Article number 660 scrapi

Article number 846 scraping completed.
Article number 847 scraping completed.
Article number 848 scraping completed.
Article number 849 scraping completed.
Article number 850 scraping completed.
Article number 851 scraping completed.
Article number 852 scraping completed.
Article number 853 scraping completed.
Article number 854 scraping completed.
Article number 855 scraping completed.
Article number 856 scraping completed.
Article number 857 scraping completed.
Article number 858 scraping completed.
Article number 859 scraping completed.
Article number 860 scraping completed.
Article number 861 scraping completed.
Article number 862 scraping completed.
Article number 863 scraping completed.
Article number 864 scraping completed.
Article number 865 scraping completed.
Article number 866 scraping completed.
Article number 867 scraping completed.
Article number 868 scraping completed.
Article number 869 scraping completed.
Article number 870 scraping completed.
Article number 871 scrapi

Article number 1055 scraping completed.
Article number 1056 scraping completed.
Article number 1057 scraping completed.
Article number 1058 scraping completed.
Article number 1059 scraping completed.
Article number 1060 scraping completed.
Article number 1061 scraping completed.
Article number 1062 scraping completed.
Article number 1063 scraping completed.
Article number 1064 scraping completed.
Article number 1065 scraping completed.
Article number 1066 scraping completed.
Article number 1067 scraping completed.
Article number 1068 scraping completed.
Article number 1069 scraping completed.
Article number 1070 scraping completed.
Article number 1071 scraping completed.
Article number 1072 scraping completed.
Article number 1073 scraping completed.
Article number 1074 scraping completed.
Article number 1075 scraping completed.
Article number 1076 scraping completed.
Article number 1077 scraping completed.
Article number 1078 scraping completed.
Article number 1079 scraping completed.


Article number 1260 scraping completed.
Article number 1261 scraping completed.
Article number 1262 scraping completed.
Article number 1263 scraping completed.
Article number 1264 scraping completed.
Article number 1265 scraping completed.
Article number 1266 scraping completed.
Article number 1267 scraping completed.
Article number 1268 scraping completed.
Article number 1269 scraping completed.
Article number 1270 scraping completed.
Article number 1271 scraping completed.
Article number 1272 scraping completed.
Article number 1273 scraping completed.
Article number 1274 scraping completed.
Article number 1275 scraping completed.
Article number 1276 scraping completed.
Article number 1277 scraping completed.
Article number 1278 scraping completed.
Article number 1279 scraping completed.
Article number 1280 scraping completed.
Article number 1281 scraping completed.
Article number 1282 scraping completed.
Article number 1283 scraping completed.
Article number 1284 scraping completed.


Article number 1466 scraping completed.
Article number 1467 scraping completed.
Article number 1468 scraping completed.
Article number 1469 scraping completed.
Article number 1470 scraping completed.
Article number 1471 scraping completed.
Article number 1472 scraping completed.
Article number 1473 scraping completed.
Article number 1474 scraping completed.
Article number 1475 scraping completed.
Article number 1476 scraping completed.
Article number 1477 scraping completed.
Article number 1478 scraping completed.
Article number 1479 scraping completed.
Article number 1480 scraping completed.
Article number 1481 scraping completed.
Article number 1482 scraping completed.
Article number 1483 scraping completed.
Article number 1484 scraping completed.
Article number 1485 scraping completed.
Article number 1486 scraping completed.
Article number 1487 scraping completed.
Article number 1488 scraping completed.
Article number 1489 scraping completed.
Article number 1490 scraping completed.


Article number 1671 scraping completed.
Article number 1672 scraping completed.
Article number 1673 scraping completed.
Article number 1674 scraping completed.
Article number 1675 scraping completed.
Article number 1676 scraping completed.
Article number 1677 scraping completed.
Article number 1678 scraping completed.
Article number 1679 scraping completed.
Article number 1680 scraping completed.
Article number 1681 scraping completed.
Article number 1682 scraping completed.
Article number 1683 scraping completed.
Article number 1684 scraping completed.
Article number 1685 scraping completed.
Article number 1686 scraping completed.
Article number 1687 scraping completed.
Article number 1688 scraping completed.
Article number 1689 scraping completed.
Article number 1690 scraping completed.
Article number 1691 scraping completed.
Article number 1692 scraping completed.
Article number 1693 scraping completed.
Article number 1694 scraping completed.
Article number 1695 scraping completed.


Article number 1876 scraping completed.
Article number 1877 scraping completed.
Article number 1878 scraping completed.
Article number 1879 scraping completed.
Article number 1880 scraping completed.
Article number 1881 scraping completed.
Article number 1882 scraping completed.
Article number 1883 scraping completed.
Article number 1884 scraping completed.
Article number 1885 scraping completed.
Article number 1886 scraping completed.
Article number 1887 scraping completed.
Article number 1888 scraping completed.
Article number 1889 scraping completed.
Article number 1890 scraping completed.
Article number 1891 scraping completed.
Article number 1892 scraping completed.
Article number 1893 scraping completed.
Article number 1894 scraping completed.
Article number 1895 scraping completed.
Article number 1896 scraping completed.
Article number 1897 scraping completed.
Article number 1898 scraping completed.
Article number 1899 scraping completed.
Article number 1900 scraping completed.


Article number 2081 scraping completed.
Article number 2082 scraping completed.
Article number 2083 scraping completed.
Article number 2084 scraping completed.
Article number 2085 scraping completed.
Article number 2086 scraping completed.
Article number 2087 scraping completed.
Article number 2088 scraping completed.
Article number 2089 scraping completed.
Article number 2090 scraping completed.
Article number 2091 scraping completed.
Article number 2092 scraping completed.
Article number 2093 scraping completed.
Article number 2094 scraping completed.
Article number 2095 scraping completed.
Article number 2096 scraping completed.
Article number 2097 scraping completed.
Article number 2098 scraping completed.
Article number 2099 scraping completed.
Article number 2100 scraping completed.
Article number 2101 scraping completed.
Article number 2102 scraping completed.
Article number 2103 scraping completed.
Article number 2104 scraping completed.
Article number 2105 scraping completed.


Article number 2286 scraping completed.
Article number 2287 scraping completed.
Article number 2288 scraping completed.
Article number 2289 scraping completed.
Article number 2290 scraping completed.
Article number 2291 scraping completed.
Article number 2292 scraping completed.
Article number 2293 scraping completed.
Article number 2294 scraping completed.
Article number 2295 scraping completed.
Article number 2296 scraping completed.
Article number 2297 scraping completed.
Article number 2298 scraping completed.
Article number 2299 scraping completed.
Article number 2300 scraping completed.
Article number 2301 scraping completed.
Article number 2302 scraping completed.
Article number 2303 scraping completed.
Article number 2304 scraping completed.
Article number 2305 scraping completed.
Article number 2306 scraping completed.
Article number 2307 scraping completed.
Article number 2308 scraping completed.
Article number 2309 scraping completed.
Article number 2310 scraping completed.


Article number 2492 scraping completed.
Article number 2493 scraping completed.
Article number 2494 scraping completed.
Article number 2495 scraping completed.
Article number 2496 scraping completed.
Article number 2497 scraping completed.
Article number 2498 scraping completed.
Article number 2499 scraping completed.
Article number 2500 scraping completed.
Article number 2501 scraping completed.
Article number 2502 scraping completed.
Article number 2503 scraping completed.
Article number 2504 scraping completed.
Article number 2505 scraping completed.
Article number 2506 scraping completed.
Article number 2507 scraping completed.
Article number 2508 scraping completed.
Article number 2509 scraping completed.
Article number 2510 scraping completed.
Article number 2511 scraping completed.
Article number 2512 scraping completed.
Article number 2513 scraping completed.
Article number 2514 scraping completed.
Article number 2515 scraping completed.
Article number 2516 scraping completed.


Article number 2697 scraping completed.
Article number 2698 scraping completed.
Article number 2699 scraping completed.
Article number 2700 scraping completed.
Article number 2701 scraping completed.
Article number 2702 scraping completed.
Article number 2703 scraping completed.
Article number 2704 scraping completed.
Article number 2705 scraping completed.
Article number 2706 scraping completed.
Article number 2707 scraping completed.
Article number 2708 scraping completed.
Article number 2709 scraping completed.
Article number 2710 scraping completed.
Article number 2711 scraping completed.
Article number 2712 scraping completed.
Article number 2713 scraping completed.
Article number 2714 scraping completed.
Article number 2715 scraping completed.
Article number 2716 scraping completed.
Article number 2717 scraping completed.
Article number 2718 scraping completed.
Article number 2719 scraping completed.
Article number 2720 scraping completed.
Article number 2721 scraping completed.


Article number 2902 scraping completed.
Article number 2903 scraping completed.
Article number 2904 scraping completed.
Article number 2905 scraping completed.
Article number 2906 scraping completed.
Article number 2907 scraping completed.
Article number 2908 scraping completed.
Article number 2909 scraping completed.
Article number 2910 scraping completed.
Article number 2911 scraping completed.
Article number 2912 scraping completed.
Article number 2913 scraping completed.
Article number 2914 scraping completed.
Article number 2915 scraping completed.
Article number 2916 scraping completed.
Article number 2917 scraping completed.
Article number 2918 scraping completed.
Article number 2919 scraping completed.
Article number 2920 scraping completed.
Article number 2921 scraping completed.
Article number 2922 scraping completed.
Article number 2923 scraping completed.
Article number 2924 scraping completed.
Article number 2925 scraping completed.
Article number 2926 scraping completed.


Article number 3107 scraping completed.
Article number 3108 scraping completed.
Article number 3109 scraping completed.
Article number 3110 scraping completed.
Article number 3111 scraping completed.
Article number 3112 scraping completed.
Article number 3113 scraping completed.
Article number 3114 scraping completed.
Article number 3115 scraping completed.
Article number 3116 scraping completed.
Article number 3117 scraping completed.
Article number 3118 scraping completed.
Article number 3119 scraping completed.
Article number 3120 scraping completed.
Article number 3121 scraping completed.
Article number 3122 scraping completed.
Article number 3123 scraping completed.
Article number 3124 scraping completed.
Article number 3125 scraping completed.
Article number 3126 scraping completed.
Article number 3127 scraping completed.
Article number 3128 scraping completed.
Article number 3129 scraping completed.
Article number 3130 scraping completed.
Article number 3131 scraping completed.


Article number 3312 scraping completed.
Article number 3313 scraping completed.
Article number 3314 scraping completed.
Article number 3315 scraping completed.
Article number 3316 scraping completed.
Article number 3317 scraping completed.
Article number 3318 scraping completed.
Article number 3319 scraping completed.
Article number 3320 scraping completed.
Article number 3321 scraping completed.
Article number 3322 scraping completed.
Article number 3323 scraping completed.
Article number 3324 scraping completed.
Article number 3325 scraping completed.
Article number 3326 scraping completed.
Article number 3327 scraping completed.
Article number 3328 scraping completed.
Article number 3329 scraping completed.
Article number 3330 scraping completed.
Article number 3331 scraping completed.
Article number 3332 scraping completed.
Article number 3333 scraping completed.
Article number 3334 scraping completed.
Article number 3335 scraping completed.
Article number 3336 scraping completed.


Article number 3517 scraping completed.
Article number 3518 scraping completed.
Article number 3519 scraping completed.
Article number 3520 scraping completed.
Article number 3521 scraping completed.
Article number 3522 scraping completed.
Article number 3523 scraping completed.
Article number 3524 scraping completed.
Article number 3525 scraping completed.
Article number 3526 scraping completed.
Article number 3527 scraping completed.
Article number 3528 scraping completed.
Article number 3529 scraping completed.
Article number 3530 scraping completed.
Article number 3531 scraping completed.
Article number 3532 scraping completed.
Article number 3533 scraping completed.
Article number 3534 scraping completed.
Article number 3535 scraping completed.
Article number 3536 scraping completed.
Article number 3537 scraping completed.
Article number 3538 scraping completed.
Article number 3539 scraping completed.
Article number 3540 scraping completed.
Article number 3541 scraping completed.


Article number 3722 scraping completed.
Article number 3723 scraping completed.
Article number 3724 scraping completed.
Article number 3725 scraping completed.
Article number 3726 scraping completed.
Article number 3727 scraping completed.
Article number 3728 scraping completed.
Article number 3729 scraping completed.
Article number 3730 scraping completed.
Article number 3731 scraping completed.
Article number 3732 scraping completed.
Article number 3733 scraping completed.
Article number 3734 scraping completed.
Article number 3735 scraping completed.
Article number 3736 scraping completed.
Article number 3737 scraping completed.
Article number 3738 scraping completed.
Article number 3739 scraping completed.
Article number 3740 scraping completed.
Article number 3741 scraping completed.
Article number 3742 scraping completed.
Article number 3743 scraping completed.
Article number 3744 scraping completed.
Article number 3745 scraping completed.
Article number 3746 scraping completed.


Article number 3927 scraping completed.
Article number 3928 scraping completed.
Article number 3929 scraping completed.
Article number 3930 scraping completed.
Article number 3931 scraping completed.
Article number 3932 scraping completed.
Article number 3933 scraping completed.
Article number 3934 scraping completed.
Article number 3935 scraping completed.
Article number 3936 scraping completed.
Article number 3937 scraping completed.
Article number 3938 scraping completed.
Article number 3939 scraping completed.
Article number 3940 scraping completed.
Article number 3941 scraping completed.
Article number 3942 scraping completed.
Article number 3943 scraping completed.
Article number 3944 scraping completed.
Article number 3945 scraping completed.
Article number 3946 scraping completed.
Article number 3947 scraping completed.
Article number 3948 scraping completed.
Article number 3949 scraping completed.
Article number 3950 scraping completed.
Article number 3951 scraping completed.


Article number 4132 scraping completed.
Article number 4133 scraping completed.
Article number 4134 scraping completed.
Article number 4135 scraping completed.
Article number 4136 scraping completed.
Article number 4137 scraping completed.
Article number 4138 scraping completed.
Article number 4139 scraping completed.
Article number 4140 scraping completed.
Article number 4141 scraping completed.
Article number 4142 scraping completed.
Article number 4143 scraping completed.
Article number 4144 scraping completed.
Article number 4145 scraping completed.
Article number 4146 scraping completed.
Article number 4147 scraping completed.
Article number 4148 scraping completed.
Article number 4149 scraping completed.
Article number 4150 scraping completed.
Article number 4151 scraping completed.
Article number 4152 scraping completed.
Article number 4153 scraping completed.
Article number 4154 scraping completed.
Article number 4155 scraping completed.
Article number 4156 scraping completed.


Article number 4337 scraping completed.
Article number 4338 scraping completed.
Article number 4339 scraping completed.
Article number 4340 scraping completed.
Article number 4341 scraping completed.
Article number 4342 scraping completed.
Article number 4343 scraping completed.
Article number 4344 scraping completed.
Article number 4345 scraping completed.
Article number 4346 scraping completed.
Article number 4347 scraping completed.
Article number 4348 scraping completed.
Article number 4349 scraping completed.
Article number 4350 scraping completed.
Article number 4351 scraping completed.
Article number 4352 scraping completed.
Article number 4353 scraping completed.
Article number 4354 scraping completed.
Article number 4355 scraping completed.
Article number 4356 scraping completed.
Article number 4357 scraping completed.
Article number 4358 scraping completed.
Article number 4359 scraping completed.
Article number 4360 scraping completed.
Article number 4361 scraping completed.


Article number 4542 scraping completed.
Article number 4543 scraping completed.
Article number 4544 scraping completed.
Article number 4545 scraping completed.
Article number 4546 scraping completed.
Article number 4547 scraping completed.
Article number 4548 scraping completed.
Article number 4549 scraping completed.
Article number 4550 scraping completed.
Article number 4551 scraping completed.
Article number 4552 scraping completed.
Article number 4553 scraping completed.


In [18]:
df_faz_meta = pd.concat([df_faz, df_faz_articles], axis=1)
df_faz_meta.reset_index(drop=True, inplace=True)
df_faz_meta.tail()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta,keywords_meta
4549,https://www.faz.net/agenturmeldungen/dpa/pudel...,Agenturmeldung,Pudelwohl zurück: Astronaut Maurer wieder in D...,no_info,07.05.2022 04:10 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,2 Min.,"dpa"",""accessInfo"":{""","Matthias Maurer, Astronaut, Erik Hepp, Uwe Are..."
4550,https://www.faz.net/agenturmeldungen/dpa/prote...,Agenturmeldung,Proteste zum Jahrestag des blutigen Polizeiein...,no_info,07.05.2022 03:57 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Polizei, O Globo, Jahrestag, Rio, Jacarezinho,..."
4551,https://www.faz.net/aktuell/politik/ausland/do...,FAZ,Gericht weist Trumps Klage gegen Twitter-Sperr...,Schlappe für Ex-Präsidenten,07.05.2022 03:34 Uhr,False,no_info,https://media0.faz.net/ppmedia/aktuell/5558760...,False,no_info,no_info,1 Min.,"dpa/Reuters"",""access","Donald Trump, Twitter, Facebook, ISIN_US30303M..."
4552,https://www.faz.net/agenturmeldungen/dpa/genua...,Agenturmeldung,Genua um Coach Blessin schlägt Juve und hofft ...,no_info,07.05.2022 03:34 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Alexander Blessin, Domenico Criscito, Franck R..."
4553,https://www.faz.net/agenturmeldungen/dpa/hotel...,Agenturmeldung,Hotelexplosion in Havanna: Zahl der Toten auf ...,no_info,07.05.2022 02:00 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Miguel Díaz-Canel, Madonna, Beyoncé Knowles, T..."


In [19]:
df_faz_meta.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta,keywords_meta
0,https://www.faz.net/agenturmeldungen/dpa/gruen...,Agenturmeldung,Grüne: Vernetzung rechter Parteien näher unter...,no_info,14.05.2022 01:26 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Irene Mihalic, Martina Renner, AfD, Redaktions..."
1,https://www.faz.net/agenturmeldungen/dpa/thyss...,Agenturmeldung,Thyssenkrupp Marine Systems zeigt Interesse an...,no_info,14.05.2022 01:16 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Oliver Burkhard, ThyssenKrupp, ISIN_DE00075000..."
2,https://www.faz.net/agenturmeldungen/dpa/meste...,Agenturmeldung,"Mester, Casselly und Ullmann im «Let's Dance»-...",no_info,14.05.2022 01:02 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Amira Pocher, Mathias Mester, René Casselly, J..."
3,https://www.faz.net/aktuell/sport/tennis-maste...,FAZ+,"Zverevs Wohl, Nadals Wehe",Tennis-Masters in Rom,13.05.2022 23:29 Uhr,True,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,True,no_info,no_info,1 Min.,"F.A.Z"",""accessInfo"":","Alexander Zverev, Rafael Nadal, Christian Gari..."
4,https://www.faz.net/aktuell/sport/eishockey-wm...,FAZ+,Missglückter Start für die Deutschen,Eishockey-WM,13.05.2022 23:28 Uhr,True,no_info,https://media1.faz.net/ppmedia/aktuell/sport/4...,True,no_info,no_info,1 Min.,"F.A.Z"",""accessInfo"":","NHL, Sport1, DEB, Olympia, Eishockey-WM, Olymp..."


In [20]:
df_faz_meta['scrape_datetime'] = dt(2022,5,14,9,0)
df_faz_meta.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta,keywords_meta,scrape_datetime
0,https://www.faz.net/agenturmeldungen/dpa/gruen...,Agenturmeldung,Grüne: Vernetzung rechter Parteien näher unter...,no_info,14.05.2022 01:26 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Irene Mihalic, Martina Renner, AfD, Redaktions...",2022-05-14 09:00:00
1,https://www.faz.net/agenturmeldungen/dpa/thyss...,Agenturmeldung,Thyssenkrupp Marine Systems zeigt Interesse an...,no_info,14.05.2022 01:16 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Oliver Burkhard, ThyssenKrupp, ISIN_DE00075000...",2022-05-14 09:00:00
2,https://www.faz.net/agenturmeldungen/dpa/meste...,Agenturmeldung,"Mester, Casselly und Ullmann im «Let's Dance»-...",no_info,14.05.2022 01:02 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""","Amira Pocher, Mathias Mester, René Casselly, J...",2022-05-14 09:00:00
3,https://www.faz.net/aktuell/sport/tennis-maste...,FAZ+,"Zverevs Wohl, Nadals Wehe",Tennis-Masters in Rom,13.05.2022 23:29 Uhr,True,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,True,no_info,no_info,1 Min.,"F.A.Z"",""accessInfo"":","Alexander Zverev, Rafael Nadal, Christian Gari...",2022-05-14 09:00:00
4,https://www.faz.net/aktuell/sport/eishockey-wm...,FAZ+,Missglückter Start für die Deutschen,Eishockey-WM,13.05.2022 23:28 Uhr,True,no_info,https://media1.faz.net/ppmedia/aktuell/sport/4...,True,no_info,no_info,1 Min.,"F.A.Z"",""accessInfo"":","NHL, Sport1, DEB, Olympia, Eishockey-WM, Olymp...",2022-05-14 09:00:00


In [21]:
# Create data checkpoint for scraped meta-information on each article
df_faz_meta.to_csv('2022-05-13_to_2022-05-07_articles_faz.csv')

# Get daily traffic data (visits) from IVW

- Extract daily traffic data (visits) published at publisher association website for advertising customers
- Use daily visits as target variable

In [31]:
def get_ivw_daily(month_no, date_table):

    clicks_months = dt.today().month - month_no -1
    x = 1
    y = 1
    
    for i in range(len(date_table)):
        y = 1
        for j in range(len(date_table.T)):
            if (pd.isna(date_table.iloc[x-1, y-1])) == False:

                chromedriver = "/media/fabian/VM_space/Metis/02_Regression/chrome_driver/chromedriver" # path to the chromedriver executable
                os.environ["webdriver.chrome.driver"] = chromedriver
                driver = webdriver.Chrome(chromedriver)
                ivw_url = 'https://ausweisung.ivw-online.de/index.php?tagl=1&mz_szm=202202&it=1&setc=1'
                driver.get(ivw_url)
                date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/div/input')
                driver.execute_script("arguments[0].click();", date_picker)
                time.sleep(1)
                
                for k in range(clicks_months):
                    date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[4]/div[2]/div[1]/table/thead/tr[1]/th[1]/span').click()
#                     driver.execute_script("arguments[0].click();", date_picker)
                    time.sleep(1)
                
                for l in range(2):
                    date_picker = driver.find_element_by_xpath(f'//*[@id="ibody"]/div[4]/div[2]/div[1]/table/tbody/tr[{x}]/td[{y}]').click()
                    time.sleep(1)
                
                date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/table/tbody/tr/td[1]/div/input')
                driver.execute_script("arguments[0].click();", date_picker)
                time.sleep(1)
                driver.quit()
                
                print(f'Tag {int(date_table.iloc[x-1, y-1])} download complete.')
                
            y += 1
        x += 1

In [32]:
# Create calendar indices january
date_table_jan = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-31')})
date_table_jan['weeknum'] = date_table_jan['Date'].apply(lambda x: x.strftime("%y%V"))
date_table_jan['weeknum'] = date_table_jan['weeknum'].replace('2252', '2152')
date_table_jan['day'] = date_table_jan['Date'].apply(lambda x: x.strftime("%A"))
date_table_jan['no'] = date_table_jan['Date'].apply(lambda x: x.strftime("%d"))

date_table_jan = date_table_jan[['weeknum', 'day', 'no']]
date_table_jan = pd.pivot_table(date_table_jan, values='no', index=['weeknum'], columns=['day']).reset_index()
date_table_jan = date_table_jan[['weeknum', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
date_table_jan.set_index('weeknum', inplace=True)

# Create calendar indices february
date_table_feb = pd.DataFrame({'Date': pd.date_range('2022-02-01', '2022-02-28')})
date_table_feb['weeknum'] = date_table_feb['Date'].apply(lambda x: x.strftime("%y%V"))
date_table_feb['day'] = date_table_feb['Date'].apply(lambda x: x.strftime("%A"))
date_table_feb['no'] = date_table_feb['Date'].apply(lambda x: x.strftime("%d"))

date_table_feb = date_table_feb[['weeknum', 'day', 'no']]
date_table_feb = pd.pivot_table(date_table_feb, values='no', index=['weeknum'], columns=['day']).reset_index()
date_table_feb = date_table_feb[['weeknum', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
date_table_feb.set_index('weeknum', inplace=True)

# Create calendar indices march
date_table_mar = pd.DataFrame({'Date': pd.date_range('2022-03-01', '2022-03-31')})
date_table_mar['weeknum'] = date_table_mar['Date'].apply(lambda x: x.strftime("%y%V"))
date_table_mar['day'] = date_table_mar['Date'].apply(lambda x: x.strftime("%A"))
date_table_mar['no'] = date_table_mar['Date'].apply(lambda x: x.strftime("%d"))

date_table_mar = date_table_mar[['weeknum', 'day', 'no']]
date_table_mar = pd.pivot_table(date_table_mar, values='no', index=['weeknum'], columns=['day']).reset_index()
date_table_mar = date_table_mar[['weeknum', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
date_table_mar.set_index('weeknum', inplace=True)

# Create calendar indices april
date_table_apr = pd.DataFrame({'Date': pd.date_range('2022-04-01', '2022-04-30')})
date_table_apr['weeknum'] = date_table_apr['Date'].apply(lambda x: x.strftime("%y%V"))
date_table_apr['day'] = date_table_apr['Date'].apply(lambda x: x.strftime("%A"))
date_table_apr['no'] = date_table_apr['Date'].apply(lambda x: x.strftime("%d"))

date_table_apr = date_table_apr[['weeknum', 'day', 'no']]
date_table_apr = pd.pivot_table(date_table_apr, values='no', index=['weeknum'], columns=['day']).reset_index()
date_table_apr = date_table_apr[['weeknum', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
date_table_apr.set_index('weeknum', inplace=True)

date_table_apr

day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2213,,,,,1.0,2.0,3.0
2214,4.0,5.0,6.0,7.0,8.0,9.0,10.0
2215,11.0,12.0,13.0,14.0,15.0,16.0,17.0
2216,18.0,19.0,20.0,21.0,22.0,23.0,24.0
2217,25.0,26.0,27.0,28.0,29.0,30.0,


In [33]:
get_ivw_daily(1, date_table_jan)

  driver = webdriver.Chrome(chromedriver)
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/div/input')
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[4]/div[2]/div[1]/table/thead/tr[1]/th[1]/span').click()
  date_picker = driver.find_element_by_xpath(f'//*[@id="ibody"]/div[4]/div[2]/div[1]/table/tbody/tr[{x}]/td[{y}]').click()
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/table/tbody/tr/td[1]/div/input')


Tag 1 download complete.
Tag 2 download complete.
Tag 3 download complete.
Tag 4 download complete.
Tag 5 download complete.
Tag 6 download complete.
Tag 7 download complete.
Tag 8 download complete.
Tag 9 download complete.
Tag 10 download complete.
Tag 11 download complete.
Tag 12 download complete.
Tag 13 download complete.
Tag 14 download complete.
Tag 15 download complete.
Tag 16 download complete.
Tag 17 download complete.
Tag 18 download complete.
Tag 19 download complete.
Tag 20 download complete.
Tag 21 download complete.
Tag 22 download complete.
Tag 23 download complete.
Tag 24 download complete.
Tag 25 download complete.
Tag 26 download complete.
Tag 27 download complete.
Tag 28 download complete.
Tag 29 download complete.
Tag 30 download complete.
Tag 31 download complete.


In [34]:
get_ivw_daily(2, date_table_feb)

  driver = webdriver.Chrome(chromedriver)
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/div/input')
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[4]/div[2]/div[1]/table/thead/tr[1]/th[1]/span').click()
  date_picker = driver.find_element_by_xpath(f'//*[@id="ibody"]/div[4]/div[2]/div[1]/table/tbody/tr[{x}]/td[{y}]').click()
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/table/tbody/tr/td[1]/div/input')


Tag 1 download complete.
Tag 2 download complete.
Tag 3 download complete.
Tag 4 download complete.
Tag 5 download complete.
Tag 6 download complete.
Tag 7 download complete.
Tag 8 download complete.
Tag 9 download complete.
Tag 10 download complete.
Tag 11 download complete.
Tag 12 download complete.
Tag 13 download complete.
Tag 14 download complete.
Tag 15 download complete.
Tag 16 download complete.
Tag 17 download complete.
Tag 18 download complete.
Tag 19 download complete.
Tag 20 download complete.
Tag 21 download complete.
Tag 22 download complete.
Tag 23 download complete.
Tag 24 download complete.
Tag 25 download complete.
Tag 26 download complete.
Tag 27 download complete.
Tag 28 download complete.


In [35]:
get_ivw_daily(3, date_table_mar)

  driver = webdriver.Chrome(chromedriver)
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/div/input')
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[4]/div[2]/div[1]/table/thead/tr[1]/th[1]/span').click()
  date_picker = driver.find_element_by_xpath(f'//*[@id="ibody"]/div[4]/div[2]/div[1]/table/tbody/tr[{x}]/td[{y}]').click()
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/table/tbody/tr/td[1]/div/input')


Tag 1 download complete.
Tag 2 download complete.
Tag 3 download complete.
Tag 4 download complete.
Tag 5 download complete.
Tag 6 download complete.
Tag 7 download complete.
Tag 8 download complete.
Tag 9 download complete.
Tag 10 download complete.
Tag 11 download complete.
Tag 12 download complete.
Tag 13 download complete.
Tag 14 download complete.
Tag 15 download complete.
Tag 16 download complete.
Tag 17 download complete.
Tag 18 download complete.
Tag 19 download complete.
Tag 20 download complete.
Tag 21 download complete.
Tag 22 download complete.
Tag 23 download complete.
Tag 24 download complete.
Tag 25 download complete.
Tag 26 download complete.
Tag 27 download complete.
Tag 28 download complete.
Tag 29 download complete.
Tag 30 download complete.
Tag 31 download complete.


In [36]:
get_ivw_daily(4, date_table_apr)

  driver = webdriver.Chrome(chromedriver)
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/div/input')
  date_picker = driver.find_element_by_xpath(f'//*[@id="ibody"]/div[4]/div[2]/div[1]/table/tbody/tr[{x}]/td[{y}]').click()
  date_picker = driver.find_element_by_xpath('//*[@id="ibody"]/div[3]/div/div/form/table/tbody/tr/td[1]/div/input')


Tag 1 download complete.
Tag 2 download complete.
Tag 3 download complete.
Tag 4 download complete.
Tag 5 download complete.
Tag 6 download complete.
Tag 7 download complete.
Tag 8 download complete.
Tag 9 download complete.
Tag 10 download complete.
Tag 11 download complete.
Tag 12 download complete.
Tag 13 download complete.
Tag 14 download complete.
Tag 15 download complete.
Tag 16 download complete.
Tag 17 download complete.
Tag 18 download complete.
Tag 19 download complete.
Tag 20 download complete.
Tag 21 download complete.
Tag 22 download complete.
Tag 23 download complete.
Tag 24 download complete.
Tag 25 download complete.
Tag 26 download complete.
Tag 27 download complete.
Tag 28 download complete.
Tag 29 download complete.
Tag 30 download complete.


# Read in Article Data & Traffic Data

### IVW Daily Visits

In [185]:
filenames = glob.glob("Project/download*.csv")
fields = ['Angebote', 'Visits gesamt', 'mobile Visits gesamt']
column_names = ['website', 'visits_ges', 'visits_mob']

ivw_data = pd.DataFrame(columns=column_names)

for filename in filenames:
    df = pd.read_csv(filename,
                     sep=';',
                     encoding='latin-1',
                     skiprows=10,
                     usecols=fields
                    )
    
    df.columns = column_names
    df['date'] = filename[-14:-4]
    df = df.loc[(df['website'] == 'FAZ.NET') | (df['website'] == 'Süddeutsche.de'), :]
    ivw_data = pd.concat([ivw_data, df])

ivw_data['date'] = pd.to_datetime(ivw_data['date'], format="%d-%m-%Y")
ivw_data['visits_ges'] = ivw_data['visits_ges'].str.replace('.', '').astype('int')
ivw_data['visits_mob'] = ivw_data['visits_mob'].str.replace('.', '').astype('int')
ivw_data.sort_values(['date', 'website'], inplace=True)
ivw_data.reset_index(drop=True, inplace=True)
ivw_data.head()

  ivw_data['visits_ges'] = ivw_data['visits_ges'].str.replace('.', '').astype('int')
  ivw_data['visits_mob'] = ivw_data['visits_mob'].str.replace('.', '').astype('int')


Unnamed: 0,website,visits_ges,visits_mob,date
0,FAZ.NET,2241090,1822260,2022-01-01
1,Süddeutsche.de,1903338,1438209,2022-01-01
2,FAZ.NET,2358585,1848733,2022-01-02
3,Süddeutsche.de,1841514,1315536,2022-01-02
4,FAZ.NET,2316463,1651514,2022-01-03


In [204]:
# Create csv file for IVW-data for further EDA
ivw_data.to_csv('ivw_data_ges.csv', encoding='utf8')

### FAZ.NET Articles

In [151]:
filenames = glob.glob("Project/faz_articles/*.csv")
column_names = ['links', 'source', 'title', 'heading', 'publishtime', 'labelpaid', 'author', 'image_meta', 'labelpaid_meta', 'opinion_meta', 'author_meta', 'readtime_meta', 'source_meta', 'scrape_datetime']

articles_faz = pd.DataFrame(columns=column_names)

for filename in filenames:
    df = pd.read_csv(filename)
    
    df.columns = column_names
    articles_faz = pd.concat([articles_faz, df])

articles_faz.sort_values(['publishtime'], inplace=True)
articles_faz.reset_index(drop=True, inplace=True)
articles_faz.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta,scrape_datetime
0,https://www.faz.net/agenturmeldungen/dpa/klein...,Agenturmeldung,Kleinkind fährt mit Dreirad alleine zum Vater,no_info,29.04.2022 10:41 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""",2022-05-01 01:00:00
1,https://www.faz.net/agenturmeldungen/dpa/freib...,Agenturmeldung,Freiburgs Petersen auch für Baden-Duell fraglich,no_info,28.04.2022 14:57 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""",2022-05-01 01:00:00
2,https://www.faz.net/agenturmeldungen/dpa/minis...,Agenturmeldung,Ministerin: Gerechtere Teilhabe für zugewander...,no_info,28.04.2022 14:58 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{""",2022-05-01 01:00:00
3,https://www.faz.net/aktuell/sport/fussball/ach...,FAZ+,So schaffen es acht Bundesligaklubs nach Europa,Eintracht als Sonderfall,28.04.2022 14:58 Uhr,True,Von Tobias Rabe,https://media0.faz.net/ppmedia/aktuell/sport/6...,True,Von,Tobias Rabe,3 Min.,"FAZ.NET"",""accessInfo",2022-05-01 01:00:00
4,https://www.faz.net/aktuell/technik-motor/elek...,FAZ,Wildwuchs im Lotus-Garten,Elektrisches Hyper-SUV,28.04.2022 14:59 Uhr,False,Von Thomas Geiger,https://media1.faz.net/ppmedia/aktuell/3065397...,False,Von,Thomas Geiger,2 Min.,"F.A.Z."",""accessInfo""",2022-05-01 01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30950,https://www.faz.net/aktuell/feuilleton/buecher...,FAZ+,"Eine Hürde, kein Hindernis",Ruth Anne Byrnes&nbsp,„Ungebremst“,24.04.2022 20:59 Uhr,True,Von Jeanette Schäfer,https://www.faz.net/img/faznet_logo_facebook_s...,True,Von,Jeanette Schäfer,2 Min.,"F.A.Z"",""accessInfo"":"
30951,https://www.faz.net/aktuell/rhein-main/neue-st...,FAZ+,Tipps zur Mobilität,Mainz Bike &amp,Ride-Anlagen,03.04.2022 21:33 Uhr,True,no_info,https://media0.faz.net/ppmedia/aktuell/2592705...,True,no_info,no_info,1 Min.,"F.A.Z"",""accessInfo"":"
30952,https://www.faz.net/aktuell/feuilleton/buecher...,FAZ+,War Luther eine durch und durch typographische...,Reformation &amp,Buchdruck,07.04.2022 20:05 Uhr,True,Von Mark Lehmstedt,https://www.faz.net/img/faznet_logo_facebook_s...,True,Von,Mark Lehmstedt,5 Min.,"F.A.Z"",""accessInfo"":"
30953,https://www.faz.net/aktuell/wirtschaft/unterne...,FAZ+,Unter Zugzwang,McKinsey &amp,Co.,07.03.2022 20:22 Uhr,True,Von Tillmann Neuscheler,https://www.faz.net/img/faznet_logo_facebook_s...,True,Ein Kommentar von,Tillmann Neuscheler,1 Min.,"F.A.Z"",""accessInfo"":"


In [166]:
articles_faz_1 = articles_faz[articles_faz.loc[:, 'scrape_datetime'].str.contains('2022')]
articles_faz_1.drop(['scrape_datetime'], axis=1, inplace=True)
articles_faz_1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_faz_1.drop(['scrape_datetime'], axis=1, inplace=True)


Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta
0,https://www.faz.net/agenturmeldungen/dpa/klein...,Agenturmeldung,Kleinkind fährt mit Dreirad alleine zum Vater,no_info,29.04.2022 10:41 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{"""
1,https://www.faz.net/agenturmeldungen/dpa/freib...,Agenturmeldung,Freiburgs Petersen auch für Baden-Duell fraglich,no_info,28.04.2022 14:57 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{"""
2,https://www.faz.net/agenturmeldungen/dpa/minis...,Agenturmeldung,Ministerin: Gerechtere Teilhabe für zugewander...,no_info,28.04.2022 14:58 Uhr,False,no_info,https://www.faz.net/img/faznet_logo_facebook_s...,False,no_info,no_info,1 Min.,"dpa"",""accessInfo"":{"""
3,https://www.faz.net/aktuell/sport/fussball/ach...,FAZ+,So schaffen es acht Bundesligaklubs nach Europa,Eintracht als Sonderfall,28.04.2022 14:58 Uhr,True,Von Tobias Rabe,https://media0.faz.net/ppmedia/aktuell/sport/6...,True,Von,Tobias Rabe,3 Min.,"FAZ.NET"",""accessInfo"
4,https://www.faz.net/aktuell/technik-motor/elek...,FAZ,Wildwuchs im Lotus-Garten,Elektrisches Hyper-SUV,28.04.2022 14:59 Uhr,False,Von Thomas Geiger,https://media1.faz.net/ppmedia/aktuell/3065397...,False,Von,Thomas Geiger,2 Min.,"F.A.Z."",""accessInfo"""


In [167]:
articles_faz_2 = articles_faz[~articles_faz.loc[:, 'scrape_datetime'].str.contains('2022')]
articles_faz_2.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta,scrape_datetime
30931,https://www.faz.net/aktuell/rhein-main/das-sch...,FAZ+,Mein Nachbar joggt,Gegenüber &amp,Nebenan,22.04.2022 20:13 Uhr,True,Von Severin Groebner,https://media1.faz.net/ppmedia/aktuell/rhein-m...,True,Von,Severin Groebner,1 Min.,"F.A.S"",""accessInfo"":"
30932,https://www.faz.net/aktuell/rhein-main/musik-n...,FAZ+,Gegenüber & Nebenan: Kreative Nachbarn,Gegenüber &amp,Nebenan,15.04.2022 20:04 Uhr,True,Von Severin Groebner,https://media0.faz.net/ppmedia/aktuell/rhein-m...,True,Von,Severin Groebner,1 Min.,"F.A.S"",""accessInfo"":"
30933,https://www.faz.net/aktuell/feuilleton/was-koe...,FAZ+,Was können Theater für die Ukraine tun?,Fragen Sie&nbsp,Vasco Boenisch,15.04.2022 20:01 Uhr,True,Von Vasco Boenisch,https://media0.faz.net/ppmedia/aktuell/feuille...,True,Von,Vasco Boenisch,1 Min.,"F.A.S"",""accessInfo"":"
30934,https://www.faz.net/aktuell/rhein-main/nachbar...,FAZ+,Severin Groebner: Gegenüber & Nebenan,Gegenüber &amp,Nebenan,04.03.2022 20:05 Uhr,True,Von Severin Groebner,https://www.faz.net/img/faznet_logo_facebook_s...,True,Von,Severin Groebner,1 Min.,"F.A.S"",""accessInfo"":"
30935,https://www.faz.net/aktuell/wirtschaft/erstaun...,FAZ+,Erstaunliches Comeback,Namen &amp,Nachrichten,28.01.2022 19:57 Uhr,True,Von Ralph Bollmann,https://media0.faz.net/ppmedia/aktuell/wirtsch...,True,Von,Ralph Bollmann,2 Min.,"F.A.S"",""accessInfo"":"


In [168]:
articles_faz_2.columns = ['links',
                          'source',
                          'title',
                          'heading',
                          'to_delete',
                          'publishtime',
                          'labelpaid',
                          'author',
                          'image_meta',
                          'labelpaid_meta',
                          'opinion_meta',
                          'author_meta',
                          'readtime_meta',
                          'source_meta'
                         ]

In [169]:
articles_faz_2.drop(['to_delete'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_faz_2.drop(['to_delete'], axis=1, inplace=True)


In [183]:
articles_faz_ges = pd.concat([articles_faz_1, articles_faz_2])
articles_faz_ges.sort_values(['publishtime'], inplace=True)
articles_faz_ges.drop_duplicates(subset='links', keep='last')
articles_faz_ges.reset_index(drop=True, inplace=True)

In [205]:
articles_faz_ges.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,image_meta,labelpaid_meta,opinion_meta,author_meta,readtime_meta,source_meta
0,https://www.faz.net/aktuell/gesellschaft/mensc...,FAZ,Die Welt heißt das Jahr 2022 willkommen,Jahreswechsel,01.01.2022 02:49 Uhr,False,no_info,https://media0.faz.net/ppmedia/aktuell/1336146...,False,no_info,no_info,3 Min.,"dpa\r\n"",""accessInfo"
1,https://www.faz.net/aktuell/politik/ausland/de...,FAZ,Deutschland für ein Jahr an der Spitze der G 7,Staatengruppe,01.01.2022 07:11 Uhr,False,no_info,https://media0.faz.net/ppmedia/aktuell/4282106...,False,no_info,no_info,2 Min.,"dpa\r\n"",""accessInfo"
2,https://www.faz.net/aktuell/gesellschaft/mensc...,FAZ,Royale Ehren für junge Spendensammler und „Jam...,Großbritannien,01.01.2022 07:53 Uhr,False,no_info,https://media0.faz.net/ppmedia/aktuell/6943806...,False,no_info,no_info,1 Min.,"dpa\r\n"",""accessInfo"
3,https://www.faz.net/aktuell/gesellschaft/gesun...,FAZ,Inzidenz steigt den dritten Tag in Folge an,Pandemie in Deutschland,01.01.2022 08:09 Uhr,False,no_info,https://media1.faz.net/ppmedia/aktuell/1135664...,False,no_info,no_info,3 Min.,"dpa/puz.\r\n"",""acces"
4,https://www.faz.net/aktuell/sport/fussball/bun...,FAZ,So planen die Bundesligaklubs den Transfer-Winter,"Bayern, Dortmund und Co.",01.01.2022 08:22 Uhr,False,no_info,https://media1.faz.net/ppmedia/aktuell/sport/2...,False,no_info,no_info,6 Min.,"dpa"",""accessInfo"":{"""


In [202]:
# Create csv file for F.A.Z. article-data for further EDA
articles_faz_ges.to_csv('articles_faz_ges.csv', encoding='utf8')

### SZ.DE Articles

In [191]:
filenames = glob.glob("Project/sz_articles/*.csv")
column_names = ['links', 'source', 'title', 'heading', 'publishtime', 'labelpaid', 'author', 'labelcategory', 'image', 'keywords_meta', 'labelpaid_meta', 'opinion_meta', 'articletype_meta', 'loc_meta', 'imagewidth_meta', 'imageheight_meta', 'author_meta', 'readtime_meta', 'scrape_datetime']

articles_sz = pd.DataFrame(columns=column_names)

for filename in filenames:
    df = pd.read_csv(filename)
    
    df.columns = column_names
    articles_sz = pd.concat([articles_sz, df])

articles_sz.sort_values(['publishtime'], inplace=True)
articles_sz.reset_index(drop=True, inplace=True)
articles_sz.head()

Unnamed: 0,links,source,title,heading,publishtime,labelpaid,author,labelcategory,image,keywords_meta,labelpaid_meta,opinion_meta,articletype_meta,loc_meta,imagewidth_meta,imageheight_meta,author_meta,readtime_meta,scrape_datetime
0,https://www.sueddeutsche.de/kultur/brauchtum-t...,dpa,Wieder Walpurgisfeiern im Harz nach coronabedi...,Brauchtum - Thale,00:13,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Brauchtum,Coronavirus,Coronavirus,Deutschland,...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-04-30 09:48:00
1,https://www.sueddeutsche.de/sport/fussball-sin...,dpa,Baden-Duell um Europa: TSG Hoffenheim empfängt...,Fußball - Sinsheim,00:14,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Baden-Württemberg,Bundesliga,Deutschland,Freib...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-04-30 09:48:00
2,https://www.sueddeutsche.de/politik/landtag-ki...,dpa,Landtagswahl: Endspurt der Parteien in Schlesw...,Landtag - Kiel,00:16,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Kiel,Deutschland,Landtag,Nordrhein-Westfalen,P...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 2 min,2022-05-07 07:00:00
3,https://www.sueddeutsche.de/leben/leute-lets-d...,dpa,"""Let's Dance""-Aus für Bastian Bielendorfer",Leute,00:24,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Leute,Medien,Fernsehen,RTL,Bastian Bielendorfe...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,Lesezeit: 2 min,2022-04-30 09:48:00
4,https://www.sueddeutsche.de/sport/fussball-ber...,dpa,Union will Europa-Ticket,Fußball - Berlin,00:40,not_paid,no_author,no_label,https://media-cdn.sueddeutsche.de/image/dpa.ur...,"Berlin,Baden-Württemberg,Bundesliga,Deutschlan...",free,False,article,de_DE,1200,675,Süddeutsche Zeitung,no_readtime_shown,2022-05-07 07:00:00


In [196]:
articles_sz.drop_duplicates(subset='links', keep='last')
articles_sz.drop(['scrape_datetime'], axis=1, inplace=True)

In [203]:
# Create csv file for SZ article-data for further EDA
articles_sz.to_csv('articles_sz_ges.csv', encoding='utf8')