## Süddeutsche Zeitung
#### Code for scraping Der Süddeutsche Zeitung 
- https://www.sueddeutsche.de

In [24]:
# Import necessary libraries
import requests
import urllib
import pandas as pd
from bs4 import BeautifulSoup
import nltk

This site kindly uses a regular search string and allows you to search for a date range, so I scraped links from
the search page (50 per page) and then wrote a loop to scrape the text from each of those urls.

In [3]:
# Search term ("Flüchtling") and date range
search = '?search=Fl%C3%BCchtling&sort=date&all%5B%5D=dep&all%5B%5D=typ&all%5B%5D=sys&time=2021-03-16T00%3A00%2F2021-03-16T23%3A59&startDate=01.01.2015&endDate=31.12.2015'
# Site's base url
base_url = 'https://www.sueddeutsche.de/news/page/'
# results are organized by date within the range so I used multiples of 5 for the page numbers to try for 
# an even sample
n = 5
urls = []
while n <= 100:
    url = base_url+str(n)+search
    n += 5
    urls.append(url)
    
len(urls) #1000 potential articles

20

In [4]:
# Iterate through the urls of the search pages and get all the article urls embedded there 

art_links = []

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find_all("div", {"class":"entrylist__entry"})
    #print(divs)
    for div in divs:
        a = div.find('a') 
        art_links.append(a.get('href'))
        
    
#art_links    

len(art_links)

1000

In [5]:
# Quick look to make sure everything seems to be working
art_links[:5]

['https://www.sueddeutsche.de/politik/migration-wams-ueber-8000-spezielle-deutschklassen-fuer-fluechtlinge-geschaffen-dpa.urn-newsml-dpa-com-20090101-151227-99-575735',
 'https://www.sueddeutsche.de/politik/migration-italienische-kuestenwache-rettet-weitere-bootsfluechtlinge-dpa.urn-newsml-dpa-com-20090101-151226-99-574575',
 'https://www.sueddeutsche.de/kultur/rueckblick-magische-kinomomente-des-jahres-2015-1.2785573',
 'https://www.sueddeutsche.de/politik/rueckblick-2015-flucht-krieg-und-klimawandel-1.2794106',
 'https://www.sueddeutsche.de/politik/fluechtlinge-schwaebisch-gmuend-will-zeichen-gegen-menschenfeindlichkeit-setzen-1.2796664']

In [6]:
# Go through all the collected urls and get the text and date, append to a dictionary for easy df making
date_dict = {}
art_dict_sz = {}
x = ' '
for link in art_links:
    #page = urllib.request.urlopen(link)
    response = requests.get(link)
    if response.status_code=='404': #in case it runs into any 404 messages
        print(url)
        pass
    soup = BeautifulSoup(response.text, 'html.parser')
    #soup = BeautifulSoup(page)
    t = soup.find("div", {"class":"css-1jpy2hx e1lg1pmy0"})
    if t==None:
        art_dict_sz[link] = 'None' # A few links are broken it seems
        pass
        #print(t)
    else:
        d = soup.find("time",  {"class":"css-1ccsr7y"})
        date = d.text
        paras = t.findAll('p')
        a = [p.text for p in paras]
        text = x.join(a)
        date_dict[link] = date
        art_dict_sz[link] = text
    
#art_dict_sz
#date_dict

In [7]:
# Lost a couple along the way, overall it still seems a good sample
len(art_dict_sz)
len(date_dict)

982

In [35]:
#Create a dataframe from the two dictionaries
df_sz = pd.DataFrame.from_dict(art_dict_sz, orient='index')
df_sz.reset_index(inplace=True)
df_sz.columns = ['href', 'text']
df_sz['date']= df_sz['href'].map(date_dict)

# dropping NaN's
df_sz = df_sz.dropna(axis=0, how='any')


In [36]:
# add word count, sents, sent lens? token, types, ttr
word_c = df_sz.text.str.split().map(len)
df_sz['word_count'] = word_c
df_sz['sent_count'] = df_sz['text'].map(lambda s: len(nltk.sent_tokenize(s))) 
df_sz['toks'] = df_sz['text'].map(lambda t: len(nltk.word_tokenize(t))) 
df_sz['types'] = df_sz['text'].map(lambda x: len(set(nltk.word_tokenize(x)))) 
df_sz['TTR'] = df_sz.types/df_sz.toks

df_sz.head()

Unnamed: 0,href,text,date,word_count,sent_count,toks,types,TTR
0,https://www.sueddeutsche.de/politik/migration-...,Berlin (dpa) - Die Bundesländer haben für die ...,"27. Dezember 2015, 2:45 Uhr",89,5,103,83,0.805825
1,https://www.sueddeutsche.de/politik/migration-...,Rom (dpa) - Im Mittelmeer vor Italien sind auc...,"26. Dezember 2015, 20:51 Uhr",62,4,72,56,0.777778
2,https://www.sueddeutsche.de/kultur/rueckblick-...,1 / 12 Quelle: 20th Century Fox Südseefilme si...,"26. Dezember 2015, 17:57 Uhr",1818,87,2174,1004,0.461822
3,https://www.sueddeutsche.de/politik/rueckblick...,Bei dem Blick zurück auf das Jahr 2015 stechen...,"26. Dezember 2015, 16:00 Uhr",451,28,512,312,0.609375
4,https://www.sueddeutsche.de/politik/fluechtlin...,Nach einem Brandanschlag auf eine noch nicht f...,"26. Dezember 2015, 15:43 Uhr",387,26,451,274,0.607539


In [37]:
# Looking good!
df_sz.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   href        982 non-null    object 
 1   text        982 non-null    object 
 2   date        982 non-null    object 
 3   word_count  982 non-null    int64  
 4   sent_count  982 non-null    int64  
 5   toks        982 non-null    int64  
 6   types       982 non-null    int64  
 7   TTR         982 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 69.0+ KB


In [38]:
df_sz.describe()

Unnamed: 0,word_count,sent_count,toks,types,TTR
count,982.0,982.0,982.0,982.0,982.0
mean,375.175153,24.216904,438.845214,240.369654,0.618964
std,280.159402,19.202361,329.773222,146.551981,0.113869
min,4.0,1.0,5.0,5.0,0.307692
25%,129.5,8.0,155.5,107.25,0.532525
50%,342.0,21.0,397.5,233.5,0.591833
75%,536.0,34.0,636.0,335.75,0.693488
max,2402.0,136.0,2767.0,1172.0,1.0


In [39]:
# Pickling the dataframe for easy use later, .gitignore is updated with .pkl files
pd.to_pickle(df_sz, "sz_df.pkl")