## Süddeutsche Zeitung
#### Code for scraping Der Süddeutsche Zeitung 
- https://www.sueddeutsche.de

In [4]:
# Import necessary libraries
import requests
import urllib
import pandas as pd
from bs4 import BeautifulSoup
import nltk

This site kindly uses a regular search string and allows you to search for a date range, so I scraped links from
the search page (50 per page) and then wrote a loop to scrape the text from each of those urls.

In [24]:
# Search term ("Flüchtling") and date range
# as of 04/24/21 I changed this to be sorted by relevance instead of chronologically because it was only
# getting 3 months
search ='?search=Flüchtling&sort=score&all%5B%5D=dep&all%5B%5D=typ&all%5B%5D=sys&time=2015-01-01T00%3A00%2F2015-12-31T23%3A59&startDate=01.01.2015&endDate=31.12.2015'
# Site's base url
base_url = 'https://www.sueddeutsche.de/news/page/'
# results are organized by date within the range so I used multiples of 5 for the page numbers to try for 
# an even sample
n = 5
urls = []
while n <= 100:
    url = base_url+str(n)+search
    n += 5
    urls.append(url)
    
len(urls) #1000 potential articles
#urls

20

In [26]:
# Iterate through the urls of the search pages and get all the article urls embedded there 

art_links = []

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find_all("div", {"class":"entrylist__entry"})
    #print(divs)
    for div in divs:
        a = div.find('a') 
        art_links.append(a.get('href'))
        
    
#art_links    

len(art_links)

1000

In [27]:
# Quick look to make sure everything seems to be working
art_links[:5]

['https://www.sueddeutsche.de/politik/fluechtlingskrise-zug-autobahn-und-faehrverkehr-zwischen-deutschland-und-daenemark-gestoppt-1.2641386',
 'https://www.sueddeutsche.de/politik/fluechtlinge-wie-menschen-in-deutschland-fluechtlingen-helfen-dpa.urn-newsml-dpa-com-20090101-150825-99-08985',
 'https://www.sueddeutsche.de/politik/polen-angst-und-kalkuel-1.2640725',
 'https://www.sueddeutsche.de/politik/migration-es-wird-engdeutschland-erwartet-mehr-noch-fluechtlinge-dpa.urn-newsml-dpa-com-20090101-150912-99-01498',
 'https://www.sueddeutsche.de/politik/international-steinmeier-fordert-saudi-arabien-zu-aufnahme-von-fluechtlingen-auf-dpa.urn-newsml-dpa-com-20090101-151019-99-06011']

In [28]:
# Go through all the collected urls and get the text and date, append to a dictionary for easy df making
date_dict = {}
art_dict_sz = {}
x = ' '
for link in art_links:
    #page = urllib.request.urlopen(link)
    response = requests.get(link)
    if response.status_code=='404': #in case it runs into any 404 messages
        print(url)
        pass
    soup = BeautifulSoup(response.text, 'html.parser')
    #soup = BeautifulSoup(page)
    t = soup.find("div", {"class":"css-1jpy2hx e1lg1pmy0"})
    if t==None:
        art_dict_sz[link] = 'None' # A few links are broken it seems
        pass
        #print(t)
    else:
        d = soup.find("time",  {"class":"css-1ccsr7y"})
        date = d.text
        paras = t.findAll('p')
        a = [p.text for p in paras]
        text = x.join(a)
        date_dict[link] = date
        art_dict_sz[link] = text
    
#art_dict_sz
#date_dict

In [30]:
# Lost a couple along the way, overall it still seems a good sample
# Since changing to sort by relevance, there are 8 more articles
len(art_dict_sz)
len(date_dict)

990

In [31]:
#Create a dataframe from the two dictionaries
df_sz = pd.DataFrame.from_dict(art_dict_sz, orient='index')
df_sz.reset_index(inplace=True)
df_sz.columns = ['href', 'text']
df_sz['date']= df_sz['href'].map(date_dict)

# dropping NaN's
df_sz = df_sz.dropna(axis=0, how='any')


In [32]:
# add word count, sents, sent lens? token, types, ttr
word_c = df_sz.text.str.split().map(len)
df_sz['word_count'] = word_c
df_sz['sent_count'] = df_sz['text'].map(lambda s: len(nltk.sent_tokenize(s))) 
df_sz['toks'] = df_sz['text'].map(lambda t: len(nltk.word_tokenize(t))) 
df_sz['types'] = df_sz['text'].map(lambda x: len(set(nltk.word_tokenize(x)))) 
df_sz['TTR'] = df_sz.types/df_sz.toks

df_sz.head()

Unnamed: 0,href,text,date,word_count,sent_count,toks,types,TTR
0,https://www.sueddeutsche.de/politik/fluechtlin...,Die europäische Flüchtlingskrise hat den Norde...,"9. September 2015, 17:25 Uhr",923,60,1051,473,0.450048
1,https://www.sueddeutsche.de/politik/fluechtlin...,Berlin (dpa) - Bilder rechtsextremer Ausschrei...,"26. August 2015, 8:00 Uhr",649,48,797,456,0.572146
2,https://www.sueddeutsche.de/politik/polen-angs...,Erzbischof Stanisław Gądecki ist seinem Chef v...,"9. September 2015, 19:05 Uhr",330,19,376,234,0.62234
3,https://www.sueddeutsche.de/politik/migration-...,München/Berlin (dpa) - Die Behörden in Deutsch...,"12. September 2015, 15:02 Uhr",572,38,681,357,0.524229
4,https://www.sueddeutsche.de/politik/internatio...,Riad (dpa) - Außenminister Frank-Walter Steinm...,"19. Oktober 2015, 12:47 Uhr",68,4,80,63,0.7875


In [33]:
# Looking good!
df_sz.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 990 entries, 0 to 999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   href        990 non-null    object 
 1   text        990 non-null    object 
 2   date        990 non-null    object 
 3   word_count  990 non-null    int64  
 4   sent_count  990 non-null    int64  
 5   toks        990 non-null    int64  
 6   types       990 non-null    int64  
 7   TTR         990 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 69.6+ KB


In [34]:
df_sz.describe()

Unnamed: 0,word_count,sent_count,toks,types,TTR
count,990.0,990.0,990.0,990.0,990.0
mean,320.090909,20.450505,372.70202,208.315152,0.647938
std,265.585881,17.888687,310.979566,140.115162,0.123858
min,12.0,1.0,13.0,13.0,0.387226
25%,82.0,6.0,94.0,75.0,0.545873
50%,244.5,15.0,285.0,173.5,0.626368
75%,516.0,31.0,604.0,327.0,0.765629
max,2766.0,189.0,3241.0,1255.0,1.0


In [35]:
# Pickling the dataframe for easy use later, .gitignore is updated with .pkl files
pd.to_pickle(df_sz, "sz_df.pkl")