## TAZ
#### Code for scraping Die Tageszeitung 
- https://taz.de

In [51]:
#Import necessary libraries
import pickle
import urllib
import urllib.request

import nltk
import pandas as pd 
import requests
from bs4 import BeautifulSoup

In [9]:
# Read the article URLs from the .txt file, one URL per line.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the repository so the notebook runs on other machines.
fname = '/Users/emilymartin/Documents/Data_Science/Fluechtlingskrise-Sentiment-Analysis/taz_urls.txt'
with open(fname, encoding='utf-8') as f:
    # Strip trailing whitespace/newlines and skip blank lines so that an empty
    # line in the file never reaches the scraper as an empty "URL".
    lines = [line.rstrip() for line in f if line.strip()]

In [11]:
# Sanity check: the URL file is expected to contain 100 article links
len(lines)
# Uncomment to peek at the first few URLs:
#lines[:10]

100

In [48]:
# Scrape every article URL in `lines` (the list read from the URL file).
# Two dictionaries are built, both keyed by the article link:
#   date_dict    -> the <li class="date"> tag found on the page
#   art_dict_taz -> the text of all <p> tags inside the main article
#                   container, joined with a single space
date_dict = {}
art_dict_taz = {}
x = ' '  # separator placed between paragraphs
for link in lines:
    # The context manager closes the HTTP response once the page is parsed.
    with urllib.request.urlopen(link) as page:
        # NOTE(review): no parser is named, so Beautiful Soup picks whichever
        # one is installed; consider pinning one for reproducible parsing.
        soup = BeautifulSoup(page)
    article = soup.find("div", {"class": "main rack first_rack"})
    paragraphs = [p.text for p in article.find_all('p')]
    date_dict[link] = soup.find("li", {"class": "date"})
    art_dict_taz[link] = x.join(paragraphs)


In [49]:
# The full timestamp is not needed for this data, so each entry is reduced to
# the first 10 characters of the date tag's 'content' attribute.
# Entries that are already strings are skipped, so re-running this cell is
# safe; without the check a second run would fail when indexing a string
# with 'content'.
for k, v in date_dict.items():
    if not isinstance(v, str):
        date_dict[k] = v['content'][0:10]

In [52]:
# Build the article dataframe from the two dictionaries, which share the
# article link as key: one row per article with its link and text.
df_taz = pd.DataFrame(list(art_dict_taz.items()), columns=['href', 'text'])
# Look up each article's date through the common key 'href'.
df_taz['date'] = df_taz['href'].map(date_dict)

# Word count: number of whitespace-separated chunks in the text
word_c = df_taz['text'].str.split().map(len)
df_taz['word_count'] = word_c

# Sentence count from the NLTK sentence tokenizer
# NOTE(review): the NLTK tokenizers are called with their default language
# although the articles are German; passing language='german' would change
# the counts - confirm before switching.
df_taz['sent_count'] = df_taz['text'].map(lambda s: len(nltk.sent_tokenize(s)))

# Tokenize each text once and derive both the token count and the type
# (distinct token) count from the same token list.
taz_tokens = df_taz['text'].map(nltk.word_tokenize)
df_taz['toks'] = taz_tokens.map(len)
df_taz['types'] = taz_tokens.map(lambda toks: len(set(toks)))
# Type-token ratio
df_taz['TTR'] = df_taz.types/df_taz.toks

# A quick look at the new dataframe
df_taz

Unnamed: 0,href,text,date,word_count,sent_count,toks,types,TTR
0,https://taz.de/Altersfeststellung-bei-Fluechtl...,Bei der Altersfeststellung minderjähriger Flüc...,2015-08-18,678,41,815,398,0.488344
1,https://taz.de/Fluechtlingspolitik-in-Deutschl...,Die Mehrheit der Deutschen lehnt grenznahe Auf...,2015-07-26,343,22,379,223,0.588391
2,https://taz.de/Kommentar-Verfassungsschutz/!50...,Die Reform des V-Leute-Wesens ist eine Charmeo...,2015-03-25,266,20,303,195,0.643564
3,https://taz.de/Erstaufnahme-in-Neumuenster/!52...,Seit dem Wochenende werden unbegleitete minder...,2015-11-02,735,41,878,429,0.488610
4,https://taz.de/Kommentar-Katastrophe-im-Mittel...,Es gibt eine völkerrechtliche Verpflichtung zu...,2015-04-20,339,23,400,237,0.592500
...,...,...,...,...,...,...,...,...
95,https://taz.de/Fluechtlingsfussball/!5229139/,In der Sömmeringhalle in Charlottenburg spiele...,2015-09-13,388,33,462,262,0.567100
96,https://taz.de/Fluechtlinge-in-Hoyerswerda/!52...,"Eine Initiative versucht, die Flüchtlinge in H...",2015-06-26,1551,116,1871,789,0.421700
97,https://taz.de/Wie-Fluechtlinge-nach-Berlin-ko...,Das Bild des Fluchthelfers hat sich gewandelt....,2015-07-23,544,37,623,333,0.534510
98,https://taz.de/Bildung-fuer-Fluechtlinge/!5232...,„Neuzuwanderer-Klassen“ sollen Flüchtlinge auf...,2015-09-17,716,63,862,434,0.503480


In [53]:
# Summary of the dataframe: column dtypes and non-null counts
df_taz.info()
# All columns are fully populated (no null values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   href        100 non-null    object 
 1   text        100 non-null    object 
 2   date        100 non-null    object 
 3   word_count  100 non-null    int64  
 4   sent_count  100 non-null    int64  
 5   toks        100 non-null    int64  
 6   types       100 non-null    int64  
 7   TTR         100 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 6.4+ KB


In [54]:
# Persist the dataframe as a pickle so later notebooks can load it directly;
# .pkl files are listed in .gitignore.
df_taz.to_pickle("taz_df.pkl")
