In [6]:
# Importing Libraries and Modules
import os
import re
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Downloading a German SpaCy Language Model
!python -m spacy download de_core_news_lg

Collecting de-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.7.0/de_core_news_lg-3.7.0-py3-none-any.whl (567.8 MB)
     ---------------------------------------- 0.0/567.8 MB ? eta -:--:--
     ---------------------------------------- 0.3/567.8 MB 7.9 MB/s eta 0:01:12
     ---------------------------------------- 0.7/567.8 MB 9.3 MB/s eta 0:01:01
     --------------------------------------- 2.2/567.8 MB 17.2 MB/s eta 0:00:33
     --------------------------------------- 3.9/567.8 MB 22.5 MB/s eta 0:00:26
     --------------------------------------- 4.9/567.8 MB 22.6 MB/s eta 0:00:25
     --------------------------------------- 5.8/567.8 MB 23.2 MB/s eta 0:00:25
     --------------------------------------- 5.8/567.8 MB 23.2 MB/s eta 0:00:25
      -------------------------------------- 8.6/567.8 MB 23.8 MB/s eta 0:00:24
      ------------------------------------- 10.3/567.8 MB 25.2 MB/s eta 0:00:23
      ---------------------


[notice] A new release of pip is available: 23.3 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Loading SpaCy Model and Defining Entity Extraction Function
import spacy

model = spacy.load("de_core_news_lg")

def get_entities(sentence):
  doc = model(sentence)
  ent_lst = []
  for ent in doc.ents:
    if ent.label_ != 'MISC':
      ent_lst.append(ent) #  ent_lst.append((ent, ent.label_)) here the label is extracted with the word.. 
  return ent_lst

In [4]:
# Processing XML Files
input_folder = 'DATA/tei'

data_dict = {}

# Loop through each file in the input folder and add data to a dictionary with the ID as a key
for filename in os.listdir(input_folder):
    if filename.endswith('.xml'):  # Process only XML files
        input_file_path = os.path.join(input_folder, filename)
        
        temp_list = []

        # Extract the metadata
        tree = ET.parse(input_file_path)
        root = tree.getroot()

        date = root.find('.//date').text if root.find('.//date') is not None else ""
        temp_list.append(date)

        place = root.find('.//pubPlace').text if root.find('.//pubPlace') is not None else ""
        temp_list.append(place)

        # Extract the content
        with open(input_file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Replace "<" with " <" to add a space before XML tags
        content = content.replace("<", " <")
        
        # Replace "=<" with "<" to remove the space before XML tags
        content = content.replace("= <", "<")

        soup = BeautifulSoup(content, 'xml')
        text = soup.get_text()
        
        # Replace "=" with "-"
        text = text.replace("=", "-")

        # Use regular expression to find and remove text before the first occurrence of "Mannheim" and remove leading/trailing whitespace
        new_text = re.sub(r"^(.*?Mannheim)", "", text, flags=re.DOTALL)

        # Remove some of the whitespace
        new_string = new_text.strip().replace('\n\n', '\n')
        temp_list.append(new_string)

        # Extract the named entities
        ner_results = get_entities(new_string)
        unique_tags = list(set(ner_results))
        temp_list.append(unique_tags)


        data_dict[filename[:8]] = temp_list

In [5]:
# Creating DataFrame and Modifying Index
df = pd.DataFrame.from_dict(data_dict, orient='index', columns=['Date', 'Place', 'Content', 'Words']).reset_index()

df = df.rename(columns={'index': 'ID'})

print(df.head())

         ID        Date              Place  \
0  khz01001  10-01-1744  Erfurt, Thüringen   
1  khz01002  13-01-1744  Erfurt, Thüringen   
2  khz01003  17-01-1744  Erfurt, Thüringen   
3  khz01004  07-02-1744  Erfurt, Thüringen   
4  khz01005  21-02-1744  Erfurt, Thüringen   

                                             Content  \
0  Der allerneuesten \n\nEuropäischen \n\nWelt- u...   
1  Der allerneuesten   Europäischen   Welt- und S...   
2  Der allerneuesten   Europäischen   Welt- und S...   
3  Der allerneuesten   Europäischen   Welt- und S...   
4  Der allerneuesten   Europäischen   Welt-und St...   

                                               Words  
0  [(Clevischen), (Graf, von), (Holland), (hertzl...  
1  [(Fürst, von, Lichtenstein), (Ubergangs), (Eng...  
2  [(Amts,  ), (Holland), (Jtalien), (August, von...  
3  [(Po), (Jhren, Sold), (Chur), (Pescaro), (D), ...  
4  [(Frantzösischen, Escadern), (Canonicum), (hof...  


In [6]:
# Removing All Entries Without Proper Date Format
df_new = df[df['Date'].astype(str).str.len() == 10]

In [7]:
# Converting Date Format and Extracting Year and Day-Month
print(df_new['Date'].dtype)
df_new['Date'] = pd.to_datetime(df_new['Date'], format='%d-%m-%Y')
print(df_new['Date'].dtype)

object
datetime64[ns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Date'] = pd.to_datetime(df_new['Date'], format='%d-%m-%Y')


In [8]:
df_new['Year'] = df_new['Date'].dt.year
df_new['Day_month'] = df_new['Date'].dt.strftime('%d-%m')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Year'] = df_new['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Day_month'] = df_new['Date'].dt.strftime('%d-%m')


In [9]:
# Splitting Day and Month
split_values = df_new['Day_month'].str.split('-', expand=True)
df_new['Day'] = split_values[0]
df_new['Month'] = split_values[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Day'] = split_values[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Month'] = split_values[1]


In [10]:
# Geographic Data Correction
def geo (place):
    if place == 'Leibzig, Sachsen':
        return 'Leipzig, Sachsen'
    elif place == 'Stuttgart und Tübingen, Württemberg':
        return 'Stuttgart, Württemberg'
    else: 
        return place

df_new['Place_corrected'] = df_new['Place'].apply(geo)

print(df_new.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Place_corrected'] = df_new['Place'].apply(geo)


         ID       Date              Place  \
0  khz01001 1744-01-10  Erfurt, Thüringen   
1  khz01002 1744-01-13  Erfurt, Thüringen   
2  khz01003 1744-01-17  Erfurt, Thüringen   
3  khz01004 1744-02-07  Erfurt, Thüringen   
4  khz01005 1744-02-21  Erfurt, Thüringen   

                                             Content  \
0  Der allerneuesten \n\nEuropäischen \n\nWelt- u...   
1  Der allerneuesten   Europäischen   Welt- und S...   
2  Der allerneuesten   Europäischen   Welt- und S...   
3  Der allerneuesten   Europäischen   Welt- und S...   
4  Der allerneuesten   Europäischen   Welt-und St...   

                                               Words  Year Day_month Day  \
0  [(Clevischen), (Graf, von), (Holland), (hertzl...  1744     10-01  10   
1  [(Fürst, von, Lichtenstein), (Ubergangs), (Eng...  1744     13-01  13   
2  [(Amts,  ), (Holland), (Jtalien), (August, von...  1744     17-01  17   
3  [(Po), (Jhren, Sold), (Chur), (Pescaro), (D), ...  1744     07-02  07   
4  [(Frantzö

In [11]:
places = df_new['Place_corrected'].unique()
print(places)

['Erfurt, Thüringen' 'Rudolstadt, Thüringen' 'Augsburg, Bayern'
 'Frankfurt, Hessen' 'Würzburg, Bayern' 'Bayreuth, Bayern'
 'Berlin, Preussen' 'Breslau, Schlesien' 'Leipzig, Sachsen' 'Bremen'
 'Mainz' 'Stuttgart, Württemberg' 'Hattingen, Nordrhein-Westfalen'
 'Berlin' 'Tübingen, Württemberg' 'Wien, Österreich']


In [7]:
# Using Geopy for Geographic Coordinates
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [15]:
coord_dict = {}

# Initialize geolocator
geolocator = Nominatim(user_agent="geoapiExercises")

# Use rate limiter to not hit service limits
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Function to get coordinates
def fetch_coordinates(place):
    try:
        location = geocode(place)
        return pd.Series([location.latitude, location.longitude])
    except AttributeError:
        return pd.Series([None, None])

for place in places:
    location = fetch_coordinates(place)
    coord_dict[place] = [location.iloc[0], location.iloc[1]]

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Erfurt, Thüringen',), **{}).
Traceback (most recent call last):
  File "c:\Users\algro\AppData\Local\Programs\Python\Python311\Lib\site-packages\urllib3\connection.py", line 174, in _new_conn
    conn = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\algro\AppData\Local\Programs\Python\Python311\Lib\site-packages\urllib3\util\connection.py", line 95, in create_connection
    raise err
  File "c:\Users\algro\AppData\Local\Programs\Python\Python311\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    sock.connect(sa)
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\algro\AppData\Local\Programs\Python\Python311\Lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
                       ^^^^^^^^^^^^^^^^^^^

In [23]:
print(coord_dict)

{'Erfurt, Thüringen': [50.9777974, 11.0287364], 'Rudolstadt, Thüringen': [50.7206063, 11.3401985], 'Augsburg, Bayern': [48.3690341, 10.8979522], 'Frankfurt, Hessen': [50.1106444, 8.6820917], 'Würzburg, Bayern': [49.7933723, 9.9309779], 'Bayreuth, Bayern': [49.9427202, 11.5763079], 'Berlin, Preussen': [52.517253, 13.3927431], 'Breslau, Schlesien': [51.1263106, 16.97819633051261], 'Leipzig, Sachsen': [51.3406321, 12.3747329], 'Bremen': [53.0758196, 8.8071646], 'Mainz': [50.0012314, 8.2762513], 'Stuttgart, Württemberg': [48.7820489, 9.2687566], 'Hattingen, Nordrhein-Westfalen': [51.4007175, 7.1862486], 'Berlin': [52.5170365, 13.3888599], 'Tübingen, Württemberg': [48.5236164, 9.0535531], 'Wien, Österreich': [48.2083537, 16.3725042]}


In [24]:
# Add Latitude and Longitude to DataFrame 
def get_coordinates(place):
    if place in coord_dict:
        return pd.Series(coord_dict[place])
    else:
        return pd.Series([None, None])

df_new[['latitude', 'longitude']] = df_new['Place_corrected'].apply(get_coordinates).apply(pd.Series)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[['latitude', 'longitude']] = df_new['Place_corrected'].apply(get_coordinates).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[['latitude', 'longitude']] = df_new['Place_corrected'].apply(get_coordinates).apply(pd.Series)


In [25]:
df_new

Unnamed: 0,ID,Date,Place,Content,Words,Year,Day_month,Day,Month,Place_corrected,latitude,longitude
0,khz01001,1744-01-10,"Erfurt, Thüringen",Der allerneuesten \n\nEuropäischen \n\nWelt- u...,"[(Clevischen), (Graf, von), (Holland), (hertzl...",1744,10-01,10,01,"Erfurt, Thüringen",50.977797,11.028736
1,khz01002,1744-01-13,"Erfurt, Thüringen",Der allerneuesten Europäischen Welt- und S...,"[(Fürst, von, Lichtenstein), (Ubergangs), (Eng...",1744,13-01,13,01,"Erfurt, Thüringen",50.977797,11.028736
2,khz01003,1744-01-17,"Erfurt, Thüringen",Der allerneuesten Europäischen Welt- und S...,"[(Amts, ), (Holland), (Jtalien), (August, von...",1744,17-01,17,01,"Erfurt, Thüringen",50.977797,11.028736
3,khz01004,1744-02-07,"Erfurt, Thüringen",Der allerneuesten Europäischen Welt- und S...,"[(Po), (Jhren, Sold), (Chur), (Pescaro), (D), ...",1744,07-02,07,02,"Erfurt, Thüringen",50.977797,11.028736
4,khz01005,1744-02-21,"Erfurt, Thüringen",Der allerneuesten Europäischen Welt-und St...,"[(Frantzösischen, Escadern), (Canonicum), (hof...",1744,21-02,21,02,"Erfurt, Thüringen",50.977797,11.028736
...,...,...,...,...,...,...,...,...,...,...,...,...
647,khz21009,1850-10-12,"Wien, Österreich",Wiener Zeitung. \n \n N ro 244. Samst...,"[(Hoheiten), (viribus), (Joseph, , Schütz), (...",1850,12-10,12,10,"Wien, Österreich",48.208354,16.372504
648,khz21010,1850-10-17,"Wien, Österreich",Wiener Zeitung. \n \n N ro 248. Donne...,"[(Besitzanfällen), (Brixen), (Anschlusse), (Ba...",1850,17-10,17,10,"Wien, Österreich",48.208354,16.372504
649,khz21011,1850-10-25,"Wien, Österreich",Wiener Zeitung. \n \n N ro 255. Freit...,"[(Belgier), (Deutschen, Bundes), (Herzogthums,...",1850,25-10,25,10,"Wien, Österreich",48.208354,16.372504
650,khz21012,1850-10-29,"Wien, Österreich",Wiener Zeitung. \n \n N ro 258 Dinsta...,"[(Hrn., J., , Prettner), (FlügelAdjutanten, ...",1850,29-10,29,10,"Wien, Österreich",48.208354,16.372504


In [26]:
# Exporting the Final DataFrame
df_new.to_csv('Tei_dataframe without MISC.csv', index=False)