# Description

In this notebook, we extract all metadata information about individual works in noscemus from https://wiki.uibk.ac.at/noscemus/Category:Works

* INPUT: webpages of individual works on https://wiki.uibk.ac.at/noscemus/Category:Works
* OUTPUT: tabular data in one CSV file: `../data/noscemus_metadata.csv`

In [2]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [7]:
%%time
# Web test, extraction of clomun headers and dataframe extraction. (also checks time of execution. time for whole corpus ~="time"*994)
url = "https://wiki.uibk.ac.at/noscemus/A_Latin_Letter_containing_some_Animadversions_upon_Mr._Isaac_Newton,_his_Theory_of_Light"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="wikitable")
    #data = [item.get_text(strip=True) for item in table.find_all("td")]
    indices = [item.get_text(strip=True) for item in table.find_all("th")]
    metadata_table = pd.DataFrame([indices])
else:
    print("Request error, response code is:", response.status_code)

CPU times: user 27 ms, sys: 3.43 ms, total: 30.4 ms
Wall time: 545 ms


In [23]:
# creation of list of works of noscemus based on which i am iterating throught the corpus
url = ["https://wiki.uibk.ac.at/noscemus/_-_/index.php?title=Category:Works&pageuntil=De+curandis+vulneribus+sclopettorum#mw-pages", "https://wiki.uibk.ac.at/noscemus/_-_/index.php?title=Category:Works&pagefrom=De+curandis+vulneribus+sclopettorum#mw-pages", "https://wiki.uibk.ac.at/noscemus/_-_/index.php?title=Category:Works&pagefrom=Discursus+astronomicus+novissimus#mw-pages", "https://wiki.uibk.ac.at/noscemus/_-_/index.php?title=Category:Works&pagefrom=In+opus+revolutionum+Nicolai+Copernici+Torunnaei+dialogus#mw-pages", "https://wiki.uibk.ac.at/noscemus/_-_/index.php?title=Category:Works&pagefrom=Petri+Nonii+Salaciensis+opera#mw-pages",]
seznam =[]
for item in url:
    response = requests.get(item)
    soup = BeautifulSoup(response.content, "html.parser")
    tag = soup.find("div", class_="mw-category")
    seznam.extend(re.findall("(?<=href=\").*(?=\" title)", str(tag)))

In [95]:
url = "https://wiki.uibk.ac.at"+seznam[150]
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table", class_="wikitable")

In [104]:
table

<table class="wikitable">
<tr>
<th> Author
</th>
<td> <a href="/noscemus/Remus,_Georg" title="Remus, Georg">Remus, Georg</a>, <a href="/noscemus/Wolff,_Christian_von" title="Wolff, Christian von">Wolff, Christian von</a>
</td></tr>
<tr>
<th> Full title
</th>
<td> Consideratio physico-mathematica hiemis proxime praeterlapsae quam rectore magnicientissimo serenissimo principe ac domino DN. Philippo Wilhelmo, principe Borussiae, marchione Brandenburgico, caetera, praeside Christiano Wolfio, mathematum Professore Publico Ordinario, ad diem XIII Junii anno MDCCIX. horis antemeridianis in auditorio majori publico eruditorum examini submittet Georgius Remus, Gedansis.
</td></tr>
<tr>
<th> Year
</th>
<td> 1709
</td></tr>
<tr>
<th> Place
</th>
<td> Halle (Saale)
</td></tr>
<tr>
<th> Publisher/Printer
</th>
<td> Zeidler, Andreas
</td></tr>
<tr>
<th> Era
</th>
<td> 18th century
</td></tr>
<tr>
<th> Form/Genre
</th>
<td> Dissertation
</td></tr>
<tr>
<th> Discipline/Content
</th>
<td> Physics, Mete

In [112]:
def get_wikientry_data(table):
    data = {}
    for tr in table.find_all("tr"):
        if 'onlyforeditors' in tr.get('class', []):
            try:
                tds = tr.find_all("td")
                data[tds[0].get_text(strip=True)] = tds[1].get_text(strip=True)
            except:
                pass
        else:
            try:
                data[tr.th.get_text(strip=True)] = tr.td.get_text(strip=True)
            except:
                try:
                    tds = tr.find_all("td")
                    data[tds[0].get_text(strip=True)] = tds[1].get_text(strip=True)                
                except:
                    pass
    return data

In [113]:
data = get_wikientry_data(table)

In [114]:
data

{'Author': 'Remus, Georg,Wolff, Christian von',
 'Full title': 'Consideratio physico-mathematica hiemis proxime praeterlapsae quam rectore magnicientissimo serenissimo principe ac domino DN. Philippo Wilhelmo, principe Borussiae, marchione Brandenburgico, caetera, praeside Christiano Wolfio, mathematum Professore Publico Ordinario, ad diem XIII Junii anno MDCCIX. horis antemeridianis in auditorio majori publico eruditorum examini submittet Georgius Remus, Gedansis.',
 'Year': '1709',
 'Place': 'Halle (Saale)',
 'Publisher/Printer': 'Zeidler, Andreas',
 'Era': '18th century',
 'Form/Genre': 'Dissertation',
 'Discipline/Content': 'Physics, Meteorology/Earth sciences',
 'Original': 'Consideratio physico-mathematica hiemis proxime praeterlapsae (Halle 1709)(Digitale Bibliothek Uni Halle)',
 'Digital sourcebook': '691456',
 'Description': 'TheConsideratiois a dissertation presented under thepraesesChristoph Wolff in Halle on the topic of ‘The Great Frost’ (‘der Jahrtausendwinter’; ‘le Grand

In [115]:
%%time
#writing metadata from every work in noscemus corpus into pd.dataframe "metadata_table"
all_data = []
for page in seznam:
    url = "https://wiki.uibk.ac.at"+page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="wikitable")
    data = get_wikientry_data(table)
    all_data.append(data)

CPU times: user 43.2 s, sys: 1.41 s, total: 44.7 s
Wall time: 6min 38s


In [116]:
metadata_table = pd.DataFrame(all_data)

In [117]:
#export dataframe into .csv file. Please insert your desired file location.
metadata_table.to_csv("../data/metadata_table.csv", index=False)