Skip to content

Commit

Permalink
Merge pull request #142 from Datenschule/bayern-wfs
Browse files Browse the repository at this point in the history
[BY] Use WFS to get data
  • Loading branch information
k-nut authored Jul 15, 2024
2 parents b9e41d2 + 45d7bea commit 46f7265
Showing 1 changed file with 39 additions and 62 deletions.
101 changes: 39 additions & 62 deletions jedeschule/spiders/bayern.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,50 @@
# -*- coding: utf-8 -*-
from urllib import parse

import xml.etree.ElementTree as ET
import scrapy
from scrapy import Item
from scrapy.shell import inspect_response

from jedeschule.items import School
from jedeschule.utils import get_first_or_none, cleanjoin


class BayernSpider(scrapy.Spider):
# Scrapy spider collecting school data for Bavaria (Bayern).
name = "bayern"
# allowed_domains = ["https://www.km.bayern.de/schueler/schulsuche.html"]
# NOTE(review): start_urls is reassigned further down to a WFS GetCapabilities
# endpoint; this HTML school-search URL looks superseded — confirm which is live.
start_urls = ['https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite=1']

def parse(self, response):
    """Discover how many paginated result pages exist and queue them all.

    Reads the last pager link's text as the page count and yields one
    request per result page, delegating extraction to parse_list.
    """
    number_of_pages = response.css("div.schulsuche > div > p.Right a:last-child::text").extract_first()
    if number_of_pages is None:
        # Pager not found (layout change or empty result) — previously this
        # crashed with TypeError on int(None). Log and stop instead.
        self.logger.warning("Could not determine number of result pages on %s", response.url)
        return
    # Loop-invariant template hoisted out of the loop.
    url = "https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite={page}"
    for page in range(1, int(number_of_pages) + 1):
        yield scrapy.Request(url.format(page=page),
                             callback=self.parse_list)

def parse_list(self, response):
    """Follow every school detail link found on one result page."""
    for href in response.css('.ListSchools a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

def get_lat_lon(self, response):
    """Extract (latitude, longitude) from the geoportal link on a detail page.

    The link's href carries the coordinates as N=.../E=... query parameters.
    Returns (None, None) when the link is missing or malformed.
    """
    geoportal_href = response.css("article > a::attr(href)").extract_first()
    try:
        querystring = parse.parse_qs(geoportal_href)
        return querystring['N'][0], querystring['E'][0]
    # Narrowed from a bare `except:` (which also swallowed SystemExit /
    # KeyboardInterrupt): AttributeError/TypeError when href is None,
    # KeyError/IndexError when N or E is absent or empty.
    except (AttributeError, TypeError, KeyError, IndexError):
        return None, None

def parse_detail(self, response):
    """Scrape one school's detail page into a plain dict item."""
    texts = response.css("article ::text")
    # The first <p> inside <article> holds exactly two text nodes:
    # street line and city line.
    street, city = response.css("article > p")[0].css("::text").extract()
    item = {
        'street': street,
        'city': city,
        'name': cleanjoin(response.css('article h1::text').extract(), ""),
        'phone': get_first_or_none(texts.re("Telefon: ([0-9 /]+)")),
        'fax': get_first_or_none(texts.re("Fax: ([0-9 /]+)")),
        'web': response.css("article a::attr(href)").extract_first(),
        'number': get_first_or_none(texts.re("Schulnummer: ([0-9]+)")),
        'school_type': get_first_or_none(texts.re("Schulart: (.+)")),
        'type': get_first_or_none(texts.re("Rechtlicher Status: (.+)")),
        'teachers': get_first_or_none(texts.re("Hauptamtliche Lehrkräfte: ([0-9]+)")),
        'students': get_first_or_none(texts.re("Schüler: ([0-9]+)")),
        'url': response.url,
    }
    item['latitude'], item['longitude'] = self.get_lat_lon(response)
    yield item
start_urls = ['https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities']

def parse(self, response, **kwargs):
    """Read the WFS GetCapabilities document and request every feature type."""
    capabilities = ET.fromstring(response.body)
    wfs_ns = "{http://www.opengis.net/wfs/2.0}"
    base_url = 'https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename='
    for feature_type in capabilities.iter(wfs_ns + "FeatureType"):
        # The Title doubles as the typename for the GetFeature request.
        title = feature_type.findtext(wfs_ns + "Title")
        yield scrapy.Request(base_url + title,
                             callback=self.parse_resource,
                             cb_kwargs={"feature": title})

def parse_resource(self, response, feature):
    """Turn one WFS GetFeature response into per-school dict items.

    feature is the WFS typename (e.g. "schul:..."); each matching element
    becomes one dict keyed by the namespace-stripped child tag names, plus
    'id', 'lat' and 'lon'.
    """
    gml_ns = "http://www.opengis.net/gml/3.2"
    schul_ns = "http://gdi.bayern/brbschul"
    namespaces = {
        "gml": gml_ns,
        "schul": schul_ns
    }
    tree = ET.fromstring(response.body)
    element_tag = feature.replace("schul:", "{" + schul_ns + "}")
    for school in tree.iter(element_tag):
        record = {'id': school.attrib["{" + gml_ns + "}id"]}
        for child in school:
            if child.tag == "{" + schul_ns + "}geometry":
                pos = child.findtext("gml:Point/gml:pos", namespaces=namespaces)
                # NOTE(review): assumes gml:pos is "lat lon" (EPSG:4326 axis
                # order) — confirm against the service output.
                record["lat"], record["lon"] = pos.split(" ")
            else:
                # Strip the namespace prefix before returning.
                record[child.tag.split("}", 1)[1]] = child.text
        yield record

@staticmethod
def normalize(item: Item) -> School:
    """Map a raw WFS feature dict onto the shared School item.

    Keys (schulname, strasse, ort, …) are those emitted by parse_resource.
    """
    # Removed an unreachable second return and a stale first body left over
    # from the pre-WFS scraper: it read keys (city, name, phone, …) the WFS
    # pipeline never yields and crashed on item.get('city').split() when
    # 'city' was absent. Only the WFS mapping below is kept.
    return School(name=item.get('schulname'),
                  address=item.get('strasse'),
                  city=item.get('ort'),
                  school_type=item.get('schulart'),
                  zip=item.get('postleitzahl'),
                  id='BY-{}'.format(item.get('id')),
                  latitude=item.get('lat'),
                  longitude=item.get('lon'))

0 comments on commit 46f7265

Please sign in to comment.