# Obdelava podatkov

In [128]:
import re
from bs4 import BeautifulSoup
import requests
import os
import csv

## 1. Pridobivanje podatkov
Prvo definiramo spremenljivke:

In [129]:
podatki_url = 'https://en.wikipedia.org/wiki/List_of_chemical_elements'

directory_dat = 'Surovi podatki'

datoteka_html = 'podatki.html'


Potem napišemo funkcijo, ki podatke dobi iz spleta:

In [130]:
def nalozi_iz_spleta(url):
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            raise ValueError(f'Prišlo je do čudne kode: {response.status_code}')
    except Exception:
        print(f'Prišlo je do napake pri nalaganju strani {url}')

potem napisemo funkcijo, ki bo te podatke shranila v file:

In [131]:
def shrani_v_datoteko(text, directory, file):
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, file)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
    return None

in še funkcije, ki združi prejšnji funkciji:

In [132]:
def shrani(page, directory, file):
    vsebina = nalozi_iz_spleta(page)
    if vsebina != None:
        shrani_v_datoteko(vsebina, directory, file) 

Potem poženemo funkcijo, ki nam iz spletne strani shrani html kodo v datoteko na računalniku.

In [133]:
shrani(podatki_url, directory_dat, datoteka_html)

## 2. Izluščenje podatkov

In [134]:
def preberi_datoteko(dir, dat):
    pat = os.path.join(dir, dat)
    with open(pat, 'r', encoding = 'utf-8') as f:
        return f.read()

In [135]:
p = preberi_datoteko(directory_dat, datoteka_html)
print(p)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-disabled vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of chemical elements - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-li

In [144]:
def luscenje(dir, dat):
    #najprej iz html kode izoliramo tabelo
    surovo = preberi_datoteko(dir, dat)
    zac_tab = surovo.index(r'<table')
    kon_tab = surovo.index(r'</table')
    tabela = surovo[zac_tab : kon_tab]
    vrstice = re.split('<tr', tabela) #naredimo seznam vrstic
    kljuci_slovarja = ['atomsko_st', 'simbol', 'ime', 'izvor_imena', 'skupina', 'perioda', 'blok', 'rel_at_masa', 'gostota', 'talisce', 'vrelisce', 'spec_toplota', 'el_neg', 'pogostost_v_zem_skorji', 'izvor', 'argegato']
    izl_vrstica = {
        'atomsko_st' : '',
        'simbol' : '',
        'ime' : '',
        'izvor_imena' : '',
        'skupina' : '',
        'perioda' : '',
        'blok' : '',
        'rel_at_masa' : '',
        'gostota' : '',
        'talisce' : '',
        'vrelisce' : '',
        'spec_toplota' : '',
        'el_neg' : '',
        'pogostost_v_zem_skorji' : '',
        'izvor' : '',
        'argegato' : ''
    }
    vrne = []
    for vrstica in vrstice[5:]:     #spustimo glavo
        celice = re.split(r'<td.*?>', vrstica, flags=re.DOTALL)     #ločimo vrstico na celice
        i = 0
        for x in celice[1:]:
            izl_celica = re.sub(r'<.*?>', '', x)
            izl_celica = re.sub(r'\n*', '', izl_celica)
            izl_celica = re.sub('&#8211;', '–', izl_celica)
            izl_vrstica[kljuci_slovarja[i]] = izl_celica
            i += 1
        print(izl_vrstica)
    return


In [145]:
p = luscenje(directory_dat, datoteka_html)
print(p)

{'atomsko_st': '1', 'simbol': 'H', 'ime': 'Hydrogen', 'izvor_imena': "Greek elements hydro- and -gen, 'water-forming'", 'skupina': '1', 'perioda': '1', 'blok': 's-block', 'rel_at_masa': '1.0080', 'gostota': '0.00008988', 'talisce': '14.01', 'vrelisce': '20.28', 'spec_toplota': '14.304', 'el_neg': '2.20', 'pogostost_v_zem_skorji': '1400', 'izvor': 'primordial', 'argegato': 'gas'}
{'atomsko_st': '2', 'simbol': 'He', 'ime': 'Helium', 'izvor_imena': "Greek hḗlios, 'sun'", 'skupina': '18', 'perioda': '1', 'blok': 's-block', 'rel_at_masa': '4.0026', 'gostota': '0.0001785', 'talisce': '–&#91;k&#93;', 'vrelisce': '4.22', 'spec_toplota': '5.193', 'el_neg': '–', 'pogostost_v_zem_skorji': '0.008', 'izvor': 'primordial', 'argegato': 'gas'}
{'atomsko_st': '3', 'simbol': 'Li', 'ime': 'Lithium', 'izvor_imena': "Greek líthos, 'stone'", 'skupina': '1', 'perioda': '2', 'blok': 's-block', 'rel_at_masa': '6.94', 'gostota': '0.534', 'talisce': '453.69', 'vrelisce': '1560', 'spec_toplota': '3.582', 'el_neg'