In [21]:
import pandas as pd

In [22]:
# żeby pracować z formatem xml musimy doinstalować pakiet lxml, korzystając z conda
# pamiętaj żeby zrobić restart kernela
!conda install -y lxml

Channels:
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [19]:
# jeżeli nie chcesz użyć conda możesz też zainstalować lxml używając pip
!pip install --quiet lxml

# Pracujemy z plikami - XML

XML (Extensible Markup Language) to język znaczników używany do przechowywania i przesyłania danych. Definiuje on zestaw zasad kodowania dokumentów w formacie, który jest czytelny zarówno dla człowieka, jak i dla maszyny.

Np.:
```xml
<client>
    <name>John Doe</name>
    <age>30</age>
    <city>New York</city>
</client>
```

## Czytanie pliku XML w Pandas

In [23]:
# Create a small sample XML file with two <client> records to load with pandas.
# The encoding is set explicitly: an XML document without a declaration is
# parsed as UTF-8, while open() would otherwise use the platform's locale encoding.
with open('simple.xml', "w", encoding="utf-8") as f:
    f.write("""
<clients>
    <client>
        <name>John Doe</name>
        <age>25</age>
        <city>San Francisco</city>
    </client>
    <client>
        <name>Jane Doe</name>
        <age>22</age>
        <city>Los Angeles</city>
    </client>
</clients>
""")

In [24]:
# Load the file: each <client> element becomes a row, its child tags the columns.
df = pd.read_xml(path_or_buffer='simple.xml')
df

Unnamed: 0,name,age,city
0,John Doe,25,San Francisco
1,Jane Doe,22,Los Angeles


In [25]:
# Summarize the frame: column names, non-null counts and the dtypes inferred by read_xml.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    2 non-null      object
 1   age     2 non-null      int64 
 2   city    2 non-null      object
dtypes: int64(1), object(2)
memory usage: 180.0+ bytes


## Jak wczytywać daty z pliku XML?

In [26]:
# Create a sample XML file whose records also carry a <date_of_birth> field.
# The encoding is set explicitly: an XML document without a declaration is
# parsed as UTF-8, while open() would otherwise use the platform's locale encoding.
with open('with_dates.xml', "w", encoding="utf-8") as f:
    f.write("""
<clients>
    <client>
        <name>John Doe</name>
        <age>25</age>
        <city>San Francisco</city>
        <date_of_birth>1995-01-01</date_of_birth>
    </client>
    <client>
        <name>Jane Doe</name>
        <age>22</age>
        <city>Los Angeles</city>
        <date_of_birth>1998-01-01</date_of_birth>
    </client>
</clients>
""")

In [27]:
# Load the file with default settings (no date parsing requested).
df = pd.read_xml(path_or_buffer='with_dates.xml')
df

Unnamed: 0,name,age,city,date_of_birth
0,John Doe,25,San Francisco,1995-01-01
1,Jane Doe,22,Los Angeles,1998-01-01


In [28]:
# Check the dtypes: without parse_dates, date_of_birth is stored as object, not as a datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           2 non-null      object
 1   age            2 non-null      int64 
 2   city           2 non-null      object
 3   date_of_birth  2 non-null      object
dtypes: int64(1), object(3)
memory usage: 196.0+ bytes


In [29]:
# Reload the same file, asking pandas to parse date_of_birth as a datetime column.
df = pd.read_xml(
    path_or_buffer='with_dates.xml',
    parse_dates=['date_of_birth'],
)
df

Unnamed: 0,name,age,city,date_of_birth
0,John Doe,25,San Francisco,1995-01-01
1,Jane Doe,22,Los Angeles,1998-01-01


In [30]:
# Confirm that date_of_birth now has a datetime64 dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   name           2 non-null      object        
 1   age            2 non-null      int64         
 2   city           2 non-null      object        
 3   date_of_birth  2 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 196.0+ bytes


In [31]:
# With a datetime column, the .dt accessor exposes date components such as the year.
df["date_of_birth"].dt.year

0    1995
1    1998
Name: date_of_birth, dtype: int64

In [37]:
# Create a sample XML file describing cars (marka = make, model, cena = price).
# The encoding is set explicitly: an XML document without a declaration is
# parsed as UTF-8, while open() would otherwise use the platform's locale encoding.
with open('plik_x.xml', "w", encoding="utf-8") as f:
    f.write("""
<autos>
    <auto>
        <marka>audi</marka>
        <model>q3</model>
        <cena>218000</cena>
    </auto>
    <auto>
        <marka>toyota</marka>
        <model>C-HR</model>
        <cena>120800</cena>
    </auto>
</autos>
""")

In [38]:
# Load the cars file: each <auto> element becomes a row, its child tags the columns.
df = pd.read_xml(path_or_buffer='plik_x.xml')
df

Unnamed: 0,marka,model,cena
0,audi,q3,218000
1,toyota,C-HR,120800
