In [5]:
from pathlib import Path
import pandas as pd

# Pracujemy z plikami - CSV / TSV

## Czym jest CSV?

In [6]:
# Create a minimal comma-separated file: a header row followed by two records.
# The text starts with an empty line; pandas skips blank lines by default.
simple_rows = [
    "",  # leading blank line
    "imie,nazwisko,wiek",
    "Jan,Kowalski,25",
    "Anna,Nowak,37",
]
with open("simple.csv", "w") as f:
    f.writelines(row + "\n" for row in simple_rows)

In [7]:
# Load the file into a DataFrame using pandas' defaults (comma separator,
# first non-blank line used as the header) and display it.
simple_csv = Path("simple.csv")
df = pd.read_csv(simple_csv)
df

Unnamed: 0,imie,nazwisko,wiek
0,Jan,Kowalski,25
1,Anna,Nowak,37


In [10]:
# Write a small car price list (brand, model, price).
# The CSV text is deliberately flush-left: everything inside the triple-quoted
# string is written verbatim, so indenting it would put leading spaces into the
# header and the first field of every row (yielding a column named "    marka").
# Spaces after the commas were removed for the same reason.
# NOTE(review): "Hiunday" looks like a misspelling of "Hyundai"; left unchanged
# so the data matches the outputs recorded in this notebook - confirm.
with open("lepszy.csv", "w") as g:
    g.write("""
marka,model,cena
Audi,Q3,185000
Toyota,C-HR,125000
Ford,Puma,119000
Skoda,Kamiq,113000
Hiunday,Kona,121000
""")

In [11]:
# Read the car price list.
# skipinitialspace=True makes the parser ignore spaces at the start of a field,
# so indentation or spaces after commas in the file do not leak into column
# names or values (e.g. the header is "marka", not "    marka").
df1 = pd.read_csv('lepszy.csv', skipinitialspace=True)
df1

Unnamed: 0,marka,model,cena
0,Audi,Q3,185000
1,Toyota,C-HR,125000
2,Ford,Puma,119000
3,Skoda,Kamiq,113000
4,Hiunday,Kona,121000


In [12]:
# Print a summary of df1: index range, column names, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0       marka  5 non-null      object
 1   model      5 non-null      object
 2   cena       5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


In [13]:
# Derive the 'rata' column from the price: cena divided by 60, scaled by 1.19.
# NOTE(review): presumably 60 is the number of instalments and 1.19 a markup
# factor - confirm the intended meaning of both constants.
INSTALLMENT_COUNT = 60
PRICE_MULTIPLIER = 1.19

# 'cena' is already read as int64, so no numeric conversion is needed first.
df1 = df1.assign(rata=(df1["cena"] / INSTALLMENT_COUNT) * PRICE_MULTIPLIER)
df1

Unnamed: 0,marka,model,cena,rata
0,Audi,Q3,185000,3669.166667
1,Toyota,C-HR,125000,2479.166667
2,Ford,Puma,119000,2360.166667
3,Skoda,Kamiq,113000,2241.166667
4,Hiunday,Kona,121000,2399.833333


In [14]:
# Save the frame to a new CSV file; index=False leaves the row index out of the file.
df1.to_csv(path_or_buf="lepszy1.csv", index=False)

In [15]:
# Read the saved file back (rebinding df1) and display the result.
lepszy1_csv = Path("lepszy1.csv")
df1 = pd.read_csv(lepszy1_csv)
df1

Unnamed: 0,marka,model,cena,rata
0,Audi,Q3,185000,3669.166667
1,Toyota,C-HR,125000,2479.166667
2,Ford,Puma,119000,2360.166667
3,Skoda,Kamiq,113000,2241.166667
4,Hiunday,Kona,121000,2399.833333


In [16]:
# Two records written without a header row (the text starts with an empty line).
no_header_text = "\n".join(["", "Jan,Kowalski,25", "Anna,Nowak,37", ""])
with open("no_header.csv", "w") as f:
    f.write(no_header_text)

In [17]:
# header=None tells pandas the file has no header row, so the first line is
# treated as data and the columns are numbered 0, 1, 2.
no_header_csv = Path("no_header.csv")
df = pd.read_csv(no_header_csv, header=None)
df

Unnamed: 0,0,1,2
0,Jan,Kowalski,25
1,Anna,Nowak,37


In [18]:
# Same header-less file, this time supplying our own column names.
column_names = ["name", "surname", "age"]
df = pd.read_csv("no_header.csv", names=column_names, header=None)
df

Unnamed: 0,name,surname,age
0,Jan,Kowalski,25
1,Anna,Nowak,37


In [19]:
# Write a file whose fields are separated by semicolons instead of commas.
semicolon_records = [
    ("imie", "nazwisko", "wiek"),
    ("Jan", "Kowalski", "25"),
    ("Anna", "Nowak", "35"),
]
with open("semicolon.csv", "w") as f:
    f.write("\n")  # the file starts with an empty line
    for record in semicolon_records:
        f.write(";".join(record) + "\n")

In [20]:
# The separator must be given explicitly because the default is a comma.
FIELD_SEPARATOR = ";"
df = pd.read_csv("semicolon.csv", sep=FIELD_SEPARATOR)
df

Unnamed: 0,imie,nazwisko,wiek
0,Jan,Kowalski,25
1,Anna,Nowak,35


In [21]:
# Deliberately malformed example: the first record contains an unquoted comma
# inside the surname, so it has four fields while the header has three.
broken_text = (
    "\n"
    "imie,nazwisko,wiek\n"
    "Jan,Kowalski, Junior,25\n"
    "Anna,Nowak,35\n"
)
with open("broken_because_of_comma.csv", "w") as f:
    f.write(broken_text)

In [22]:
# Read the malformed file with the (default) comma separator to show how the
# extra, unquoted comma misaligns the parsed table.
broken_csv = Path("broken_because_of_comma.csv")
df = pd.read_csv(broken_csv, sep=",")
df

Unnamed: 0,imie,nazwisko,wiek
Jan,Kowalski,Junior,25.0
Anna,Nowak,35,


In [23]:
# Corrected example: the surname containing a comma is wrapped in double
# quotes, so the comma is part of the value rather than a field separator.
fixed_rows = (
    "",  # leading blank line
    "imie,nazwisko,wiek",
    'Jan,"Kowalski, Junior",25',
    "Anna,Nowak,35",
)
with open("fixed_because_of_comma.csv", "w") as f:
    f.writelines(row + "\n" for row in fixed_rows)

In [24]:
# With the value quoted, the default settings parse the file into three columns.
fixed_csv = Path("fixed_because_of_comma.csv")
df = pd.read_csv(fixed_csv)
df

Unnamed: 0,imie,nazwisko,wiek
0,Jan,"Kowalski, Junior",25
1,Anna,Nowak,35


## Jak wczytywać daty z pliku CSV?

In [25]:
# Sample file with two date-like columns: a plain date ("data urodzenia",
# whose header name is quoted) and a date-time with a "T" separator ("wizyta").
dated_lines = (
    "",  # leading blank line
    'imie,nazwisko,wiek,"data urodzenia",wizyta',
    "Jan,Kowalski,25,1985-01-01,2024-01-01T12:00:00",
    "Anna,Nowak,35,1954-02-26,2024-03-07T12:00:00",
)
with open("with_dates.csv", "w") as f:
    for line in dated_lines:
        print(line, file=f)  # print appends the newline after each line

In [26]:
# Read the file with default settings, i.e. without asking pandas to parse dates.
dates_csv = Path("with_dates.csv")
df = pd.read_csv(dates_csv)
df

Unnamed: 0,imie,nazwisko,wiek,data urodzenia,wizyta
0,Jan,Kowalski,25,1985-01-01,2024-01-01T12:00:00
1,Anna,Nowak,35,1954-02-26,2024-03-07T12:00:00


In [27]:
# Print the column summary of df to check which dtypes the date columns received.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   imie            2 non-null      object
 1   nazwisko        2 non-null      object
 2   wiek            2 non-null      int64 
 3   data urodzenia  2 non-null      object
 4   wizyta          2 non-null      object
dtypes: int64(1), object(4)
memory usage: 212.0+ bytes


In [28]:
# Ask pandas to convert the listed columns to datetime values while reading.
date_columns = ["data urodzenia", "wizyta"]
df = pd.read_csv("with_dates.csv", parse_dates=date_columns)
df

Unnamed: 0,imie,nazwisko,wiek,data urodzenia,wizyta
0,Jan,Kowalski,25,1985-01-01,2024-01-01 12:00:00
1,Anna,Nowak,35,1954-02-26,2024-03-07 12:00:00


In [29]:
# Print the column summary again to check the dtypes after reading with parse_dates.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   imie            2 non-null      object        
 1   nazwisko        2 non-null      object        
 2   wiek            2 non-null      int64         
 3   data urodzenia  2 non-null      datetime64[ns]
 4   wizyta          2 non-null      datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 212.0+ bytes


In [30]:
# The .dt accessor exposes datetime components; here, the year of each birth date.
birth_dates = df["data urodzenia"]
birth_dates.dt.year

0    1985
1    1954
Name: data urodzenia, dtype: int64

## Wczytywanie CSV ze ścieżki 

In [24]:
# Imagine the data file lives somewhere in a directory structure.
# DATA_PATH holds the path of the directory containing the data
# (here simply the current working directory, as an absolute path).
DATA_PATH = Path.cwd()

# Build the full file path with the "/" operator and load the data.
dates_file = DATA_PATH / "with_dates.csv"
date_columns = ["data urodzenia", "wizyta"]
df = pd.read_csv(dates_file, parse_dates=date_columns)
df


Unnamed: 0,imie,nazwisko,wiek,data urodzenia,wizyta
0,Jan,Kowalski,25,1985-01-01,2024-01-01 12:00:00
1,Anna,Nowak,35,1954-02-26,2024-03-07 12:00:00


## Czym jest TSV?

In [25]:
# Write a tab-separated file: fields within a record are joined with a tab.
tsv_records = [
    ["imie", "nazwisko", "wiek"],
    ["Jan", "Kowalski", "25"],
    ["Anna", "Nowak", "35"],
]
# The text starts with an empty line, followed by one line per record.
tsv_text = "\n" + "".join("\t".join(fields) + "\n" for fields in tsv_records)
with open("simple.tsv", "w") as f:
    f.write(tsv_text)

In [26]:
# A TSV file is read with read_csv as well; only the delimiter changes to a tab.
TAB = "\t"
df = pd.read_csv("simple.tsv", delimiter=TAB)
df

Unnamed: 0,imie,nazwisko,wiek
0,Jan,Kowalski,25
1,Anna,Nowak,35
