In [1]:
import pandas as pd

### Data Structures in Pandas
- Series
- DataFrame

Entrambe le data structures sono costruite usando NumPy.

#### Series

È un array monodimensionale (one-dimensional) i cui elementi sono associati a un indice di etichette.

In [2]:
# Build a Series from a list mixing ints, floats and strings.
# With heterogeneous values the resulting dtype is `object`, and the
# index defaults to the positions 0..4.
s = pd.Series(
    data=[7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
)
s

0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object

In [3]:
# Customize the index: each value gets an explicit label instead of
# the default integer position.
s = pd.Series(
    data=[7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
    index=list('AZCYE'),
)
s

A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object

In [4]:
# Build a Series from a dict: the keys become the index labels and
# the values become the data.
d = dict(Davide=100, Alessio=1000, Luca=10000)
nomi = pd.Series(data=d)
nomi

Davide       100
Alessio     1000
Luca       10000
dtype: int64

In [5]:
# Select a single value from the Series by its label (subsetting)
nomi.loc['Davide']

100

In [6]:
# A list of labels selects several elements at once and returns a Series
nomi.loc[['Davide', 'Alessio']]

Davide      100
Alessio    1000
dtype: int64

In [16]:
# Boolean subsetting: keep only the elements whose value is below 1000
nomi.loc[nomi.lt(1000)]

Davide    100
dtype: int64

In [12]:
# A plain list of booleans works as a mask too: positions marked True are kept
nomi.loc[[True, False, True]]

Davide      100
Luca      10000
dtype: int64

In [10]:
# The comparison is evaluated element by element and yields a boolean Series
nomi.lt(1000)

Davide      True
Alessio    False
Luca       False
dtype: bool

In [9]:
# The result of an element-wise comparison is itself a pandas Series
type(nomi.lt(1000))

pandas.core.series.Series

In [17]:
# Assign a new value to one element of the Series, addressed by its label
nomi.loc['Davide'] = 3000
nomi

Davide      3000
Alessio     1000
Luca       10000
dtype: int64

In [18]:
# Masked assignment: every element greater than 9000 is overwritten with 20
nomi.loc[nomi.gt(9000)] = 20
nomi

Davide     3000
Alessio    1000
Luca         20
dtype: int64

In [19]:
# Arithmetic operations are applied element by element
nomi.mul(20)

Davide     60000
Alessio    20000
Luca         400
dtype: int64

In [20]:
# Division is element-wise as well; the result is a float64 Series
nomi.div(10)

Davide     300.0
Alessio    100.0
Luca         2.0
dtype: float64

In [21]:
# Contrast with a plain Python list: it does not support element-wise
# arithmetic, so dividing it by a number raises TypeError.
# The error is the point of this cell, so it is caught and printed instead
# of interrupting a "Restart Kernel -> Run All".
l = [20, 30, 40]
try:
    l / 10
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [26]:
# Series of strings (full names) with the default integer index
string_series = pd.Series(
    data=["Davide Posillipo", "Alessio Spagnolo", "Paolo Carlevero"],
)

In [25]:
# Check the type of the object: a Series of strings is still a pandas Series
type(string_series)

pandas.core.series.Series

In [27]:
# Replace spaces with underscores in every element through the vectorized
# `.str` accessor. regex=False makes the literal (non-regex) match explicit,
# so the result does not depend on the pandas version's default for `regex`.
string_series.str.replace(" ", "_", regex=False)

0    Davide_Posillipo
1    Alessio_Spagnolo
2     Paolo_Carlevero
dtype: object

In [28]:
# Serialize the Series to a JSON string; each index label is mapped to its value
string_series.to_json()

'{"0":"Davide Posillipo","1":"Alessio Spagnolo","2":"Paolo Carlevero"}'

## DataFrames

In [37]:
# Column-oriented input for the DataFrame: one key per column, each mapped
# to a list of 8 values (one per row).
data = {
    "year": [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    "team": [
        "Bears", "Bears", "Bears",
        "Packers", "Packers",
        "Lions", "Lions", "Lions",
    ],
    "wins": [11, 8, 10, 15, 11, 6, 10, 4],
    "losses": [5, 8, 6, 1, 5, 10, 6, 12],
    "bool": [True, False, True, False, True, False, True, False],
}

In [38]:
# Build the DataFrame from the dict of columns defined above
football = pd.DataFrame(data=data)

In [39]:
# Display the whole frame (it has only 8 rows)
football

Unnamed: 0,year,team,wins,losses,bool
0,2010,Bears,11,5,True
1,2011,Bears,8,8,False
2,2012,Bears,10,6,True
3,2011,Packers,15,1,False
4,2012,Packers,11,5,True
5,2010,Lions,6,10,False
6,2011,Lions,10,6,True
7,2012,Lions,4,12,False


In [40]:
# Check the type of the object: a pandas DataFrame
type(football)

pandas.core.frame.DataFrame

In [41]:
# A single column extracted from a DataFrame is a pandas Series
type(football['wins'])

pandas.core.series.Series

In [42]:
# Select a column with bracket notation
football['wins']

0    11
1     8
2    10
3    15
4    11
5     6
6    10
7     4
Name: wins, dtype: int64

In [43]:
# Attribute-style access returns the same column as football['wins'].
# NOTE(review): per the pandas indexing docs this shortcut is only available when
# the column name is a valid identifier and does not clash with an existing
# DataFrame attribute or method.
football.wins

0    11
1     8
2    10
3    15
4    11
5     6
6    10
7     4
Name: wins, dtype: int64

### Utilities

In [44]:
# Summary of the frame: index range, per-column non-null counts and dtypes, memory usage
football.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    8 non-null      int64 
 1   team    8 non-null      object
 2   wins    8 non-null      int64 
 3   losses  8 non-null      int64 
 4   bool    8 non-null      bool  
dtypes: bool(1), int64(3), object(1)
memory usage: 392.0+ bytes


In [46]:
# Requesting the dedicated string dtype explicitly (instead of the generic
# `object` dtype) when building a Series of text values
pd.Series(data=list("abc"), dtype=pd.StringDtype())

0    a
1    b
2    c
dtype: string

In [47]:
# Cast the `team` column to the dedicated string dtype; the dict maps
# column name -> target dtype, and columns not listed keep their dtype.
football = football.astype(
    dtype={"team": pd.StringDtype()},
)

In [48]:
# Check the cast: `team` is now reported with dtype string instead of object
football.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    8 non-null      int64 
 1   team    8 non-null      string
 2   wins    8 non-null      int64 
 3   losses  8 non-null      int64 
 4   bool    8 non-null      bool  
dtypes: bool(1), int64(3), string(1)
memory usage: 392.0 bytes


In [49]:
# Second frame built from the same dict, used to show another way of casting a column
football_2 = pd.DataFrame(data=data)

In [53]:
# Cast a single column by converting it and assigning it back.
# Bracket notation is used because attribute-style assignment only works for
# columns that already exist: on a new name it would set a plain Python
# attribute on the object instead of creating a column.
football_2["team"] = football_2["team"].astype(pd.StringDtype())

In [55]:
# Check the cast on football_2: `team` is reported with dtype string here too
football_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    8 non-null      int64 
 1   team    8 non-null      string
 2   wins    8 non-null      int64 
 3   losses  8 non-null      int64 
 4   bool    8 non-null      bool  
dtypes: bool(1), int64(3), string(1)
memory usage: 392.0 bytes
