In [1]:
import pandas as pd

### Pandas Data Structures
- Series
- DataFrame

I DataFrame sono composti da Series: ogni colonna di un DataFrame è una Series.

Pandas è costruito su NumPy: le sue strutture dati sono implementate a partire dagli array NumPy.

### Series

Una Series è un vettore unidimensionale in cui ogni elemento è associato a un'etichetta (l'indice).

In [2]:
# Build a Series from a list of mixed-type values; pandas assigns the
# default integer index and the values are stored with dtype object.
s = pd.Series(
    data=[7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
)
s

0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object

In [3]:
# Same kind of Series, but with explicit string labels as the index.
s = pd.Series(
    data=[7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
    index=list('AZCYE'),
)
s

A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object

In [4]:
# Mapping name -> value, used below to build a labelled Series.
d = dict(Davide=100, Umberto=200, Riccardo=300, Carl=400)

In [5]:
# A dict becomes a Series: keys are the index labels, values are the data.
pd.Series(data=d)

Davide      100
Umberto     200
Riccardo    300
Carl        400
dtype: int64

In [6]:
# Keep the dict-based Series under a name for the examples that follow.
nomi = pd.Series(data=d)

In [7]:
# Subsetting: select one element of the Series by its label.
nomi.loc['Davide']

100

In [8]:
# Select several elements at once by passing a list of labels.
nomi.loc[['Davide', 'Umberto']]

Davide     100
Umberto    200
dtype: int64

In [9]:
# Boolean subsetting: keep only the elements whose value is below 250.
nomi.loc[nomi < 250]

Davide     100
Umberto    200
dtype: int64

In [10]:
# The comparison alone yields a boolean Series (the mask used for subsetting).
nomi.lt(250)

Davide       True
Umberto      True
Riccardo    False
Carl        False
dtype: bool

In [11]:
# A plain list of booleans (one per element) works as a mask too.
nomi.loc[[True, True, False, False]]

Davide     100
Umberto    200
dtype: int64

In [13]:
# Length of a Series (number of elements).
len(nomi)

4

In [14]:
# Assign a new value to an existing element, addressed by its label.
nomi.loc['Davide'] = 1000
nomi

Davide      1000
Umberto      200
Riccardo     300
Carl         400
dtype: int64

In [15]:
# Arithmetic on a Series is applied element by element.
nomi.div(10)

Davide      100.0
Umberto      20.0
Riccardo     30.0
Carl         40.0
dtype: float64

In [16]:
# Unlike a Series, a plain Python list does not support element-wise
# arithmetic: dividing it by a number raises TypeError. The error is the
# point of this cell, so it is caught and printed instead of being left
# uncaught, which would stop a "Restart & Run All" at this cell.
l1 = [10, 100, 1000]
try:
    l1 / 10
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [17]:
# Series of full names (strings) with the default integer index.
string_series = pd.Series(
    data=[
        "Davide Posillipo",
        "Umberto Cangelosi",
        "Riccardo Convertino",
        "Carl Maria Caldura",
    ],
)

In [18]:
# Check the Python type of the object: it is a pandas Series.
type(string_series)

pandas.core.series.Series

In [19]:
# Vectorised string method: replace every space with an underscore in each
# element. regex=False makes the literal (non-regex) replacement explicit,
# because the default value of `regex` differs between pandas versions.
string_series.str.replace(" ", "_", regex=False)

0       Davide_Posillipo
1      Umberto_Cangelosi
2    Riccardo_Convertino
3     Carl_Maria_Caldura
dtype: object

## DataFrame

In [20]:
# Toy dataset: one row per team and year, with that season's wins and losses.
data = {
    'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
    'wins': [11, 8, 10, 15, 11, 6, 10, 4],
    'losses': [5, 8, 6, 1, 5, 10, 6, 12],
}
# `columns` fixes the column order of the resulting DataFrame explicitly.
football = pd.DataFrame(data=data, columns=['year', 'team', 'wins', 'losses'])
football

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12


In [21]:
# A single column of a DataFrame is itself a Series.
type(football['year'])

pandas.core.series.Series

In [22]:
# (rows, columns): a DataFrame is two-dimensional (ndim=2), like a NumPy matrix.
football.shape

(8, 4)

### Utilities per i DataFrame

In [23]:
# Print a summary of the frame: index range, columns, non-null counts,
# dtypes and memory usage.
football.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    8 non-null      int64 
 1   team    8 non-null      object
 2   wins    8 non-null      int64 
 3   losses  8 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 384.0+ bytes


In [24]:
# Data type of each column.
football.dtypes

year       int64
team      object
wins       int64
losses     int64
dtype: object

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max);
# only the numeric columns are included.
football.describe()
# Background on quantiles: https://it.wikipedia.org/wiki/Quantile

Unnamed: 0,year,wins,losses
count,8.0,8.0,8.0
mean,2011.125,9.375,6.625
std,0.834523,3.377975,3.377975
min,2010.0,4.0,1.0
25%,2010.75,7.5,5.0
50%,2011.0,10.0,6.0
75%,2012.0,11.0,8.5
max,2012.0,15.0,12.0


In [27]:
# First two rows of the frame.
football.head(n=2)

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8


In [28]:
# Last two rows of the frame.
football.tail(n=2)

Unnamed: 0,year,team,wins,losses
6,2011,Lions,10,6
7,2012,Lions,4,12


### Selezione da un DataFrame

In [29]:
# Row slicing by position: rows 2 and 3 (the end of the slice is excluded).
# .iloc makes it explicit that the slice is positional, not label-based.
football.iloc[2:4]

Unnamed: 0,year,team,wins,losses
2,2012,Bears,10,6
3,2011,Packers,15,1


In [32]:
# Boolean selection: keep only the rows with fewer than 10 wins.
football.loc[football.wins < 10]

Unnamed: 0,year,team,wins,losses
1,2011,Bears,8,8
5,2010,Lions,6,10
7,2012,Lions,4,12


In [42]:
# Boolean row filter plus column selection in a single .loc call.
# This avoids chained indexing (df[mask][cols]), which first builds an
# intermediate DataFrame and then indexes it a second time.
football.loc[football.wins < 10, ['team', 'wins']]

Unnamed: 0,team,wins
1,Bears,8
5,Lions,6
7,Lions,4


In [43]:
# Boolean row filter plus column selection in a single .loc call; here the
# filter column is accessed with bracket notation (football['wins']), which
# also works for column names that are not valid Python identifiers.
football.loc[football['wins'] < 10, ['team', 'wins']]

Unnamed: 0,team,wins
1,Bears,8
5,Lions,6
7,Lions,4
