# Pandas

# 1 - Series

In [1]:
import numpy as np
import pandas as pd

pd.Series([0.3, 0.5, 0.8])

0    0.3
1    0.5
2    0.8
dtype: float64

In [2]:
pd.Series([0.3, 0.5, 0.8], index=['3-July', '4-July', '5-July'] )

3-July    0.3
4-July    0.5
5-July    0.8
dtype: float64

<p><img src="./img/p1.png" alt="" width="300" height="100" /></p>

In [3]:
# Create a Series from a dictionary (the keys become the index labels)

city_dict = {'Latitude': 40.47, 'Longitude': 17.24, 'Elevation': 28.2}
pd.Series(city_dict)

Latitude     40.47
Longitude    17.24
Elevation    28.20
dtype: float64

In [4]:
# Create a Series from a list of values and an explicit index

s1 = pd.Series([2.0, 3.1, 4.5, 8.7, 1.8], index=['mon', 'tue', 'wed', 'thu', 'fri'])
print(s1.values)  # the data, as a NumPy array
print(s1.index)   # the labels, as an Index object

[2.  3.1 4.5 8.7 1.8]
Index(['mon', 'tue', 'wed', 'thu', 'fri'], dtype='object')


In [5]:
s1

mon    2.0
tue    3.1
wed    4.5
thu    8.7
fri    1.8
dtype: float64

### Indexing

In [6]:
s1.loc['tue'] # explicit indexing

3.1

In [7]:
s1.loc['tue'] = 5.8 # inplace value assignment

s1['tue'] # alternative explicit indexing

5.8

In [8]:
s1.iloc[2] # implicit indexing

4.5

### Slicing

In [9]:
s1.loc['mon':'wed'] # explicit slicing (stop element included)

mon    2.0
tue    5.8
wed    4.5
dtype: float64

In [10]:
s1.iloc[1:4] # implicit slicing (stop element excluded)

tue    5.8
wed    4.5
thu    8.7
dtype: float64

### Masking

In [11]:
mask = (s1 > 2) & (s1<9)
mask

mon    False
tue     True
wed     True
thu     True
fri    False
dtype: bool

In [12]:
s1[mask]

tue    5.8
wed    4.5
thu    8.7
dtype: float64

In [13]:
mask = (s1.index == 'mon') | (s1.index == 'fri')
s1[mask]

mon    2.0
fri    1.8
dtype: float64

### Fancy indexing

In [14]:
s1.loc[['fri', 'mon']]

fri    1.8
mon    2.0
dtype: float64

In [15]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
s1.loc[['wed', 'fri']] = np.nan  # fancy-index assignment modifies s1 in place
s1

mon    2.0
tue    5.8
wed    NaN
thu    8.7
fri    NaN
dtype: float64

# 2 - DataFrame

### Creation from a 2D Numpy array

<p><img src="./img/p2.png" alt="" width="280" height="100" /></p>

In [16]:
arr = np.array([[1.0, 5, 1.5],
                [1.4, 10, 0.3],
                [5, 8, 11]])
col = ['Price', 'Quantity', 'Liters']
idx = ['Water', 'Beer', 'Wine']

df = pd.DataFrame(arr, index = idx, columns = col)
df

Unnamed: 0,Price,Quantity,Liters
Water,1.0,5.0,1.5
Beer,1.4,10.0,0.3
Wine,5.0,8.0,11.0


### Creation from Pandas series

In [17]:
# Three Series sharing the same product labels; each one becomes a column
products = ['cap', 'shirt', 'hoodie']
price = pd.Series([8.99, 29.99, 49.99], index=products)
quantity = pd.Series([2, 2, 1], index=products)
stock = pd.Series([12, 120, 54], index=products)

# The dictionary keys become the column names, the Series index the row labels
series_by_column = {'Price': price, 'Quantity': quantity, 'Stock': stock}
rec_df = pd.DataFrame(series_by_column)
rec_df

Unnamed: 0,Price,Quantity,Stock
cap,8.99,2,12
shirt,29.99,2,120
hoodie,49.99,1,54


### Creation from dictionaries

In [18]:
# One record (dict) per integer 0..5, holding its first, second and third power
power_keys = ('c1', 'c2', 'c3')
dic_list = [dict(zip(power_keys, (n, n**2, n**3))) for n in range(6)]
dic_list

[{'c1': 0, 'c2': 0, 'c3': 0},
 {'c1': 1, 'c2': 1, 'c3': 1},
 {'c1': 2, 'c2': 4, 'c3': 8},
 {'c1': 3, 'c2': 9, 'c3': 27},
 {'c1': 4, 'c2': 16, 'c3': 64},
 {'c1': 5, 'c2': 25, 'c3': 125}]

In [19]:
el_df = pd.DataFrame(dic_list)
el_df

Unnamed: 0,c1,c2,c3
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64
5,5,25,125


### Import from File

<p>It is possible to create a DataFrame reading a <strong>csv</strong> or <strong>json</strong> file. For the csv, in many cases it is important to specify the separator character, the number of rows to be skipped at the beginning of the file and all the possible null values.</p>

In [20]:
city_meteo = pd.read_json('./data/p1.json', orient ='records')

city_meteo

Unnamed: 0,Temperature,Humidity,Wind
MI,15.3,52.0,3.5
TO,16.5,,2.5
,22.0,54.0,


In [21]:
stock_2019 = pd.read_csv('./data/stock.csv', 
                         sep = ',', 
                         skiprows=1,
                         index_col = 0,
                         na_values = ['none', 'null'])
stock_2019

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


<p>In the same way it is possible to write a Pandas DataFrame to a file. Several formats are supported (csv, json, HTML, HDF5, SAS, ...)</p>

In [22]:
city_meteo.to_csv('./data/meteo.csv', sep=',', index = True)

In [23]:
stock_2019.to_json('./data/stock.json')

## Accessing a DataFrame

In [24]:
stock_2019

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


In [25]:
stock_2019.values # Returns a NumPy array with the data

array([[ 166.69,  193.  ,  178.08,  205.17],
       [  47.49,   49.48,     nan,   73.41],
       [1780.75, 1893.63, 1735.91, 1847.84],
       [ 356.56,  367.32,  267.62,  323.57],
       [1173.31, 1080.91, 1219.  , 1337.02]])

### Accessing a column

In [26]:
stock_2019['Q2'] # Returns a Series with column data

Code
FB       193.00
AAPL      49.48
AMZN    1893.63
NFLX     367.32
GOOG    1080.91
Name: Q2, dtype: float64

In [27]:
stock_2019.Q3

Code
FB       178.08
AAPL        NaN
AMZN    1735.91
NFLX     267.62
GOOG    1219.00
Name: Q3, dtype: float64

### Accessing a Row

In [28]:
stock_2019.loc['GOOG'] # explicit indexing

Q1    1173.31
Q2    1080.91
Q3    1219.00
Q4    1337.02
Name: GOOG, dtype: float64

In [29]:
stock_2019.iloc[2] # implicit indexing

Q1    1780.75
Q2    1893.63
Q3    1735.91
Q4    1847.84
Name: AMZN, dtype: float64

### Slicing

In [30]:
stock_2019.loc['AAPL':'NFLX','Q2':'Q3'] # explicit indexing

Unnamed: 0_level_0,Q2,Q3
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,49.48,
AMZN,1893.63,1735.91
NFLX,367.32,267.62


In [31]:
stock_2019.iloc[0:2, :2] # implicit indexing

Unnamed: 0_level_0,Q1,Q2
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,193.0
AAPL,47.49,49.48


### Masking

In [32]:
mask = (stock_2019['Q2'] < 1000) & (stock_2019['Q4'] > 100)
mask

Code
FB       True
AAPL    False
AMZN    False
NFLX     True
GOOG    False
dtype: bool

In [33]:
stock_2019.loc[mask, 'Q3':] # masking + slicing

Unnamed: 0_level_0,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,178.08,205.17
NFLX,267.62,323.57


### Query

In [34]:
stock_2019.query('Q2<1000 and Q4>100')

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
NFLX,356.56,367.32,267.62,323.57


### Fancy Indexing

In [35]:
stock_2019.loc[['FB', 'GOOG'], ['Q1', 'Q4']]

Unnamed: 0_level_0,Q1,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,205.17
GOOG,1173.31,1337.02


In [36]:
mask = (stock_2019['Q2'] < 1000) & (stock_2019['Q4'] > 100)

stock_2019.loc[mask, ['Q1', 'Q3']] # masking + fancy

Unnamed: 0_level_0,Q1,Q3
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,178.08
NFLX,356.56,267.62


## Modify a DataFrame

### Assign a value

In [37]:
market_df = pd.read_csv('./data/market_data.csv', header=0, sep=',', index_col=0)
market_df

Unnamed: 0_level_0,quantity,price,tax,available
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,155,29.99,10,True
b,78,19.99,15,False
c,120,50.0,20,True
d,3,1189.99,20,True
e,85,139.99,5,False
f,42,24.99,15,True


In [38]:
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0
market_df.loc[['a', 'd'], ['tax', 'quantity']] = np.nan  # label-based (fancy) assignment
market_df.iloc[1:3, 2:3] = 0  # position-based assignment: rows 1-2 of column 2 ('tax')

market_df # DataFrame modified inplace

Unnamed: 0_level_0,quantity,price,tax,available
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,,29.99,,True
b,78.0,19.99,0.0,False
c,120.0,50.0,0.0,True
d,,1189.99,,True
e,85.0,139.99,5.0,False
f,42.0,24.99,15.0,True


### Adding new columns

In [39]:
# replace an existing column inplace
market_df['tax'] = pd.Series([10, 10, 20, 20, 15, 10],
                            index = ['a', 'b', 'c', 'd', 'e', 'f'])
market_df

Unnamed: 0_level_0,quantity,price,tax,available
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,,29.99,10,True
b,78.0,19.99,10,False
c,120.0,50.0,20,True
d,,1189.99,20,True
e,85.0,139.99,15,False
f,42.0,24.99,10,True


In [40]:
# add a new column from a list inplace
market_df['new'] = [True, False, False, False, True, True]

market_df

Unnamed: 0_level_0,quantity,price,tax,available,new
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,,29.99,10,True,True
b,78.0,19.99,10,False,False
c,120.0,50.0,20,True,False
d,,1189.99,20,True,False
e,85.0,139.99,15,False,True
f,42.0,24.99,10,True,True


### Rename columns

In [41]:
# use a dictionary which maps old names with new names
rename_dict = {'quantity' : 'Q',
               'price': 'P',
               'tax': 'T',
               'available': 'A',
               'new': 'N'}

# returns a copy of the DataFrame
market_df.rename(columns = rename_dict)

Unnamed: 0_level_0,Q,P,T,A,N
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,,29.99,10,True,True
b,78.0,19.99,10,False,False
c,120.0,50.0,20,True,False
d,,1189.99,20,True,False
e,85.0,139.99,15,False,True
f,42.0,24.99,10,True,True


### Drop Columns

In [42]:
# returns a copy of the updated dataframe
market_df.drop(columns=['price', 'tax'])

Unnamed: 0_level_0,quantity,available,new
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,,True,True
b,78.0,False,False
c,120.0,True,False
d,,True,False
e,85.0,False,True
f,42.0,True,True


# 3 - Computation with Pandas

## Operations

<ul>
<li>Unary Operations on Series and DataFrames<br />
<ul>
<li>They work with any <strong>Numpy</strong> ufunc<sup>*</sup></li>
<li>The operation is applied to each element of the Series / DataFrame</li>
</ul>
</li>
<li>Operation between Series and DataFrames
<ul>
<li>Operations are performed <strong>element-wise</strong>, taking into account the indexes and columns</li>
<li>If the index and/or column do not match the result is set to <strong>NaN</strong></li>
<li>It is possible to perform operations between a DataFrame and a Series, with the same rules that apply for Numpy <strong>broadcasting</strong></li>
</ul>
</li>
<li>Aggregation operations
<ul>
<li>It is possible to <strong>aggregate</strong> over Series and DataFrame</li>
<li>Typical operations are mean, std, min, max and sum</li>
<li>For DF the aggregation is performed column-wise and <strong>returns a series</strong></li>
</ul>
</li>
</ul>
<p><sub><em><sup>*</sup>A universal function (or ufunc for short) is a function that operates on ndarrays in an element-by-element fashion, supporting array broadcasting, type casting, and several other standard features. That is, a ufunc is a &ldquo;vectorized&rdquo; wrapper for a function that takes a fixed number of specific inputs and produces a fixed number of specific outputs.</em></sub></p>

### Unary operations on Series

In [43]:
s1 = pd.Series([10, 15, 20], index = ['a', 'b', 'c'])

s1 ** 2

a    100
b    225
c    400
dtype: int64

### Unary operations on DataFrames

In [44]:
stock_2019 * 2

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,333.38,386.0,356.16,410.34
AAPL,94.98,98.96,,146.82
AMZN,3561.5,3787.26,3471.82,3695.68
NFLX,713.12,734.64,535.24,647.14
GOOG,2346.62,2161.82,2438.0,2674.04


### Operations between Series

In [45]:
s1 = pd.Series([10, 15, 10], index = ['Jan', 'Feb', 'Mar'])
s2 = pd.Series([10, 12, 20], index = ['Jan', 'Mar', 'Apr'])

s1 + s2

Apr     NaN
Feb     NaN
Jan    20.0
Mar    22.0
dtype: float64

### Operations between DataFrames

In [46]:
dict1 = {'Total': {'a': 1, 'b': 3, 'c': 5},
         'Quantity': {'a': 2, 'b': 4,'c': 6}}

df1 = pd.DataFrame(dict1)
df1

Unnamed: 0,Total,Quantity
a,1,2
b,3,4
c,5,6


In [47]:
dict2 = {'Total': {'a': 3, 'b': 4, 'c': 2, 'd':6},
         'Price': {'a': 2, 'b': 5, 'c': 5, 'd':4}}

df2 = pd.DataFrame(dict2)
df2

Unnamed: 0,Total,Price
a,3,2
b,4,5
c,2,5
d,6,4


In [48]:
df1 + df2

Unnamed: 0,Price,Quantity,Total
a,,,4.0
b,,,7.0
c,,,7.0
d,,,


### Operations between DataFrames and Series

<p>The operation is applied between the Series and each&nbsp;<strong>row</strong> of the DataFrame following&nbsp;<strong>broadcasting</strong> rules.</p>

In [49]:
temp_dict = {'TO': {'Jan': 0.5, 'Feb': -2, 'Mar': 5, 'Apr':9},
             'MI': {'Jan': 1, 'Feb': -1, 'Mar': 7, 'Apr':12},
             'NA': {'Jan': 6, 'Feb': 5, 'Mar': 12, 'Apr':15}}

temp_df = pd.DataFrame(temp_dict)
temp_df

Unnamed: 0,TO,MI,NA
Jan,0.5,1,6
Feb,-2.0,-1,5
Mar,5.0,7,12
Apr,9.0,12,15


In [50]:
offset = pd.Series([0.5, -1.2, 0.3], index = temp_df.columns)
offset

TO    0.5
MI   -1.2
NA    0.3
dtype: float64

In [51]:
temp_df + offset

Unnamed: 0,TO,MI,NA
Jan,1.0,-0.2,6.3
Feb,-1.5,-2.2,5.3
Mar,5.5,5.8,12.3
Apr,9.5,10.8,15.3


### Aggregation over Series and DataFrames

In [52]:
temp_df.std() # default axis = rows

TO    4.871259
MI    5.909033
NA    4.795832
dtype: float64

In [53]:
temp_df.mean(axis = 'columns') 

Jan     2.500000
Feb     0.666667
Mar     8.000000
Apr    12.000000
dtype: float64

In [54]:
# Z-score normalization
(temp_df - temp_df.mean())/temp_df.std()

Unnamed: 0,TO,MI,NA
Jan,-0.538875,-0.634622,-0.7298
Feb,-1.052089,-0.973087,-0.938315
Mar,0.384911,0.380773,0.521286
Apr,1.206054,1.226935,1.146829


## Missing Values

<p><strong>None</strong> is the&nbsp;Python keyword used to represent missing values; it is a Python object, so a NumPy array that contains it has the generic <strong>object</strong> dtype. Numpy represents missing values with&nbsp;<strong>np.nan</strong> ("Not-a-Number"), which is a floating point number. Pandas supports both None and NaN, but typically NaN performs better with numerical computations.</p>
<p>Some important methods used to deal with missing values are:</p>
<ul>
<li>isnull( ) - Returns a boolean mask indicating null values</li>
<li>notnull( ) - Returns a boolean mask indicating not-null values</li>
<li>dropna( ) - Returns a copy of the dataset filtered from columns or rows containing missing values</li>
<li>fillna( ) - Returns a copy of the dataset where the null values have been replaced with a specified technique</li>
</ul>

In [55]:
# Both None and np.nan end up as NaN in the resulting float64 Series.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
s1 = pd.Series([10, None, 30, None, np.nan])
s1

0    10.0
1     NaN
2    30.0
3     NaN
4     NaN
dtype: float64

In [56]:
# Series.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1
s1.ffill() # propagate last valid value

0    10.0
1    10.0
2    30.0
3    30.0
4    30.0
dtype: float64

In [57]:
# Series.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1
s1.bfill() # use next valid value (trailing NaNs have none and stay NaN)

0    10.0
1    30.0
2    30.0
3     NaN
4     NaN
dtype: float64

In [58]:
s1.fillna(0) # fill with a default value

0    10.0
1     0.0
2    30.0
3     0.0
4     0.0
dtype: float64

In [59]:
s1.fillna(s1.mean()) # using the mean is a common practice (not always a good one)

0    10.0
1    20.0
2    30.0
3    20.0
4    20.0
dtype: float64

In [60]:
# Nested dict: outer keys are the columns, inner keys the row labels.
# Missing entries are written both as None and as np.nan
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0).
dict1 = {'Total': {'a': None, 'b': 4, 'c': 2, 'd':6},
         'Price': {'a': 2, 'b': 5, 'c': np.nan, 'd':4},
         'Volume': {'a':2, 'b':None, 'c': np.nan, 'd':3}}

df1 = pd.DataFrame(dict1)
df1

Unnamed: 0,Total,Price,Volume
a,,2.0,2.0
b,4.0,5.0,
c,2.0,,
d,6.0,4.0,3.0


In [61]:
df1.notnull()

Unnamed: 0,Total,Price,Volume
a,False,True,True
b,True,True,False
c,True,False,False
d,True,True,True


In [62]:
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1
df1.ffill(axis=0) # fill down each column (axis=0 is the default)

Unnamed: 0,Total,Price,Volume
a,,2.0,2.0
b,4.0,5.0,2.0
c,2.0,5.0,2.0
d,6.0,4.0,3.0


In [63]:
stock_2019

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


In [64]:
stock_2019.dropna(axis='rows')                

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


In [65]:
stock_2019.dropna(axis='columns')                

Unnamed: 0_level_0,Q1,Q2,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FB,166.69,193.0,205.17
AAPL,47.49,49.48,73.41
AMZN,1780.75,1893.63,1847.84
NFLX,356.56,367.32,323.57
GOOG,1173.31,1080.91,1337.02


In [66]:
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1
stock_2019.ffill(axis=1) # fill along each row (the default is axis=0)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,49.48,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


## Example 1

In [67]:
# Input table (13 samples x 4 attributes); np.nan marks a missing measurement
X = np.array([[5.1, 3.5, 1, 0.2],
           [4.3, 3. , 1, 0.1],
           [5. , np.nan, 1, 0.4],
           [5.1, 3.4, 2, 0.2],
           [7.0, 3.2, 1, 0.2],
           [6.9, 3.1, 3, 1.5],
           [6.7, 3.1, 1, np.nan],
           [6. , 2.9, 2, 1.5],
           [6.1, 3. , 2, 1.4],
           [6.5, 3. , 3, 2.2],
           [7.7, 3.8, 3, 2.2],
           [7.4, 2.8, 1, 1.9],
           [6.8, 3.2, 1, 2.3]])

# Column names (one per column of X)
features = ['height','width','intensity','weight']

# Class label of each sample (one per row of X)
labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

**a) Create a DataFrame from a Numpy array (X)**
- Setup column names and the class label
- Add a new composite feature, 'area' = 'width' * 'height'
- Fix missing values with forward fill method

In [68]:
# Build the table: one column per feature, plus the class label
df = pd.DataFrame(X, columns = features)
df['labels'] = labels
# 'area' is computed before the fill, so a row with a missing width gets a NaN
# area that is then forward-filled from the previous row (not recomputed)
df['area'] = df['width'] * df['height']
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1
df = df.ffill()

df

Unnamed: 0,height,width,intensity,weight,labels,area
0,5.1,3.5,1.0,0.2,0,17.85
1,4.3,3.0,1.0,0.1,0,12.9
2,5.0,3.0,1.0,0.4,0,12.9
3,5.1,3.4,2.0,0.2,0,17.34
4,7.0,3.2,1.0,0.2,0,22.4
5,6.9,3.1,3.0,1.5,1,21.39
6,6.7,3.1,1.0,1.5,1,20.77
7,6.0,2.9,2.0,1.5,1,17.4
8,6.1,3.0,2.0,1.4,1,18.3
9,6.5,3.0,3.0,2.2,2,19.5


**b) Compute the average area of samples with intensity greater than 1:**

In [69]:
df.loc[df['intensity'] > 1, 'area']

3     17.34
5     21.39
7     17.40
8     18.30
9     19.50
10    29.26
Name: area, dtype: float64

In [70]:
df.loc[df['intensity'] > 1, 'area'].mean()

20.531666666666666

**c) Compute the following probability:**

$$P(label=2 \ | \ height<7)$$   

In [71]:
num = (df['labels'] == 2) & (df['height'] < 7)
num

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11    False
12     True
dtype: bool

In [72]:
den = (df['height'] < 7)
P = num.sum()/den.sum()
P

0.2

**d) Compute the following probability. Use the Naive Bayes technique.**
$$P(label=l \ | \ intensity=2 \land height<6.5)$$

$$P(l|X)\cdot P(X)=P(X|l)\cdot P(l)$$


$$P(l|X)\cdot P(X)=P(X_{1}|l)\cdot P(X_{2}|l)\cdot P(l)$$


$$P(l|X) = \frac{P(X_{1}|l)\cdot P(X_{2}|l)\cdot P(l)}{P(X)}$$

In [73]:
# Naive Bayes score for each class label l, with the two conditions
#   X1: intensity == 2    and    X2: height < 6.5
n_samples = len(df)
is_x1 = df['intensity'] == 2
is_x2 = df['height'] < 6.5

# Evidence P(X): empirical frequency of X1 and X2 holding together.
# It does not depend on the label, so it is computed once.
p_evidence = (is_x1 & is_x2).sum() / n_samples

for l in range(3):
    in_class = df['labels'] == l
    class_size = in_class.sum()

    prior = class_size / n_samples                      # P(l)
    like_x1 = (is_x1 & in_class).sum() / class_size     # P(X1|l)
    like_x2 = (is_x2 & in_class).sum() / class_size     # P(X2|l)

    posterior = (like_x1 * like_x2 * prior) / p_evidence

    print(f"P({l}|X) = {posterior}")

P(0|X) = 0.2666666666666667
P(1|X) = 0.3333333333333333
P(2|X) = 0.0


In [74]:
# alternative method (non-naive)
print(df.loc[(df['intensity'] == 2) & (df['height'] < 6.5), ['height','intensity','labels']])
for l in range(3):
    num = (df['intensity'] == 2) & (df['height'] < 6.5) & (df['labels'] == l)
    den = (df['intensity'] == 2) & (df['height'] < 6.5)
    P = num.sum()/den.sum()
    print(f'P({l}|X=x) = {P}')

   height  intensity  labels
3     5.1        2.0       0
7     6.0        2.0       1
8     6.1        2.0       1
P(0|X=x) = 0.3333333333333333
P(1|X=x) = 0.6666666666666666
P(2|X=x) = 0.0


# 4 - Combining Pandas Objects

<p>Pandas provides several methods for combining Series and DataFrames, starting with:</p>
<ul>
<li>concat( )</li>
<li>append( )</li>
</ul>

In [75]:
s1 = pd.Series(['10', '20'], index = ['jan', 'feb'] )
s2 = pd.Series(['30', '40'], index = ['jan', 'feb'] )

pd.concat((s1, s2)) # index is preserved, even if duplicated (not nice for keys)

jan    10
feb    20
jan    30
feb    40
dtype: object

In [76]:
# To avoid duplicates use ignore_index

pd.concat((s1, s2), ignore_index = True)

0    10
1    20
2    30
3    40
dtype: object

In [77]:
df1 = pd.DataFrame(np.array([[1, 2], [3, 4]]), index = ['a', 'b'], columns = ['Total', 'Quantity'])
df2 = pd.DataFrame(np.array([[5, 6], [7, 8]]), index = ['c', 'd'], columns = ['Total', 'Quantity'])
print(f"{df1}\n\n{df2}")

pd.concat((df1, df2)) # Concatenate vertically by default


   Total  Quantity
a      1         2
b      3         4

   Total  Quantity
c      5         6
d      7         8


Unnamed: 0,Total,Quantity
a,1,2
b,3,4
c,5,6
d,7,8


In [78]:
df1 = pd.DataFrame(np.array([[1, 2], [3, 4]]), index = ['a', 'b'], columns = ['Total', 'Quantity'])
df2 = pd.DataFrame(np.array([[5, 6, 1], [7, 8, 2]]), index = ['c', 'd'], columns = ['Total', 'Quantity', 'Liters'])
print(f"{df1}\n\n{df2}")


pd.concat((df1, df2)) # Missing columns filled with NaN

   Total  Quantity
a      1         2
b      3         4

   Total  Quantity  Liters
c      5         6       1
d      7         8       2


Unnamed: 0,Total,Quantity,Liters
a,1,2,
b,3,4,
c,5,6,1.0
d,7,8,2.0


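The append( ) method is a shortcut for concatenating DataFrames. Note that it was deprecated in pandas 1.4 and removed in pandas 2.0: with recent versions use pd.concat( ) instead.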

In [79]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat performs the same row-wise concatenation
pd.concat((df1, df2))

Unnamed: 0,Total,Quantity,Liters
a,1,2,
b,3,4,
c,5,6,1.0
d,7,8,2.0


In [80]:
df1 = pd.DataFrame(np.array([[10, 20], [30, 40], [50, 60]]), index = ['a', 'b', 'c'], columns = ['Jan', 'Feb'])
df2 = pd.DataFrame(np.array([[94, 95, 96], [97, 98, 99]]), index = ['a', 'b'], columns = ['Mar', 'Apr', 'May'])
print(f"{df1}\n\n{df2}")


pd.concat((df1, df2), axis=1) # concatenate horizontally

   Jan  Feb
a   10   20
b   30   40
c   50   60

   Mar  Apr  May
a   94   95   96
b   97   98   99


Unnamed: 0,Jan,Feb,Mar,Apr,May
a,10,20,94.0,95.0,96.0
b,30,40,97.0,98.0,99.0
c,50,60,,,


In [81]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows and fills the columns missing on either side with NaN
pd.concat((df1, df2))

Unnamed: 0,Jan,Feb,Mar,Apr,May
a,10.0,20.0,,,
b,30.0,40.0,,,
c,50.0,60.0,,,
a,,,94.0,95.0,96.0
b,,,97.0,98.0,99.0


<p>It is possible to join DataFrames with the&nbsp;<strong>merge( )&nbsp;</strong>method, which is based on <strong>relational algebra</strong></p>
<ul>
<li>Columns with the same name in the two DF to be merged are used as keys</li>
<li>Depending on the DF it is possible to perform a join
<ul>
<li>one-to-one</li>
<li>many-to-one</li>
<li>many-to-many</li>
</ul>
</li>
<li>Indices of the input DF are discarded</li>
</ul>

In [82]:
df1 = pd.DataFrame([[0, 'blue'], [1, 'orange']], index = ['id001', 'id002'], columns = ['key', 'color'])
df2 = pd.DataFrame([[0, 14.99, 'cap'], [1, 29.99, 'shirt']], index = ['p01', 'p02'], columns = ['key', 'price', 'name'])
print(f"{df1}\n\n{df2}")

pd.merge(df1, df2) # one-to-one                   

       key   color
id001    0    blue
id002    1  orange

     key  price   name
p01    0  14.99    cap
p02    1  29.99  shirt


Unnamed: 0,key,color,price,name
0,0,blue,14.99,cap
1,1,orange,29.99,shirt


In [83]:
# df1 repeats each key value, while df2 holds each key exactly once
df1 = pd.DataFrame([[0, 'blue'], [1, 'orange'], [0, 'white'], [1, 'red']], index = ['id001', 'id002', 'id003', 'id004'], columns = ['key', 'color'])
df2 = pd.DataFrame([[0, 14.99, 'cap'], [1, 29.99, 'shirt']], index = ['p01', 'p02'], columns = ['key', 'price', 'name'])
print(f"{df1}\n\n{df2}")

pd.merge(df1, df2) # many-to-one

       key   color
id001    0    blue
id002    1  orange
id003    0   white
id004    1     red

     key  price   name
p01    0  14.99    cap
p02    1  29.99  shirt


Unnamed: 0,key,color,price,name
0,0,blue,14.99,cap
1,0,white,14.99,cap
2,1,orange,29.99,shirt
3,1,red,29.99,shirt


In [84]:
# Two tables sharing the key column 'X1'; their key sets only partly overlap
df1 = pd.DataFrame([['a', 11.432], ['b', 1.303], ['c', 99.906]], columns = ['X1', 'X2'])
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0
df2 = pd.DataFrame([['a', 20.784], ['b', np.nan], ['d', 20.784]], columns = ['X1', 'X3'])
print(f"{df1}\n\n{df2}")

  X1      X2
0  a  11.432
1  b   1.303
2  c  99.906

  X1      X3
0  a  20.784
1  b     NaN
2  d  20.784


In [85]:
pd.merge(df1, df2, how='left', on='X1')

Unnamed: 0,X1,X2,X3
0,a,11.432,20.784
1,b,1.303,
2,c,99.906,


In [86]:
pd.merge(df1, df2, how='right', on='X1')

Unnamed: 0,X1,X2,X3
0,a,11.432,20.784
1,b,1.303,
2,d,,20.784


In [87]:
pd.merge(df1, df2, how='inner', on='X1')

Unnamed: 0,X1,X2,X3
0,a,11.432,20.784
1,b,1.303,


In [88]:
pd.merge(df1, df2, how='outer', on='X1')

Unnamed: 0,X1,X2,X3
0,a,11.432,20.784
1,b,1.303,
2,c,99.906,
3,d,,20.784


# 5 - Grouping Data

<p>Pandas provides the equivalent of the SQL&nbsp;<strong>GROUP BY&nbsp;</strong>statement, allowing</p>
<ul>
<li><strong>Iterations</strong> on groups</li>
<li><strong>Aggregation&nbsp;</strong>operators on groups (mean, min, max, ...)</li>
<li><strong>Filtering</strong> groups according to a condition</li>
</ul>
<p>The column passed as argument to the <strong>groupby( )&nbsp;</strong>method specifies the&nbsp;<strong>key&nbsp;</strong>on which the aggregation is performed. The function returns a <strong>DataFrameGroupBy&nbsp;</strong>object.&nbsp;</p>

In [89]:
# Small table with a grouping key 'k' and two numeric columns, built row by row
rows = [('a', 2, 4), ('b', 10, 20), ('a', 3, 5), ('b', 15, 30)]
df = pd.DataFrame(rows, columns=['k', 'c1', 'c2'])
df

Unnamed: 0,k,c1,c2
0,a,2,4
1,b,10,20
2,a,3,5
3,b,15,30


In [90]:
grouped_df = df.groupby('k') # 2 groups 'a' and 'b'

grouped_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa8655568e0>

<p><img src="./img/p4.png" alt="" width="400" height="100" /></p>

### Iterating on groups

Each group is a subset of the original DataFrame

In [91]:
for key, group in grouped_df:
    print(f"\nkey : {key}")
    print(f"\ngroup :\n{group}")


key : a

group :
   k  c1  c2
0  a   2   4
2  a   3   5

key : b

group :
   k  c1  c2
1  b  10  20
3  b  15  30


<p><img src="./img/p5.png" alt="" width="200" height="100" /></p>

### Aggregating by group

The output is a DataFrame with the result of the aggregation (min, max, mean, std, ...) for each group

In [92]:
grouped_df.mean()

Unnamed: 0_level_0,c1,c2
k,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.5,4.5
b,12.5,25.0


It is also possible to aggregate a single column by group

In [93]:
grouped_df['c1'].mean()

k
a     2.5
b    12.5
Name: c1, dtype: float64

### Filtering

In [94]:
# keeps groups for which c1 has a mean > 5

grouped_df.filter(lambda x: x['c1'].mean() > 5)

Unnamed: 0,k,c1,c2
1,b,10,20
3,b,15,30


<p><img src="./img/p6.png" alt="" width="300" height="100" /></p>

# 6 - Pivoting

Pivoting allows inspecting relationship within a dataset

In [95]:
df = pd.DataFrame({'type': ['a', 'b', 'b', 'a', 'b', 'a', 'b', 'a'],
                   'class': ['3', '2', '3', '3', '2', '1', '1', '2'],
                   'fail': [1, 1, 1, 0, 1, 0, 0, 0]})

df

Unnamed: 0,type,class,fail
0,a,3,1
1,b,2,1
2,b,3,1
3,a,3,0
4,b,2,1
5,a,1,0
6,b,1,0
7,a,2,0


Suppose that the dataset records the failures of sensors of a given type and class during some tests

In [96]:
df.pivot_table(values='fail', index='type', columns='class', aggfunc='sum')

class,1,2,3
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0,1
b,0,2,1


<p><img src="./img/p7.png" alt="" width="200" height="100" /></p>

The table shows that 2 sensors of type "b" and class "2" had some failure

In [97]:
df.pivot_table(values='fail', index='type', columns='class', aggfunc='mean')

class,1,2,3
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.0,0.0,0.5
b,0.0,1.0,1.0


<p><img src="./img/p8.png" alt="" width="200" height="100" /></p>
The table shows that 50% of the sensors of type 'a' and class '3' had a failure

# 7 - Multi-indexing 

<p><strong>Multi-index&nbsp;</strong>allows specifying an index hierarchy for Series and DataFrames</p>

## Multi-indexed Series

In [98]:
idx = [['Rome', 'Turin', 'Rome', 'Turin'],
       ['2018', '2018', '2019', '2019']]

s1 = pd.Series([10, 7, 13, 9], index=idx)
s1

Rome   2018    10
Turin  2018     7
Rome   2019    13
Turin  2019     9
dtype: int64

In [99]:
s1 = s1.sort_index() # multi-index series must be sorted to use slicing
s1

Rome   2018    10
       2019    13
Turin  2018     7
       2019     9
dtype: int64

<p><img src="./img/p9.png" alt="" width="300" height="100" /></p>

In [100]:
s1.index.names=['city', 'year']
s1

city   year
Rome   2018    10
       2019    13
Turin  2018     7
       2019     9
dtype: int64

<p><strong>Slicing&nbsp;</strong>and&nbsp;<strong>simple indexing</strong> are allowed on multi-index Series (following Numpy rules).</p>

In [101]:
s1.loc['Rome'] # Access outer level

year
2018    10
2019    13
dtype: int64

In [102]:
s1.loc[:, '2018'] # Access all cities, year 2018

city
Rome     10
Turin     7
dtype: int64

In [103]:
s1.loc['Turin', :] # Access all years, city Turin

city   year
Turin  2018    7
       2019    9
dtype: int64

In [104]:
(s1 > 8) & (s1 < 13) #mask

city   year
Rome   2018     True
       2019    False
Turin  2018    False
       2019     True
dtype: bool

In [105]:
s1[(s1 > 8) & (s1 < 13)] # masking

city   year
Rome   2018    10
Turin  2019     9
dtype: int64

## Multi-indexed DataFrame

In [106]:
idx = [['Rome', 'Rome', 'Turin', 'Turin'],
       ['2018', '2019', '2018', '2019']]

cols = [['c1', 'c1', 'c2', 'c2'], ['a', 'b', 'a', 'b']]

data = np.arange(16).reshape(4,4)

df = pd.DataFrame(data, index=idx, columns=cols)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
Rome,2018,0,1,2,3
Rome,2019,4,5,6,7
Turin,2018,8,9,10,11
Turin,2019,12,13,14,15


In [107]:
df['c1'] # accessing outer column

Unnamed: 0,Unnamed: 1,a,b
Rome,2018,0,1
Rome,2019,4,5
Turin,2018,8,9
Turin,2019,12,13


In [108]:
df.loc['Rome'] # accessing outer index

Unnamed: 0_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,a,b,a,b
2018,0,1,2,3
2019,4,5,6,7


In [109]:
df['c2', 'a'] # accessing inner column level

Rome   2018     2
       2019     6
Turin  2018    10
       2019    14
Name: (c2, a), dtype: int64

In [110]:
df.loc['Turin', '2019'] # accessing inner row level

c1  a    12
    b    13
c2  a    14
    b    15
Name: (Turin, 2019), dtype: int64

In [111]:
df.loc['Rome', 'c1'] # accessing outer row and column

Unnamed: 0,a,b
2018,0,1
2019,4,5


In [112]:
ix = pd.IndexSlice # object for more complex slicing operations 

df.loc[ix['Rome', '2018'], ix['c1':'c2', 'a']]

c1  a    0
c2  a    2
Name: (Rome, 2018), dtype: int64

<p><img src="./img/p10.png" alt="" width="200" height="100" /></p>


In [113]:
df.index.names= ['city', 'year'] # assign a name to index

In [114]:
df_reset = df.reset_index() # transform index to DataFrame columns
df_reset

Unnamed: 0_level_0,city,year,c1,c1,c2,c2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,a,b,a,b
0,Rome,2018,0,1,2,3
1,Rome,2019,4,5,6,7
2,Turin,2018,8,9,10,11
3,Turin,2019,12,13,14,15


In [115]:
df_reset.set_index(['city', 'year']) # inverse function

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
city,year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Rome,2018,0,1,2,3
Rome,2019,4,5,6,7
Turin,2018,8,9,10,11
Turin,2019,12,13,14,15


<p><img src="./img/p11.png" alt="" width="350" height="100" /></p>


### Stack and unstack

In [116]:
idx = [['Rome', 'Rome', 'Turin', 'Turin'],
       ['2018', '2019', '2018', '2019']]

s1 = pd.Series([10, 7, 13, 9], index=idx)
s1

Rome   2018    10
       2019     7
Turin  2018    13
       2019     9
dtype: int64

In [117]:
df1 = s1.unstack() # transforms a multi-indexed Series into a DataFrame
df1

Unnamed: 0,2018,2019
Rome,10,7
Turin,13,9


<p><img src="./img/p12.png" alt="" width="350" height="100" /></p>



In [118]:
df1.stack() # transforms a DataFrame into a multi-indexed Series

Rome   2018    10
       2019     7
Turin  2018    13
       2019     9
dtype: int64

### Aggregation on multi-indexed DataFrame

<p>To perform aggregate operations on a multi-indexed DataFrame it is necessary to specify the&nbsp;<strong>row granularity</strong> at which the result is computed, using the&nbsp;<strong>level</strong> parameter</p>

In [119]:
idx = [['Rome', 'Rome', 'Turin', 'Turin'],
       ['2018', '2019', '2018', '2019']]
cols = [['c1', 'c1', 'c2', 'c2'], ['a', 'b', 'a', 'b']]
data = np.arange(16).reshape(4,4)
df = pd.DataFrame(data, index=idx, columns=cols)
df.index.names= ['city', 'year']

df

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
city,year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Rome,2018,0,1,2,3
Rome,2019,4,5,6,7
Turin,2018,8,9,10,11
Turin,2019,12,13,14,15


In [120]:
# The `level` argument of DataFrame.max was removed in pandas 2.0;
# grouping on the index level gives the same result
df.groupby(level='city', sort=False).max() # maximum over the years of each city

Unnamed: 0_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,a,b,a,b
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Rome,4,5,6,7
Turin,12,13,14,15


<p><img src="./img/p14.png" alt="" width="350" height="100" /></p>


In [121]:
# The `level` argument of DataFrame.max was removed in pandas 2.0;
# grouping on the index level gives the same result
df.groupby(level='year', sort=False).max() # maximum over the cities for each year

Unnamed: 0_level_0,c1,c1,c2,c2
Unnamed: 0_level_1,a,b,a,b
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2018,8,9,10,11
2019,12,13,14,15


<p><img src="./img/p15.png" alt="" width="350" height="100" /></p>

It is also possible to perform aggregate functions on columns

In [122]:
# Aggregating over a column level: transpose, group on that level, transpose back.
# (The `level` argument of DataFrame.max was removed in pandas 2.0.)
df.T.groupby(level=0, sort=False).max().T # maximum within each outer column (c1, c2)

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Rome,2018,1,3
Rome,2019,5,7
Turin,2018,9,11
Turin,2019,13,15


<p><img src="./img/p16.png" alt="" width="350" height="100" /></p>

In [123]:
# Aggregating over a column level: transpose, group on that level, transpose back.
# (The `level` argument of DataFrame.max was removed in pandas 2.0.)
df.T.groupby(level=1, sort=False).max().T # maximum within each inner column (a, b)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Rome,2018,2,3
Rome,2019,6,7
Turin,2018,10,11
Turin,2019,14,15


In [124]:
# Aggregating over a column level: transpose, group on that level, transpose back.
# (The `level` argument of DataFrame.mean was removed in pandas 2.0.)
df.T.groupby(level=0, sort=False).mean().T # mean within each outer column (c1, c2)

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Rome,2018,0.5,2.5
Rome,2019,4.5,6.5
Turin,2018,8.5,10.5
Turin,2019,12.5,14.5


In [125]:
# Aggregating over a column level: transpose, group on that level, transpose back.
# (The `level` argument of DataFrame.mean was removed in pandas 2.0.)
df.T.groupby(level=1, sort=False).mean().T # mean within each inner column (a, b)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Rome,2018,1,2
Rome,2019,5,6
Turin,2018,9,10
Turin,2019,13,14
