# Pandas

## 1 - Series

In [1]:
import numpy as np
import pandas as pd

# With no explicit index, pandas labels the values with default integers 0, 1, 2, ...
pd.Series([0.3, 0.5, 0.8])

0    0.3
1    0.5
2    0.8
dtype: float64

In [2]:
# Same values, now labelled with a custom (string) index
pd.Series([0.3, 0.5, 0.8], index=['3-July', '4-July', '5-July'] )

3-July    0.3
4-July    0.5
5-July    0.8
dtype: float64

<p><img src="./img/p1.png" alt="" width="300" height="100" /></p>

In [3]:
# Create a Series from a dictionary: keys become the index, values become the data

city_dict = {'Latitude': 40.47, 'Longitude': 17.24, 'Elevation': 28.2}
pd.Series(city_dict)

Latitude     40.47
Longitude    17.24
Elevation    28.20
dtype: float64

In [4]:
# Create a Series from a list of values and an explicit index

s1 = pd.Series([2.0, 3.1, 4.5, 8.7, 1.8], index=['mon', 'tue', 'wed', 'thu', 'fri'])
print(s1.values)  # the data, without the labels
print(s1.index)  # the labels, as a pandas Index

[2.  3.1 4.5 8.7 1.8]
Index(['mon', 'tue', 'wed', 'thu', 'fri'], dtype='object')


In [5]:
s1 # display the whole Series: index labels alongside their values

mon    2.0
tue    3.1
wed    4.5
thu    8.7
fri    1.8
dtype: float64

### Indexing

In [6]:
s1.loc['tue'] # explicit indexing: select by index label

3.1

In [7]:
s1.loc['tue'] = 5.8 # in-place assignment: modifies s1 itself, so later cells see 5.8

s1['tue'] # plain [] with a label is an alternative form of explicit indexing

5.8

In [8]:
s1.iloc[2] # implicit indexing: select by integer position (0-based)

4.5

### Slicing

In [9]:
s1.loc['mon':'wed'] # explicit slicing by label (stop element included)

mon    2.0
tue    5.8
wed    4.5
dtype: float64

In [10]:
s1.iloc[1:4] # implicit slicing by position (stop element excluded)

tue    5.8
wed    4.5
thu    8.7
dtype: float64

### Masking

In [11]:
# Element-wise comparisons each yield a boolean Series aligned with s1
greater_than_two = s1 > 2
less_than_nine = s1 < 9

# Combine them element-wise: True only where 2 < value < 9
mask = greater_than_two & less_than_nine
mask

mon    False
tue     True
wed     True
thu     True
fri    False
dtype: bool

In [12]:
s1[mask] # keep only the entries where the mask is True

tue    5.8
wed    4.5
thu    8.7
dtype: float64

In [13]:
# Mask built from the index labels rather than from the values:
# True for the entries labelled 'mon' or 'fri'
mask = s1.index.isin(['mon', 'fri'])
s1[mask]

mon    2.0
fri    1.8
dtype: float64

### Fancy indexing

In [14]:
s1.loc[['fri', 'mon']] # a list of labels selects those entries, in the order given

fri    1.8
mon    2.0
dtype: float64

In [15]:
# Fancy indexing also works for assignment: mark both entries as missing.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
s1.loc[['wed', 'fri']] = np.nan
s1

mon    2.0
tue    5.8
wed    NaN
thu    8.7
fri    NaN
dtype: float64

## 2 - DataFrame

### Creation from a 2D Numpy array

<p><img src="./img/p2.png" alt="" width="280" height="100" /></p>

In [16]:
# Raw values as a 3x3 array: one row per beverage, one column per attribute
arr = np.array([
    [1.0, 1.4, 5.0],
    [5, 10, 8],
    [1.5, 0.3, 1.0],
])
idx = ['Water', 'Beer', 'Wine']  # row labels
col = ['Price', 'Quantity', 'Liters']  # column labels

# Attach the labels to the array to obtain a DataFrame
df = pd.DataFrame(data=arr, columns=col, index=idx)
df

Unnamed: 0,Price,Quantity,Liters
Water,1.0,1.4,5.0
Beer,5.0,10.0,8.0
Wine,1.5,0.3,1.0


### Creation from Pandas series

In [17]:
# The three Series share the same product labels, so they align row by row
items = ['cap', 'shirt', 'hoodie']
price = pd.Series([8.99, 29.99, 49.99], index=items)
quantity = pd.Series([2, 2, 1], index=items)
stock = pd.Series([12, 120, 54], index=items)

# Each dictionary key becomes a column name; the shared index becomes the row labels
rec_df = pd.DataFrame({
    'Price': price,
    'Quantity': quantity,
    'Stock': stock,
})
rec_df

Unnamed: 0,Price,Quantity,Stock
cap,8.99,2,12
shirt,29.99,2,120
hoodie,49.99,1,54


### Creation from dictionaries

In [18]:
# One record (dictionary) per future row: i, its square and its cube, for i = 0..5
dic_list = [
    {'c1': i, 'c2': i ** 2, 'c3': i ** 3}
    for i in range(6)
]
dic_list

[{'c1': 0, 'c2': 0, 'c3': 0},
 {'c1': 1, 'c2': 1, 'c3': 1},
 {'c1': 2, 'c2': 4, 'c3': 8},
 {'c1': 3, 'c2': 9, 'c3': 27},
 {'c1': 4, 'c2': 16, 'c3': 64},
 {'c1': 5, 'c2': 25, 'c3': 125}]

In [19]:
# A list of dictionaries maps to one row per dictionary; the keys become column names
# and, since no index is given, the rows get the default integer index
el_df = pd.DataFrame(dic_list)
el_df

Unnamed: 0,c1,c2,c3
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64
5,5,25,125


### Import from File

<p>A DataFrame can be created by reading a <strong>csv</strong> or <strong>json</strong> file. For csv files it is often important to specify the separator character, the number of rows to skip at the beginning of the file, and every string that should be treated as a null value.</p>

In [47]:
# Load the JSON file into a DataFrame.
# NOTE(review): orient='records' describes a list of row objects, yet the recorded
# output shows string row labels (MI, TO) -- confirm the orient matches the file layout.
city_meteo = pd.read_json('./data/p1.json', orient ='records')

city_meteo

Unnamed: 0,Temperature,Humidity,Wind
MI,15.3,52.0,3.5
TO,16.5,,2.5
,22.0,54.0,


In [48]:
# Read the quarterly stock prices from a CSV file
stock_2019 = pd.read_csv(
    './data/stock.csv',
    sep=',',                     # field separator
    skiprows=1,                  # ignore the first line of the file
    index_col=0,                 # use the first column as row labels
    na_values=['none', 'null'],  # extra strings to be parsed as missing values
)
stock_2019

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


<p>In the same way, a Pandas DataFrame can be written to a file. Several formats are supported (csv, json, HTML, HDF5, ...); note that SAS files can be read by pandas but not written.</p>

In [49]:
# Write the DataFrame to a CSV file; index = True also writes the row labels
city_meteo.to_csv('./data/meteo.csv', sep=',', index = True)

In [50]:
# Write the DataFrame to a JSON file
stock_2019.to_json('./data/stock.json')

## 3 - Accessing a DataFrame

In [51]:
stock_2019 # show the full table again as a reference for the access examples

Unnamed: 0_level_0,Q1,Q2,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FB,166.69,193.0,178.08,205.17
AAPL,47.49,49.48,,73.41
AMZN,1780.75,1893.63,1735.91,1847.84
NFLX,356.56,367.32,267.62,323.57
GOOG,1173.31,1080.91,1219.0,1337.02


### Accessing a column

In [54]:
stock_2019['Q2'] # [] with a column name returns that column as a Series

Code
FB       193.00
AAPL      49.48
AMZN    1893.63
NFLX     367.32
GOOG    1080.91
Name: Q2, dtype: float64

### Accessing a Row

In [62]:
stock_2019.loc['GOOG'] # explicit indexing: select a row by its label (returns a Series)

Q1    1173.31
Q2    1080.91
Q3    1219.00
Q4    1337.02
Name: GOOG, dtype: float64

In [61]:
stock_2019.iloc[2] # implicit indexing: select a row by integer position (returns a Series)

Q1    1780.75
Q2    1893.63
Q3    1735.91
Q4    1847.84
Name: AMZN, dtype: float64

### Slicing

In [80]:
stock_2019.loc['AAPL':'NFLX','Q2':'Q3'] # explicit slicing on rows and columns (both endpoints included)

Unnamed: 0_level_0,Q2,Q3
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,49.48,
AMZN,1893.63,1735.91
NFLX,367.32,267.62


In [83]:
stock_2019.iloc[0:2, :2] # implicit slicing on rows and columns (stop positions excluded)

Unnamed: 0_level_0,Q1,Q2
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,193.0
AAPL,47.49,49.48


### Masking

In [86]:
# Row-wise conditions on two different columns, each a boolean Series
q2_below_1000 = stock_2019['Q2'] < 1000
q4_above_100 = stock_2019['Q4'] > 100

# A row is selected only when both conditions hold
mask = q2_below_1000 & q4_above_100
mask

Code
FB       True
AAPL    False
AMZN    False
NFLX     True
GOOG    False
dtype: bool

In [91]:
stock_2019.loc[mask, 'Q3':] # masking + slicing: rows where mask is True, columns from 'Q3' to the last one

Unnamed: 0_level_0,Q3,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,178.08,205.17
NFLX,267.62,323.57


### Fancy Indexing

In [92]:
stock_2019.loc[['FB', 'GOOG'], ['Q1', 'Q4']] # lists of labels select exactly those rows and columns

Unnamed: 0_level_0,Q1,Q4
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,205.17
GOOG,1173.31,1337.02


In [93]:
# Same condition as in the Masking section, recomputed here so the cell does not
# depend on whatever value `mask` currently holds
mask = (stock_2019['Q2'] < 1000) & (stock_2019['Q4'] > 100)

stock_2019.loc[mask, ['Q1', 'Q3']] # masking + fancy: rows where mask is True, only the listed columns

Unnamed: 0_level_0,Q1,Q3
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,166.69,178.08
NFLX,356.56,267.62
