In [440]:
# Quote the path OR backslash-escape its spaces, never both: inside double
# quotes the shell keeps the backslashes literally and the file is not found.
!git add "Introduction to Pandas.ipynb"

In [442]:
!git commit -m "added Pandas introduction"

[master 6bd3f08] added Pandas introduction
 1 file changed, 1937 insertions(+)
 create mode 100644 Introduction to Pandas.ipynb


In [443]:
!git push origin master

Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 4.81 KiB | 4.81 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.
To https://github.com/Luc-Bertin/TDs_ESILV.git
   77ab45f..6bd3f08  master -> master


# Discover Pandas 

Pandas is built on top of Numpy

>`pandas.DataFrame` is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table.
It has row and column labels, and it can gracefully hold and handle missing data.

Cleaning and structuring data that is initially messy, unstructured and incomplete takes up most of a data scientist's time.

https://pandas.pydata.org/docs/getting_started/overview.html

In [1]:
import pandas as pd

In [2]:
pd.__version__

'0.25.3'

In [3]:
pd?

In [176]:
%%timeit
3+2

14.7 ns ± 0.386 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


## Series

one-dimensional array of indexed data. 

In [612]:
pd.Series([3,2,1])

0    3
1    2
2    1
dtype: int64

with explicit index definition !

Example:

In [276]:
serie_1 = pd.Series([3,2,1], index=[93,129, 219394])

In [277]:
serie_1

93        3
129       2
219394    1
dtype: int64

In [278]:
serie_1.index

Int64Index([93, 129, 219394], dtype='int64')

a dictionary-like object in which keys may be repeated

In [304]:
serie = pd.Series([3,2,1], index=["rené", "rené", "jean"])

In [305]:
serie

rené    3
rené    2
jean    1
dtype: int64

In [306]:
serie.values

array([3, 2, 1])

In [307]:
serie.index

Index(['rené', 'rené', 'jean'], dtype='object')

* Access by key

In [308]:
serie['rené']

rené    3
rené    2
dtype: int64

* Set a new key pair

In [309]:
serie['joseph'] = 5

* Change a value for a key

In [310]:
serie['rené'] = 4

In [311]:
serie

rené      4
rené      4
jean      1
joseph    5
dtype: int64

In [312]:
serie['rené'] = [4,3]

In [313]:
serie

rené      4
rené      3
jean      1
joseph    5
dtype: int64

* delete a key val pair

In [314]:
del serie["rené"]

In [315]:
serie

jean      1
joseph    5
dtype: int64

In [321]:
serie[0:4:2] # positional slicing with a step: not possible with a plain dict

jean    1
dtype: int64

* lookup 

In [324]:
print('rené' in serie)
print("jean" in serie)

False
True


- When the index is unique, pandas uses a hash table, just like a `dict`: O(1).
- When the index is non-unique but sorted, pandas uses binary search: O(log N).
- When the index is non-unique and unsorted, pandas has to check every key, just like a list look-up: O(N).



using a `dict` in the `pd.Series` constructor automatically assigns the index as the ordered keys in the `dict` (for Python 3.6 and later though, the index is in the same order as the insertion order).


In [613]:
# Two ways to build a Series from key/value pairs.
# 1) zip the labels with the values and turn the pairs into a dict
test = pd.Series(
    data=dict(zip(("ea", "fzf", "aeif"), (2, 3, 2)))
)
# 2) pass a dict literal; `index=` keeps only the listed keys
test2 = pd.Series(
    data={"ea": 2, "fzf": 3, "aeif": 2},
    index=["ea"],
)

In [614]:
test

aeif    2
ea      2
fzf     3
dtype: int64

In [615]:
test2

ea    2
dtype: int64

**If multiple different types reside in a Series, all of the data will get upcasted to a dtype that accommodates all of the data involved.**



**dtype=object** means that the best common type inferred for the contents of the `pd.Series` is the generic Python object.

This also means performance decreases: any operation on the data will be done at the Python level.

## selection in Series

In [332]:
(test>2)

aeif    False
ea      False
fzf      True
dtype: bool

In [333]:
(test<4)

aeif    True
ea      True
fzf     True
dtype: bool

In [340]:
# use "&", not "and": the & operator is a bitwise AND, applied here element by element
(test>2) & (test < 4) 

aeif    False
ea      False
fzf      True
dtype: bool

In [344]:
type((test>2) & (test < 4) )

pandas.core.series.Series

In [345]:
# mask: the boolean pd.Series produced by the combined comparison, stored in the variable `mask`
mask = (test>2) & (test < 4)

In [346]:
test[mask]

fzf    3
dtype: int64

In [347]:
# fancy indexing
test[["ea", "fzf"]]

ea     2
fzf    3
dtype: int64

In [348]:
# explicit index slicing
test["aeif": "fzf"]

aeif    2
ea      2
fzf     3
dtype: int64

In [349]:
# implicit index slicing
test[0: 2]

aeif    2
ea      2
dtype: int64

Slicing with the explicit index ***includes*** the final label in the slice, hence the results above.

Slicing with the implicit (positional) index ***excludes*** the final position.

What if I defined explicit integer indexes and want to slice? 🙄

## using loc

In [350]:
serie2 = pd.Series({1:2, 5:3, 7:2})

In [351]:
serie2

1    2
5    3
7    2
dtype: int64

In [352]:
serie2.loc[1] # explicit index

2

In [353]:
serie2.iloc[1] # implicit index

3

In [354]:
serie2.iloc[1:2] # implicit index for slicing

5    3
dtype: int64

In [355]:
serie2.loc[1:5] # explicit index for slicing

1    2
5    3
dtype: int64

In [357]:
serie2.loc[[1,5]] # fancy indexing

1    2
5    3
dtype: int64

### Index object 

* are immutable

In [386]:
df2.index[0]=18

TypeError: Index does not support mutable operations

* can be sliced or indexed (just like an array)

In [387]:
df2.index[0]

0

In [393]:
df.index[:2]

Index(['Corentin', 'Luc'], dtype='object')

In [395]:
# Set intersection of the index labels with another collection of labels.
# Use the explicit method: the `&` operator on an Index is deprecated as a set operation.
df.index.intersection(['Corentin', 'Yolo'])

Index(['Corentin'], dtype='object')

In [397]:
# Labels present in exactly one of the two collections.
# Use the explicit method: the `^` operator on an Index is deprecated as a set operation.
df.index.symmetric_difference(['Corentin', 'Yolo'])

Index(['Luc', 'René', 'Yolo'], dtype='object')

# DataFrame

* sequence of "aligned" Series objects (sharing same indexes / like an Excel file )

* each Series object is a column

* Hence `pd.DataFrame` can be seen as a dictionary of Series objects

* Flexible rows and columns' labels

In [620]:
# Two Series sharing the same labels (in a different order): one numeric, one textual.
serie1 = pd.Series(
    {
        "Luc": 25,
        "Corentin": 29,
        "René": 40,
    }
)
serie2 = pd.Series(
    {
        "René": "100%",
        "Corentin": "25%",
        "Luc": "20%",
    }
)

In [621]:
df = pd.DataFrame({"note": serie1, 
                   "charge_de_travail": serie2})

In [622]:
df

Unnamed: 0,charge_de_travail,note
Corentin,25%,29
Luc,20%,25
René,100%,40


In [623]:
df.index

Index(['Corentin', 'Luc', 'René'], dtype='object')

In [624]:
df.columns

Index(['charge_de_travail', 'note'], dtype='object')

> If you pass an index and / or columns, you are guaranteeing the index and / or columns of the resulting DataFrame. Thus, a dict of Series plus a specific index will discard all data not matching up to the passed index.

In [625]:
df2 = pd.DataFrame({"note": serie1, "charge_de_travail": serie2}, index=["Corentin", "Luc"], columns=["note", "autre"])

In [630]:
df2 # cells are filled with NaN ("Not a Number") when no value exists for the given (row_index, column_index)

Unnamed: 0,note,autre
Corentin,29,
Luc,25,


In [627]:
df.shape

(3, 2)

shape: tuple of the number of elements with respect to each dimension

For a 1D array, the shape would be (n,) where n is the number of elements in your array.

For a 2D array, the shape would be (n,m) where n is the number of rows and m is the number of columns in your array

accessing columns by key : 

In [629]:
df['note']

Corentin    29
Luc         25
René        40
Name: note, dtype: int64


Using the attribute notation is not advised for assignments, as some methods or attributes of the same name may already exist in the DataFrame class' own namespace

In [380]:
df.note

Corentin    29
Luc         25
René        40
Name: note, dtype: int64

The `DataFrame` can also be constructed from a list of dictionaries:
each dict element is a row,
each key of each dict refers to a column

In [381]:
df2 = pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

In [382]:
df2

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


Indexing works the same way as for Series, but you have to account this time for the second dimension

`df.loc_or_iloc[ dim1 = rows, dim2 = columns]`


In [411]:
df.iloc[:3, :1]

Unnamed: 0,charge_de_travail
Corentin,25%
Luc,20%
René,100%


columns slicing/indexing is optional here, without specifying it, you select only rows 

In [412]:
df.iloc[:3]

Unnamed: 0,charge_de_travail,note
Corentin,25%,29
Luc,20%,25
René,100%,40


In [419]:
df.loc[["Corentin", "Luc"], :] # mixing slicing and fancy indexing

Unnamed: 0,charge_de_travail,note
Corentin,25%,29
Luc,20%,25


In [421]:
df.loc[["Corentin", "Luc"]] # without the "col argument"

Unnamed: 0,charge_de_travail,note
Corentin,25%,29
Luc,20%,25


Something to mention here, by default:
- indexing directly `df`, performs the indexing on its columns (1)
- slicing by conditions, or using a slice notation (::), is performed on rows (2)

(1)

In [429]:
df[["charge_de_travail"]] # indexing, defaults to columns

Unnamed: 0,charge_de_travail
Corentin,25%
Luc,20%
René,100%


(2) 

In [434]:
mask = df["charge_de_travail"]=="25%" 
mask

Corentin     True
Luc         False
René        False
Name: charge_de_travail, dtype: bool

In [435]:
df[mask] # masking, on lines

Unnamed: 0,charge_de_travail,note
Corentin,25%,29


In [438]:
df[:3] # slicing, on rows

Unnamed: 0,charge_de_travail,note
Corentin,25%,29
Luc,20%,25
René,100%,40


## Operations on Pandas

Element-wise operations are made easy in `pandas`


* 3 - 2 <=> subtract(3,2) <=> binary operation (2 inputs)
* -2 <=> neg(2) <=> unary operation (one input)
* sin(2) <=> unary operation (one input)

in Pandas : 
  - unary operations on elements of a df preserve the indexes
  - binary operations on 2 elements of 2 df align on the indexes

In [837]:
import numpy as np 

In [838]:
rng = np.random.RandomState(42)

In [839]:
data = rng.randint(0,10, (3,4))

In [840]:
df = pd.DataFrame(data)

In [841]:
df

Unnamed: 0,0,1,2,3
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [842]:
df2 = pd.DataFrame(rng.randint(0,10, (4,4)))

In [843]:
df2

Unnamed: 0,0,1,2,3
0,7,2,5,4
1,1,7,5,1
2,4,0,9,5
3,8,0,9,2


In [844]:
df2 = df2.reindex([1,0,2,3])
df2

Unnamed: 0,0,1,2,3
1,1,7,5,1
0,7,2,5,4
2,4,0,9,5
3,8,0,9,2


In [845]:
df + df2

Unnamed: 0,0,1,2,3
0,13.0,5.0,12.0,8.0
1,7.0,16.0,7.0,7.0
2,11.0,4.0,12.0,12.0
3,,,,


on line of index 0, `7+6 = 13`
 which shows indexes had been aligned during the binary operation

Also notice that the result's index is the union of both indices. If a label does not exist in one of the dataframes, the result can't be evaluated there and `NaN` fills the corresponding entries.

In [846]:
# Use the public flex method instead of the dunder: `add` accepts `fill_value`.
# A cell missing from only one of the two frames is replaced by 25 before adding
# (row 3 exists only in df2, e.g. 25 + 8 = 33).
df.add(df2, fill_value=25)

Unnamed: 0,0,1,2,3
0,13.0,5.0,12.0,8.0
1,7.0,16.0,7.0,7.0
2,11.0,4.0,12.0,12.0
3,33.0,25.0,34.0,27.0


#### Operation between pandas series and a pandas dataframe

From the Numpy Docs

> Broadcasting is how numpy treats arrays with different shapes during arithmetic operations. 
Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes. 
Broadcasting provides a means of vectorizing array operations so that looping occurs in C instead of Python

The only requirement for broadcasting is a way of aligning array dimensions such that either:
* aligned dimensions are equal (so that operations are done on an element-by-element basis from 2 arrays of same shape)
* one of the aligned dimensions is 1 (in other words, dimensions with size 1 are stretched or “copied” to match the dimension of the other array)

Operations between pandas.Series and pandas.DataFrame respect the numpy broadcasting rules:
>  If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.

In [847]:
df.shape, df.iloc[1].shape, df.iloc[1][np.newaxis, :].shape

((3, 4), (4,), (1, 4))

In [848]:
serie = df.iloc[1]

In [849]:
df

Unnamed: 0,0,1,2,3
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [850]:
df - df.iloc[1]

Unnamed: 0,0,1,2,3
0,0,-6,5,-2
1,0,0,0,0
2,1,-5,1,1


In [851]:
df - df.iloc[1].sample(4) # kept the index alignements during computation

Unnamed: 0,0,1,2,3
0,0,-6,5,-2
1,0,0,0,0
2,1,-5,1,1


if you want to do it columnwise and not row wise

In [852]:
# Use the public flex method instead of the dunder: `sub` accepts `axis`.
# Caution: with axis=0 the Series' labels (taken from df's column labels) are
# aligned with df's row index, and the Series is subtracted from every column.
df.sub(df.iloc[1], axis=0)

Unnamed: 0,0,1,2,3
0,0.0,-3.0,1.0,-2.0
1,-3.0,0.0,-7.0,-3.0
2,5.0,2.0,1.0,5.0
3,,,,


In [853]:
df.columns = ["a","b",0,"d"]

In [854]:
df 

Unnamed: 0,a,b,0,d
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [855]:
df.iloc[1]

a    6
b    9
0    2
d    6
Name: 1, dtype: int64

In [856]:
# Use the public flex method instead of the dunder: `sub` accepts `axis`.
# With axis=0 the Series' labels (df's column labels) are aligned with df's row
# index; only the label 0 exists on both sides, every other row becomes NaN.
df.sub(df.iloc[1], axis=0)

Unnamed: 0,a,b,0,d
0,4.0,1.0,5.0,2.0
1,,,,
2,,,,
a,,,,
b,,,,
d,,,,


In [857]:
df[0]

0    7
1    2
2    3
Name: 0, dtype: int64

In [858]:
df

Unnamed: 0,a,b,0,d
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [859]:
df[0].shape, df.shape

((3,), (3, 4))

In [860]:
df

Unnamed: 0,a,b,0,d
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [861]:
df2 = df - pd.DataFrame([(1,2), (4,5), (9,19)], columns=["a","b"])
df2

Unnamed: 0,0,a,b,d
0,,5,1,
1,,2,4,
2,,-2,-15,



### dtypes

In [862]:
print(df.dtypes)
print(df2.dtypes) 
# NaN is a floating-point value, 
# hence the Series embedding it gets its dtype upcasted to float (if it were an int)
# this pd.Series supports fast operations

a    int64
b    int64
0    int64
d    int64
dtype: object
0    float64
a      int64
b      int64
d    float64
dtype: object


In [863]:
%timeit np.arange(1E6, dtype="int").sum()

1.14 ms ± 142 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [864]:
%timeit np.arange(1E6, dtype="object").sum()

77.8 ms ± 4.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [865]:
%timeit np.arange(1E6, dtype="float").sum()

1.25 ms ± 17.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Managing missing values

In [866]:
pd.Series([2, np.nan]).isnull()

0    False
1     True
dtype: bool

In [878]:
df2.iloc[0,2] = np.nan

In [879]:
# Render both objects explicitly: by default a cell only shows its last expression.
display(df2)           # the frame itself
display(df2.isnull())  # boolean mask, True wherever a value is missing

Unnamed: 0,0,a,b,d
0,,5,,
1,,2,4.0,
2,,-2,-15.0,


Unnamed: 0,0,a,b,d
0,True,False,True,True
1,True,False,False,True
2,True,False,False,True


In [880]:
pd.Series([2, np.nan]).dropna()

0    2.0
dtype: float64

In [881]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [891]:
df2.dropna(axis=1) # drop every column that contains at least one NA value
df2.dropna(axis=0) # drop every row that contains at least one NA value
df2.dropna(axis=1, how="all") # drop a column only when all of its values are NA
df2.dropna(axis=1, thresh=3) # keep only the columns that have at least 3 non-NA values

Unnamed: 0,a
0,5
1,2
2,-2


Unnamed: 0,0,a,b,d


Unnamed: 0,a,b
0,5,
1,2,4.0
2,-2,-15.0


Unnamed: 0,a
0,5
1,2
2,-2


In [893]:
df2                  # original frame, for reference
df2.fillna(value=2)  # replace every NaN with the constant 2
# `fillna(method=...)` is deprecated; `bfill` is the dedicated back-fill method.
df2.bfill()          # fill each NaN with the next valid value below it in the same column
df2.bfill(axis=1)    # fill each NaN with the next valid value to its right in the same row

Unnamed: 0,0,a,b,d
0,,5,,
1,,2,4.0,
2,,-2,-15.0,


Unnamed: 0,0,a,b,d
0,2.0,5,2.0,2.0
1,2.0,2,4.0,2.0
2,2.0,-2,-15.0,2.0


Unnamed: 0,0,a,b,d
0,,5,4.0,
1,,2,4.0,
2,,-2,-15.0,


Unnamed: 0,0,a,b,d
0,5.0,5.0,,
1,2.0,2.0,4.0,
2,-2.0,-2.0,-15.0,
