# Analyse Pandas - Manipulation de données

## Fusion de dataframes

In [8]:
from pandas import DataFrame
import pandas as pd

In [15]:
# Left-hand frame for the merge examples: a repeated 'key' column plus a
# running counter in 'data1'.
df1 = DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})

In [16]:
# Echo df1 so the notebook renders it as the cell output.
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [18]:
# Right-hand frame for the merge examples: one row per key.
df2 = DataFrame({
    'key': list('abd'),
    'data2': range(3),
})
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [19]:
# Join df1 and df2 on their shared 'key' column. Only keys present in both
# frames survive (in the output below, 'c' and 'd' are gone).
pd.merge(df1,df2,on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [13]:
# Same data as df1/df2, but the join column has a different name on each
# side, so the key is given separately for the left and the right frame.
df3 = DataFrame({
    'key1': list('bbacaab'),
    'data1': range(7),
})
df4 = DataFrame({
    'key2': list('abd'),
    'data2': range(3),
})

pd.merge(left=df3, right=df4, left_on='key1', right_on='key2')


Unnamed: 0,data1,key1,data2,key2
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [20]:
# Outer join: keep the keys of both frames; cells without a match are NaN.
pd.merge(df1,df2,how='outer')


Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [21]:
# Left outer join: keep every row of df1; data2 is NaN where df2 has no
# matching key.
pd.merge(df1,df2,how='left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,1,b,1.0
2,2,a,0.0
3,3,c,
4,4,a,0.0
5,5,a,0.0
6,6,b,1.0


In [22]:
# Right outer join: keep every key of df2; data1 is NaN where df1 has no
# matching key.
pd.merge(df1,df2,how='right')

Unnamed: 0,data1,key,data2
0,0.0,b,1
1,1.0,b,1
2,6.0,b,1
3,2.0,a,0
4,4.0,a,0
5,5.0,a,0
6,,d,2


In [23]:
# Inner join: keep only the keys found in both frames.
pd.merge(df1,df2,how='inner')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [26]:
# Outer join on 'key' (how='outer', so this is not an inner join).
# The suffixes do not show up in the output: they are only applied to column
# names that clash, and apart from the join key the two frames share none.
pd.merge(df1,df2,on='key',how='outer',suffixes=('_left','_right'))

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


## Index

In [28]:
# Frame whose join key lives in an ordinary column.
left1 = DataFrame({
    'key': list('abaabc'),
    'value': range(6),
})

In [29]:
# Frame whose join key is its index ('a', 'b') rather than a column.
right1 = DataFrame(
    {'group_val': [3.5, 7]},
    index=list('ab'),
)

In [30]:
# Match left1's 'key' column against right1's index instead of a column.
pd.merge(left1,right1,left_on='key',right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [34]:
# Second frame indexed by 'a'/'b', used for the index-based join() examples.
left2 = DataFrame(
    {'group_val2': [8, 9]},
    index=list('ab'),
)

In [35]:
# join() lines the two frames up on their index labels ('a' and 'b' on both
# sides), so every row finds a partner.
left2.join(right1,how='outer')

Unnamed: 0,group_val2,group_val
a,8,3.5
b,9,7.0


In [36]:
# left2 is indexed by 'a'/'b' and left1 by 0..5, so no label matches: the
# outer join keeps every row of both frames and fills the other side with NaN.
left2.join(left1,how='outer')

Unnamed: 0,group_val2,key,value
a,8.0,,
b,9.0,,
0,,a,0.0
1,,b,1.0
2,,a,2.0
3,,a,3.0
4,,b,4.0
5,,c,5.0


## Concaténer le long d'un axe

In [42]:
import numpy as np
from pandas import Series

# The integers 0..11 laid out as 3 rows of 4.
arr = np.arange(12).reshape((3, 4))

In [40]:
# Glue two copies of arr side by side (axis=1): 3x4 becomes 3x8.
np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [44]:
# Three Series with disjoint index labels, reused by the concat examples.
s1 = Series([0, 1], index=list('ab'))
s2 = Series([2, 3, 4], index=list('cde'))
s3 = Series([5, 6], index=list('fg'))

In [46]:
# Without an axis argument the Series are chained end to end into one longer
# Series.
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [47]:
# axis=1: each Series becomes a column. The indexes are disjoint, so every
# row holds a single non-NaN value.
pd.concat([s1,s2,s3],axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [48]:
# s1 multiplied by 5, followed by s3; s4 therefore shares the labels 'a' and
# 'b' with s1.
s4 = pd.concat([s1*5,s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [49]:
# join='inner' keeps only the labels present in both Series ('a' and 'b').
pd.concat([s1,s4],axis=1,join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In [51]:
# keys= tags each input Series, producing a two-level (hierarchical) index.
result = pd.concat([s1,s2,s3],keys=['one','two','three'])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [52]:
# Pivot the inner index level into columns: one row per key, one column per
# original label.
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


In [53]:
# With axis=1 the keys become the column names instead of an outer index
# level.
result2 = pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
result2

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [54]:
# Passing a dict uses its keys as the outer level of the column index.
result3 = pd.concat({'level1':result2,'level2':result2},axis=1)
result3

Unnamed: 0_level_0,level1,level1,level1,level2,level2,level2
Unnamed: 0_level_1,one,two,three,one,two,three
a,0.0,,,0.0,,
b,1.0,,,1.0,,
c,,2.0,,,2.0,
d,,3.0,,,3.0,
e,,4.0,,,4.0,
f,,,5.0,,,5.0
g,,,6.0,,,6.0


In [55]:
# Same dict along axis=0: the keys become the outer level of the row index.
result4 = pd.concat({'level1':result2,'level2':result2},axis=0)
result4

Unnamed: 0,Unnamed: 1,one,two,three
level1,a,0.0,,
level1,b,1.0,,
level1,c,,2.0,
level1,d,,3.0,
level1,e,,4.0,
level1,f,,,5.0
level1,g,,,6.0
level2,a,0.0,,
level2,b,1.0,,
level2,c,,2.0,
level2,d,,3.0,
level2,e,,4.0,
level2,f,,,5.0
level2,g,,,6.0


In [56]:
# ignore_index=True throws away the labels along the concatenation axis
# (here the columns) and renumbers them 0..5.
result5 = pd.concat({'level1':result2,'level2':result2},axis=1,ignore_index=True)
result5

Unnamed: 0,0,1,2,3,4,5
a,0.0,,,0.0,,
b,1.0,,,1.0,,
c,,2.0,,,2.0,
d,,3.0,,,3.0,
e,,4.0,,,4.0,
f,,,5.0,,,5.0
g,,,6.0,,,6.0


In [57]:
# Selecting an outer column label returns the sub-frame stored under it.
result3['level1']

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [58]:
# A second lookup on that sub-frame picks one inner column as a Series.
result3['level1']['two']

a    NaN
b    NaN
c    2.0
d    3.0
e    4.0
f    NaN
g    NaN
Name: two, dtype: float64

In [63]:
# A row selected by label comes back as a Series indexed by the two-level
# column index.
result3.loc['b']

level1  one      1.0
        two      NaN
        three    NaN
level2  one      1.0
        two      NaN
        three    NaN
Name: b, dtype: float64

In [64]:
# Indexing that row by the outer label narrows it to the 'level1' entries.
result3.loc['b']['level1']

one      1.0
two      NaN
three    NaN
Name: b, dtype: float64

In [65]:
# Down to a single scalar: NaN, because row 'b' has no value under 'two'.
result3.loc['b']['level1']['two']

nan

## Combinaison avec superposition

In [66]:
# Two Series over the same labels whose gaps (NaN) complement each other.
s5 = Series([2, 3, 4, np.nan, np.nan], index=list('cdeab'))
s6 = Series([2, np.nan, np.nan, 3, 4], index=list('cdeab'))
# Element-wise: take s6 wherever s5 is missing, otherwise keep s5.
np.where(s5.isnull(), s6, s5)


array([2., 3., 4., 3., 4.])

In [67]:
# Take s6's values and fill its gaps (NaN) from s5, aligned on index labels.
# NOTE(review): the np.where call in the previous cell goes the other way
# (s5 filled from s6); the results coincide only because the two Series agree
# on 'c', the one label where both have a value.
s6.combine_first(s5)

c    2.0
d    3.0
e    4.0
a    3.0
b    4.0
dtype: float64

## Faire pivoter

In [68]:
# Re-display result3 as the starting point for the pivoting examples.
result3

Unnamed: 0_level_0,level1,level1,level1,level2,level2,level2
Unnamed: 0_level_1,one,two,three,one,two,three
a,0.0,,,0.0,,
b,1.0,,,1.0,,
c,,2.0,,,2.0,
d,,3.0,,,3.0,
e,,4.0,,,4.0,
f,,,5.0,,,5.0
g,,,6.0,,,6.0


In [69]:
# On this frame unstack() yields a Series: both column levels plus the row
# label end up in a three-level index.
result3.unstack()

level1  one    a    0.0
               b    1.0
               c    NaN
               d    NaN
               e    NaN
               f    NaN
               g    NaN
        two    a    NaN
               b    NaN
               c    2.0
               d    3.0
               e    4.0
               f    NaN
               g    NaN
        three  a    NaN
               b    NaN
               c    NaN
               d    NaN
               e    NaN
               f    5.0
               g    6.0
level2  one    a    0.0
               b    1.0
               c    NaN
               d    NaN
               e    NaN
               f    NaN
               g    NaN
        two    a    NaN
               b    NaN
               c    2.0
               d    3.0
               e    4.0
               f    NaN
               g    NaN
        three  a    NaN
               b    NaN
               c    NaN
               d    NaN
               e    NaN
               f    5.0
               g    6.0
dtype: float64

In [70]:
# stack() moves the inner column level ('one'/'two'/'three') into the row
# index; combinations that hold only NaN do not appear in the output.
result3.stack()

Unnamed: 0,Unnamed: 1,level1,level2
a,one,0.0,0.0
b,one,1.0,1.0
c,two,2.0,2.0
d,two,3.0,3.0
e,two,4.0,4.0
f,three,5.0,5.0
g,three,6.0,6.0


In [71]:
# unstack() undoes the stack(): the output has the same layout as result3.
result3.stack().unstack()

Unnamed: 0_level_0,level1,level1,level1,level2,level2,level2
Unnamed: 0_level_1,one,two,three,one,two,three
a,0.0,,,0.0,,
b,1.0,,,1.0,,
c,,2.0,,,2.0,
d,,3.0,,,3.0,
e,,4.0,,,4.0,
f,,,5.0,,,5.0
g,,,6.0,,,6.0


## Doublons

In [72]:
# Small frame that contains repeated (k1, k2) pairs on purpose.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [73]:
# Flag rows that repeat an earlier row; the first occurrence stays False.
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [74]:
# Keep only the first occurrence of each distinct row.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [75]:
# Add a column in which every value is different (modifies data in place).
data['v1']=range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [76]:
# With v1 included no two rows are identical any more, so nothing is dropped.
data.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [77]:
# Compare on 'k1' only: the first row of each k1 value is kept.
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [78]:
# Compare on the ('k1', 'k2') pair; v1 is ignored when looking for repeats.
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


## Fonction Map

In [80]:
# Translate each k1 value through the dict, then upper-case the result.
# NOTE(review): 'height' looks like a typo for 'eight' (one->four, two->eight,
# three->twelve). Left unchanged here because the recorded outputs show HEIGHT.
conversion = {'one':'four','two':'height','three':'twelve'}
data['k1'].map(conversion).map(str.upper)

0      FOUR
1      FOUR
2      FOUR
3    HEIGHT
4    HEIGHT
5    HEIGHT
6    HEIGHT
Name: k1, dtype: object

In [81]:
# Same output in a single pass: the lookup and the upper-casing both happen
# inside the lambda.
data['k1'].map(lambda x:str.upper(conversion[x]))

0      FOUR
1      FOUR
2      FOUR
3    HEIGHT
4    HEIGHT
5    HEIGHT
6    HEIGHT
Name: k1, dtype: object

In [82]:
# replace() looks at the whole frame: the value 2 becomes 7 in k2 and in v1.
# The result is a new frame; data still shows 2 when displayed again later.
data.replace(2,7)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,7,7
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [88]:
# Work on a copy so data keeps its own index, then double every index label.
data2 = data.copy()
data2.index = data.index.map(lambda x:2*x)
data2

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,1,1
4,one,2,2
6,two,3,3
8,two,3,4
10,two,4,5
12,two,4,6


## Discrétisation

In [90]:
# Sort each age into one of four brackets delimited by the bin edges.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [91]:
# Count how many ages fall into each bracket, most frequent first.
# The top-level pd.value_counts() helper is deprecated in recent pandas
# (2.1+); wrapping the Categorical in a Series gives the same counts.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [94]:
# Integer code of the bracket each age fell into (0 is the first category).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [102]:
# 20 draws from np.random.randn. No seed is set in this cell, so a re-run is
# not expected to reproduce the numbers recorded below.
datas=np.random.randn(20)
datas

array([ 1.02859887,  0.38337372, -0.97858704, -0.17156223,  0.2566229 ,
       -0.49343579,  0.24361064, -0.47982343,  0.34622008, -1.07014696,
       -0.64580754, -0.33422117,  0.6848343 ,  1.02272555, -1.22289608,
       -0.28368213,  0.60485535,  0.96196072,  1.97236198, -0.24094465])

In [104]:
# Cut into 4 quantile-based bins: the edges come from the data, so each bin
# receives the same number of points.
categ=pd.qcut(datas,4)
categ

[(0.625, 1.972], (0.036, 0.625], (-1.224, -0.483], (-0.483, 0.036], (0.036, 0.625], ..., (-0.483, 0.036], (0.036, 0.625], (0.625, 1.972], (0.625, 1.972], (-0.483, 0.036]]
Length: 20
Categories (4, interval[float64]): [(-1.224, -0.483] < (-0.483, 0.036] < (0.036, 0.625] < (0.625, 1.972]]

In [105]:
# Integer code of the quantile bin each value fell into.
categ.codes

array([3, 2, 0, 1, 2, 0, 2, 1, 2, 0, 0, 1, 3, 3, 0, 1, 2, 3, 3, 1],
      dtype=int8)

In [107]:
# 20 values spread over 4 quantile bins: 5 per bin.
categ.value_counts()

(-1.224, -0.483]    5
(-0.483, 0.036]     5
(0.036, 0.625]      5
(0.625, 1.972]      5
dtype: int64

## Filtrer

In [109]:
# Fix the seed before drawing so the random frame is reproducible, then build
# a 1000 x 4 frame of np.random.randn draws and summarise each column.
np.random.seed(12345)
datass = DataFrame(np.random.randn(1000,4))
datass.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [111]:
# Entries of column 3 whose absolute value exceeds 3.
col = datass[3]
col[np.abs(col) >3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [115]:
# Keep every row in which at least one column exceeds 3 in absolute value.
# `axis` is passed by keyword: pandas 2.x made the arguments of
# DataFrame.any() keyword-only, so the positional form `.any(1)` raises
# TypeError there.
datass[(np.abs(datass)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


## Permutations et échantillons

In [123]:
# A random ordering of the integers 0..4.
sampler = np.random.permutation(5)
sampler

array([1, 3, 4, 0, 2])

In [124]:
# 5 x 4 frame filled with 0..19, row by row.
df = DataFrame(np.arange(20).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [125]:
# Reorder the rows of df by position, following the order given by sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [127]:
# Sampling with replacement: draw 2*len(df) row positions from
# [0, len(df)); the same position may come up more than once.
samplers = np.random.randint(0,len(df),len(df)*2)

In [128]:
# Pick the sampled rows; a position drawn several times yields its row
# several times.
draws=df.take(samplers)

In [129]:
# Echo the sample so the notebook renders it.
draws

Unnamed: 0,0,1,2,3
4,16,17,18,19
4,16,17,18,19
2,8,9,10,11
2,8,9,10,11
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19
1,4,5,6,7


## Calcul de variables indicatrices (dummy variables)

In [130]:
# Re-display data (now carrying the v1 column) before building indicator
# columns from it.
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [132]:
# One indicator column per distinct value of k1.
pd.get_dummies(data['k1'])

Unnamed: 0,one,two
0,1,0
1,1,0
2,1,0
3,0,1
4,0,1
5,0,1
6,0,1


In [134]:
# prefix= prepends 'k1_' to the generated column names.
dummies = pd.get_dummies(data['k1'],prefix='k1')
dummies

Unnamed: 0,k1_one,k1_two
0,1,0
1,1,0
2,1,0
3,0,1
4,0,1
5,0,1
6,0,1


In [135]:
# Drop the first column (k1) by position and attach its indicator columns in
# its place; join() aligns the two frames on their shared row index.
df_with_dummy = data.iloc[:,1:].join(dummies)
df_with_dummy

Unnamed: 0,k2,v1,k1_one,k1_two
0,1,0,1,0
1,1,1,1,0
2,2,2,1,0
3,3,3,0,1
4,3,4,0,1
5,4,5,0,1
6,4,6,0,1


In [136]:
# 10 draws from np.random.rand (uniform over [0, 1)).
values = np.random.rand(10)
values


array([0.75603383, 0.90830844, 0.96588737, 0.17373658, 0.87592824,
       0.75415641, 0.163486  , 0.23784062, 0.85564381, 0.58743194])

In [137]:
# Bin the values into five intervals of width 0.2, then build one indicator
# column per interval. Note that this rebinds the name `bins` used earlier
# for the age brackets.
bins=[0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,1,0
1,0,0,0,0,1
2,0,0,0,0,1
3,1,0,0,0,0
4,0,0,0,0,1
5,0,0,0,1,0
6,1,0,0,0,0
7,0,1,0,0,0
8,0,0,0,0,1
9,0,0,1,0,0
