In [1]:
import numpy as np
import pandas as pd

In [2]:
# A Series can be built from a plain Python list; since no index is given,
# pandas labels the values with default integer positions.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
print("data ressemble à un tableau Numpy: ", data)

data ressemble à un tableau Numpy:  0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [3]:
# Index labels can be given by hand, which lets the Series be queried
# by label much like a dict.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print("data ressemble à un dict en Python: ", data)
# Look up a single value by its label.
print(data['b'])

data ressemble à un dict en Python:  a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5


In [6]:
# A Series can also be created directly from a dict: keys become the index
# labels and values become the data.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
area_dict = {'California': 423967,
             'Texas': 695662,
             'New York': 141297,
             'Florida': 170312,
             'Illinois': 149995}
# Sort the index explicitly. Older pandas releases sorted dict keys when
# building a Series, whereas recent ones keep the dict's insertion order.
# The label slices used further down ('California':'Illinois', :'Illinois')
# only select the intended rows when the index is in alphabetical order.
population = pd.Series(population_dict).sort_index()
area = pd.Series(area_dict).sort_index()
print(population)
print(area)

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64
California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64


In [7]:
# What do you make of this line?
# Slicing by label: the recorded output shows that the end label
# ('Illinois') is included in the result.
print(population.loc['California':'Illinois'])

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64


In [8]:
# From a single Series: its index becomes the row index and the
# `columns` argument names the one resulting column.
df = pd.DataFrame(data=population, columns=['population'])
print(df)

            population
California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193


In [9]:
# From a list of dicts: each dict is one row, its keys are the columns.
data = [dict(a=i, b=2 * i) for i in range(3)]
df = pd.DataFrame(data)
print(df)

   a  b
0  0  0
1  1  2
2  2  4


In [10]:
# From several Series: each one becomes a column, aligned on the
# shared index labels.
df = pd.DataFrame(dict(population=population, area=area))
print(df)

              area  population
California  423967    38332521
Florida     170312    19552860
Illinois    149995    12882135
New York    141297    19651127
Texas       695662    26448193


In [11]:
# From a 2-D NumPy array, with explicit row and column labels.
# The values are drawn from NumPy's global random state, so they differ
# between runs.
df = pd.DataFrame(
    data=np.random.rand(3, 2),
    index=list('abc'),
    columns=['foo', 'bar'],
)
print(df)

        foo       bar
a  0.347596  0.847463
b  0.776784  0.219600
c  0.736615  0.414514


In [12]:
# Helper that quickly generates small DataFrames; later cells in this
# notebook reuse it.
def make_df(cols, ind):
    """Quickly build a DataFrame whose cells name their own position.

    Args:
        cols: iterable of column labels (iterating a string yields one
            label per character).
        ind: iterable of row labels; it is traversed once per column and
            also used as the DataFrame index.

    Returns:
        A DataFrame indexed by ``ind`` where the cell at row ``i`` and
        column ``c`` holds the string ``str(c) + str(i)``.
    """
    cells = {}
    for col in cols:
        cells[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)
# Example: three columns A, B, C and three rows labelled 0, 1, 2.
make_df(cols='ABC', ind=range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [13]:
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print(data)
# .loc selects an element by its index label...
print(data.loc['b'])
# ...whereas .iloc selects it by integer position.
print(data.iloc[1])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
0.5


In [14]:
data = pd.DataFrame(dict(area=area, pop=population))
print(data)
# Label slices on both axes: rows up to and including 'Illinois',
# columns up to and including 'pop' (see the displayed result).
data.loc[:'Illinois', :'pop']

              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135
New York    141297  19651127
Texas       695662  26448193


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [15]:
# Two Series with disjoint integer labels...
ser1 = pd.Series(data=list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(data=list('DEF'), index=[4, 5, 6])
# ...stacked one after the other.
pd.concat(objs=[ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [16]:
# Two frames with the same columns but different row labels.
df1 = make_df(cols='AB', ind=[1, 2])
df2 = make_df(cols='AB', ind=[3, 4])
print(df1)
print(df2)
# Default concatenation stacks the rows.
pd.concat(objs=[df1, df2])

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [20]:
# Concatenate side by side; rows are aligned on their index labels.
pd.concat(objs=[df1, df2], axis='columns')

Unnamed: 0,A,B,A.1,B.1
1,A1,B1,,
2,A2,B2,,
3,,,A3,B3
4,,,A4,B4


In [23]:
# df3 is built with the same row labels [1, 2] that df1 was given,
# but with different columns.
df3 = make_df(cols='CD', ind=[1, 2])
pd.concat(objs=[df1, df3], axis='columns')

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [24]:
x = make_df(cols='AB', ind=[0, 1])
y = make_df(cols='AB', ind=[2, 3])
# Make both frames share the same row labels.
y.index = x.index
# The concatenated result now carries duplicated index labels.
print(pd.concat(objs=[x, y]))
# Passing `keys` builds a hierarchical index that tells the sources apart.
hdf = pd.concat(objs=[x, y], keys=['x', 'y'])
print(hdf)

    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3
      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [25]:
# Select one row of the hierarchical index by its full (outer, inner)
# label, keeping every column.
hdf.loc[('x', 1), :]

A    A1
B    B1
Name: (x, 1), dtype: object

In [29]:
# Two tables that share an 'employee' column.
df1 = pd.DataFrame(dict(
    employee=['Bob', 'Jake', 'Lisa', 'Sue'],
    department=['Accounting', 'Engineering', 'Engineering', 'HR'],
))
df2 = pd.DataFrame(dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue'],
    date=[2004, 2008, 2012, 2014],
))
print(df1)
print(df2)
# With no key given, the merge joins on the column name both frames share.
df3 = df1.merge(df2)
print(df3)

    department employee
0   Accounting      Bob
1  Engineering     Jake
2  Engineering     Lisa
3           HR      Sue
   date employee
0  2004     Lisa
1  2008      Bob
2  2012     Jake
3  2014      Sue
    department employee  date
0   Accounting      Bob  2008
1  Engineering     Jake  2012
2  Engineering     Lisa  2004
3           HR      Sue  2014


In [35]:
# Same data as df2, except that the key column is named 'emp_name'.
df4 = pd.DataFrame(dict(
    emp_name=['Lisa', 'Bob', 'Jake', 'Sue'],
    date=[2004, 2008, 2012, 2014],
))

# The key columns have different names, so each side's key is given explicitly.
df5 = df1.merge(df4, left_on="employee", right_on="emp_name")
print(df5)

    department employee  date emp_name
0   Accounting      Bob  2008      Bob
1  Engineering     Jake  2012     Jake
2  Engineering     Lisa  2004     Lisa
3           HR      Sue  2014      Sue


In [36]:
# One supervisor per department.
df6 = pd.DataFrame(dict(
    department=['Accounting', 'Engineering', 'HR'],
    supervisor=['Carly', 'Guido', 'Steve'],
))
# Joined on the shared 'department' column; a supervisor is repeated for
# every employee of the department.
df5.merge(df6)

Unnamed: 0,department,employee,date,emp_name,supervisor
0,Accounting,Bob,2008,Bob,Carly
1,Engineering,Jake,2012,Jake,Guido
2,Engineering,Lisa,2004,Lisa,Guido
3,HR,Sue,2014,Sue,Steve


In [38]:
# Two skills listed per department.
df7 = pd.DataFrame(dict(
    department=['Accounting', 'Accounting',
                'Engineering', 'Engineering',
                'HR', 'HR'],
    competence=['math', 'spreadsheets',
                'coding', 'linux',
                'spreadsheets', 'organization'],
))
print(df7)

     competence   department
0          math   Accounting
1  spreadsheets   Accounting
2        coding  Engineering
3         linux  Engineering
4  spreadsheets           HR
5  organization           HR


In [39]:
# 'department' has repeated values on both sides, so every matching
# pair of rows appears in the result.
df1.merge(df7)

Unnamed: 0,department,employee,competence
0,Accounting,Bob,math
1,Accounting,Bob,spreadsheets
2,Engineering,Jake,coding
3,Engineering,Jake,linux
4,Engineering,Lisa,coding
5,Engineering,Lisa,linux
6,HR,Sue,spreadsheets
7,HR,Sue,organization


In [40]:
# df8 contains an extra employee ('Lea') who has no row in df2.
df8 = pd.DataFrame(dict(
    employee=['Bob', 'Jake', 'Lisa', 'Sue', 'Lea'],
    department=['Accounting', 'Engineering', 'Engineering', 'HR', 'Engineering'],
))
df2 = pd.DataFrame(dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue'],
    date=[2004, 2008, 2012, 2014],
))

In [41]:
# Default join: only employees present in both frames are kept.
df8.merge(df2)

Unnamed: 0,department,employee,date
0,Accounting,Bob,2008
1,Engineering,Jake,2012
2,Engineering,Lisa,2004
3,HR,Sue,2014


In [42]:
# Left join: every row of df8 is kept; 'date' is missing where df2 has no match.
df8.merge(df2, how="left")

Unnamed: 0,department,employee,date
0,Accounting,Bob,2008.0
1,Engineering,Jake,2012.0
2,Engineering,Lisa,2004.0
3,HR,Sue,2014.0
4,Engineering,Lea,


In [43]:
# Right join: every row of df2 is kept.
df8.merge(df2, how="right")

Unnamed: 0,department,employee,date
0,Accounting,Bob,2008
1,Engineering,Jake,2012
2,Engineering,Lisa,2004
3,HR,Sue,2014


In [44]:
# Outer join: rows from both frames are kept, matched where possible.
df8.merge(df2, how="outer")

Unnamed: 0,department,employee,date
0,Accounting,Bob,2008.0
1,Engineering,Jake,2012.0
2,Engineering,Lisa,2004.0
3,HR,Sue,2014.0
4,Engineering,Lea,


In [45]:
# Add a new column to both df1 and df2 that always holds the same value
# (0 here). Note that this modifies the two frames in place.
for frame in (df1, df2):
    frame['key'] = 0
print(df1)
print(df2)

    department employee  key
0   Accounting      Bob    0
1  Engineering     Jake    0
2  Engineering     Lisa    0
3           HR      Sue    0
   date employee  key
0  2004     Lisa    0
1  2008      Bob    0
2  2012     Jake    0
3  2014      Sue    0


In [47]:
# Many-to-many join: every row carries the same 'key', so each row of df1
# is paired with each row of df2.
produit_cartesien = df1.merge(df2, on='key')
print(produit_cartesien)

     department employee_x  key  date employee_y
0    Accounting        Bob    0  2004       Lisa
1    Accounting        Bob    0  2008        Bob
2    Accounting        Bob    0  2012       Jake
3    Accounting        Bob    0  2014        Sue
4   Engineering       Jake    0  2004       Lisa
5   Engineering       Jake    0  2008        Bob
6   Engineering       Jake    0  2012       Jake
7   Engineering       Jake    0  2014        Sue
8   Engineering       Lisa    0  2004       Lisa
9   Engineering       Lisa    0  2008        Bob
10  Engineering       Lisa    0  2012       Jake
11  Engineering       Lisa    0  2014        Sue
12           HR        Sue    0  2004       Lisa
13           HR        Sue    0  2008        Bob
14           HR        Sue    0  2012       Jake
15           HR        Sue    0  2014        Sue


In [48]:
# Remove the 'key' column, which is no longer needed.
# `axis` is passed by keyword: recent pandas releases no longer accept it as
# a positional argument of drop(). The result is rebound rather than
# modified in place, matching the chained form used in the next cell.
produit_cartesien = produit_cartesien.drop('key', axis=1)
print(produit_cartesien)

     department employee_x  date employee_y
0    Accounting        Bob  2004       Lisa
1    Accounting        Bob  2008        Bob
2    Accounting        Bob  2012       Jake
3    Accounting        Bob  2014        Sue
4   Engineering       Jake  2004       Lisa
5   Engineering       Jake  2008        Bob
6   Engineering       Jake  2012       Jake
7   Engineering       Jake  2014        Sue
8   Engineering       Lisa  2004       Lisa
9   Engineering       Lisa  2008        Bob
10  Engineering       Lisa  2012       Jake
11  Engineering       Lisa  2014        Sue
12           HR        Sue  2004       Lisa
13           HR        Sue  2008        Bob
14           HR        Sue  2012       Jake
15           HR        Sue  2014        Sue


In [49]:
# The same cartesian product as a single chained expression: assign() works
# on copies carrying the constant key, and the key is dropped after the merge.
(
    df1.assign(key=0)
    .merge(df2.assign(key=0), on='key')
    .drop('key', axis=1)
)

Unnamed: 0,department,employee_x,date,employee_y
0,Accounting,Bob,2004,Lisa
1,Accounting,Bob,2008,Bob
2,Accounting,Bob,2012,Jake
3,Accounting,Bob,2014,Sue
4,Engineering,Jake,2004,Lisa
5,Engineering,Jake,2008,Bob
6,Engineering,Jake,2012,Jake
7,Engineering,Jake,2014,Sue
8,Engineering,Lisa,2004,Lisa
9,Engineering,Lisa,2008,Bob


In [52]:
# Seeded generator so that the random values are reproducible.
rng = np.random.RandomState(seed=42)
ser = pd.Series(data=rng.rand(5))
print(ser)

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64


In [53]:
# Sum and mean of the Series, printed one per line.
print(ser.sum(), ser.mean(), sep='\n')

2.811925491708157
0.5623850983416314


In [55]:
# Column 'A' is drawn before column 'B' from the shared generator.
df = pd.DataFrame({
    'A': rng.rand(5),
    'B': rng.rand(5),
})
print(df)
# Mean of each column
print(df.mean())
# Mean of each row
print(df.mean(axis=1))

          A         B
0  0.183405  0.611853
1  0.304242  0.139494
2  0.524756  0.292145
3  0.431945  0.366362
4  0.291229  0.456070
A    0.347115
B    0.373185
dtype: float64
0    0.397629
1    0.221868
2    0.408451
3    0.399153
4    0.373650
dtype: float64


In [56]:
# Small frame with a repeating 'key' column, used for the grouping examples.
df0 = pd.DataFrame(dict(
    key=list('ABCABC'),
    data1=range(6),
    data2=[10, 11, 10, 9, 10, 10],
))
print(df0)

   data1  data2 key
0      0     10   A
1      1     11   B
2      2     10   C
3      3      9   A
4      4     10   B
5      5     10   C


In [59]:
# Group the rows by the value of 'key'; the aggregations are applied to
# this object in the following cells.
gb = df0.groupby(by='key')

<pandas.core.groupby.DataFrameGroupBy object at 0x000001A289CDB358>


In [60]:
# Per-group sum, then per-group mean, of every data column.
print(gb.sum(), gb.mean(), sep='\n')

     data1  data2
key              
A        3     19
B        5     21
C        7     20
     data1  data2
key              
A      1.5    9.5
B      2.5   10.5
C      3.5   10.0


In [61]:
# Select columns from the grouped object with a list of names. Indexing
# with a bare tuple (gb['data1', 'data2']) is rejected by recent pandas
# releases; the list form works on old and new versions alike.
# s: per-group sums of data1 and data2.
s = gb[['data1', 'data2']].sum()
# m: per-group mean of data2; the one-element list keeps the result a
# DataFrame rather than a Series.
m = gb[['data2']].mean()
print(s)
print(m)

     data1  data2
key              
A        3     19
B        5     21
C        7     20
     data2
key       
A      9.5
B     10.5
C     10.0


In [63]:
# Put the sums and the mean side by side, aligned on the group key, then
# give the columns names that state which aggregate each one holds.
groupped = pd.concat(objs=[s, m], axis='columns')
groupped.columns = [
    "data1_somme",
    "data2_somme",
    "data2_moyenne",
]
print(groupped)

     data1_somme  data2_somme  data2_moyenne
key                                         
A              3           19            9.5
B              5           21           10.5
C              7           20           10.0


In [66]:
# Sum across the columns, giving one total per group.
print(groupped.sum(axis=1))

key
A    31.5
B    36.5
C    37.0
dtype: float64
