# 10 Minutes to pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

In [2]:
# Create a Series by passing a list of values, letting pandas create a default integer index:
s = pd.Series([1,3,5,np.nan,6,8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Build a DatetimeIndex of six consecutive days starting 2013-01-01; it is used
# below as the row index of a DataFrame created from a NumPy array with labeled columns:
dates = pd.date_range('20130101', periods=6)

In [5]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Fill a 6x4 frame with standard-normal draws, indexed by `dates`, with columns A-D.
# NOTE(review): no random seed is set in the cells above, so a re-run will not
# reproduce the numbers shown in the recorded outputs — consider seeding NumPy first.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-03,0.084235,-1.020284,-0.968255,1.015849
2013-01-04,0.740418,1.067682,1.830481,0.176171
2013-01-05,0.037257,-2.007993,0.426792,-1.108372
2013-01-06,0.252073,-0.384184,-0.416389,0.627039


In [8]:
# Create a DataFrame by passing a dict of objects that can be converted to
# series-like columns. The scalar entries (A, B, F) are broadcast to the length
# of the array-like entries, giving four rows.
df2 = pd.DataFrame(
    {
        'A': 1.,                                                    # float scalar
        'B': pd.Timestamp('20130102'),                              # datetime scalar
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),   # float32 Series
        'D': np.array([3] * 4, dtype='int32'),                      # int32 array
        'E': pd.Categorical(["test", "train", "test", "train"]),    # categorical
        'F': 'foo',                                                 # string scalar
    }
)

In [9]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
# The columns of the resulting DataFrame have different dtypes:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [11]:
# View the top rows of the frame (head() shows the first five by default):
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-03,0.084235,-1.020284,-0.968255,1.015849
2013-01-04,0.740418,1.067682,1.830481,0.176171
2013-01-05,0.037257,-2.007993,0.426792,-1.108372


In [12]:
# View the bottom three rows of the frame:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.740418,1.067682,1.830481,0.176171
2013-01-05,0.037257,-2.007993,0.426792,-1.108372
2013-01-06,0.252073,-0.384184,-0.416389,0.627039


In [13]:
# Quick per-column summary statistics (count, mean, std, min, quartiles, max):
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.475946,-0.621925,-0.461258,0.193625
std,0.726233,1.033999,1.433146,0.766404
min,-0.093317,-2.007993,-2.192206,-1.108372
25%,0.049001,-1.086334,-1.328043,-0.111758
50%,0.168154,-0.702234,-0.692322,0.401605
75%,0.618332,-0.30486,0.215997,0.650858
max,1.835012,1.067682,1.830481,1.015849


In [14]:
# Transposing the data (rows become columns and vice versa):
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.835012,-0.093317,0.084235,0.740418,0.037257,0.252073
B,-1.108351,-0.278418,-1.020284,1.067682,-2.007993,-0.384184
C,-2.192206,-1.447972,-0.968255,1.830481,0.426792,-0.416389
D,-0.207734,0.658798,1.015849,0.176171,-1.108372,0.627039


In [15]:
# Sorting by an axis: axis=1 orders the column labels, here in descending order (D, C, B, A):
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.207734,-2.192206,-1.108351,1.835012
2013-01-02,0.658798,-1.447972,-0.278418,-0.093317
2013-01-03,1.015849,-0.968255,-1.020284,0.084235
2013-01-04,0.176171,1.830481,1.067682,0.740418
2013-01-05,-1.108372,0.426792,-2.007993,0.037257
2013-01-06,0.627039,-0.416389,-0.384184,0.252073


In [16]:
# Sorting rows by the values in column B (ascending):
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,0.037257,-2.007993,0.426792,-1.108372
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734
2013-01-03,0.084235,-1.020284,-0.968255,1.015849
2013-01-06,0.252073,-0.384184,-0.416389,0.627039
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-04,0.740418,1.067682,1.830481,0.176171


## Selection

Note:

While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc and .iloc.

### Getting

In [17]:
# Selecting a single column, which yields a Series, equivalent to df.A:
df['A']

2013-01-01    1.835012
2013-01-02   -0.093317
2013-01-03    0.084235
2013-01-04    0.740418
2013-01-05    0.037257
2013-01-06    0.252073
Freq: D, Name: A, dtype: float64

In [18]:
# Selecting via [], which slices the rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-03,0.084235,-1.020284,-0.968255,1.015849


In [19]:
# Slicing rows via [] with index labels (date strings); both endpoints are included:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-03,0.084235,-1.020284,-0.968255,1.015849
2013-01-04,0.740418,1.067682,1.830481,0.176171


### Selection by Label

In [20]:
# Getting a cross section using a label: the row for the first date, returned as a Series:
df.loc[dates[0]]

A    1.835012
B   -1.108351
C   -2.192206
D   -0.207734
Name: 2013-01-01 00:00:00, dtype: float64

In [21]:
# Selecting on a multi-axis by label:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,1.835012,-1.108351
2013-01-02,-0.093317,-0.278418
2013-01-03,0.084235,-1.020284
2013-01-04,0.740418,1.067682
2013-01-05,0.037257,-2.007993
2013-01-06,0.252073,-0.384184


In [22]:
# Showing label slicing, both endpoints are included:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.093317,-0.278418
2013-01-03,0.084235,-1.020284
2013-01-04,0.740418,1.067682


In [23]:
# Reduction in the dimensions of the returned object:
df.loc['20130102',['A','B']]

A   -0.093317
B   -0.278418
Name: 2013-01-02 00:00:00, dtype: float64

In [24]:
# For getting a scalar value:
df.loc[dates[0],'A']

1.8350119442113282

In [25]:
# For getting fast access to a scalar (equivalent to the prior method):
df.at[dates[0],'A']

1.8350119442113282

### Selection by Position

In [26]:
# Select via the position of the passed integer: position 3 is the fourth row:
df.iloc[3]

A    0.740418
B    1.067682
C    1.830481
D    0.176171
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
# By integer slices, acting similar to numpy/python:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.740418,1.067682
2013-01-05,0.037257,-2.007993


In [28]:
# By lists of integer position locations, similar to the numpy/python style:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.093317,-1.447972
2013-01-03,0.084235,-0.968255
2013-01-05,0.037257,0.426792


In [29]:
# For slicing rows explicitly:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798
2013-01-03,0.084235,-1.020284,-0.968255,1.015849


In [30]:
# For slicing columns explicitly:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-1.108351,-2.192206
2013-01-02,-0.278418,-1.447972
2013-01-03,-1.020284,-0.968255
2013-01-04,1.067682,1.830481
2013-01-05,-2.007993,0.426792
2013-01-06,-0.384184,-0.416389


In [31]:
# For getting a value explicitly:
df.iloc[1,1]

-0.2784184604657575

In [32]:
# For getting fast access to a scalar (equivalent to the prior method):
df.iat[1,1]

-0.2784184604657575

### Boolean Indexing

In [33]:
# Using a single column’s values to select data.
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734
2013-01-03,0.084235,-1.020284,-0.968255,1.015849
2013-01-04,0.740418,1.067682,1.830481,0.176171
2013-01-05,0.037257,-2.007993,0.426792,-1.108372
2013-01-06,0.252073,-0.384184,-0.416389,0.627039


In [34]:
# Selecting values from a DataFrame where a boolean condition is met.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.835012,,,
2013-01-02,,,,0.658798
2013-01-03,0.084235,,,1.015849
2013-01-04,0.740418,1.067682,1.830481,0.176171
2013-01-05,0.037257,,0.426792,
2013-01-06,0.252073,,,0.627039


In [35]:
# Using the isin() method for filtering. Work on a copy so df itself is untouched.
# Note: this rebinds df2, replacing the frame built in the Object Creation section.
df2 = df.copy()

In [36]:
# Add a column E of string labels, one per row, to filter on:
df2['E'] = ['one', 'one','two','three','four','three']

In [37]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.835012,-1.108351,-2.192206,-0.207734,one
2013-01-02,-0.093317,-0.278418,-1.447972,0.658798,one
2013-01-03,0.084235,-1.020284,-0.968255,1.015849,two
2013-01-04,0.740418,1.067682,1.830481,0.176171,three
2013-01-05,0.037257,-2.007993,0.426792,-1.108372,four
2013-01-06,0.252073,-0.384184,-0.416389,0.627039,three


In [38]:
# Keep only the rows whose E value is 'two' or 'four':
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.084235,-1.020284,-0.968255,1.015849,two
2013-01-05,0.037257,-2.007993,0.426792,-1.108372,four


### Setting

In [39]:
# Build a Series indexed by six consecutive days starting 2013-01-02, i.e. shifted
# one day relative to df's index. Assigning it as a new column would align the
# values on the index labels.
# NOTE(review): no cell visible here actually assigns s1 to df (the frame shown
# below still has only columns A-D) — consider adding `df['F'] = s1` if the
# alignment demo is intended.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

In [40]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [41]:
# Setting values by label:
df.at[dates[0],'A'] = 0

In [42]:
# Setting values by position:
df.iat[0,1] = 0

In [43]:
# Setting a whole column by assigning a NumPy array.
# Plain column assignment is used instead of `df.loc[:, 'D'] = ...`: since
# pandas 2.0 the .loc form writes into the existing float64 column in place
# (D would then display as 5.0), whereas column assignment replaces the column
# and keeps the array's integer dtype on both old and new pandas versions.
df['D'] = np.array([5] * len(df))

In [44]:
# The result of the prior setting operations.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-2.192206,5
2013-01-02,-0.093317,-0.278418,-1.447972,5
2013-01-03,0.084235,-1.020284,-0.968255,5
2013-01-04,0.740418,1.067682,1.830481,5
2013-01-05,0.037257,-2.007993,0.426792,5
2013-01-06,0.252073,-0.384184,-0.416389,5


In [45]:
# A where operation with setting: on a copy of df, every cell selected by the
# boolean mask (values > 0) is overwritten with its negation; cells that are
# zero or negative are left as they are.
# Note: df2 is rebound here to a fresh copy of df.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-2.192206,-5
2013-01-02,-0.093317,-0.278418,-1.447972,-5
2013-01-03,-0.084235,-1.020284,-0.968255,-5
2013-01-04,-0.740418,-1.067682,-1.830481,-5
2013-01-05,-0.037257,-2.007993,-0.426792,-5
2013-01-06,-0.252073,-0.384184,-0.416389,-5
