In [2]:
import numpy as np
import pandas as pd
# Build a Series from a plain list; pandas supplies a default integer index
# and the result is float64 (see the output below).
s = pd.Series(data=[1, 2, 3, 5, 6, np.nan, 8])

In [4]:
s  # display the Series

0    1.0
1    2.0
2    3.0
3    5.0
4    6.0
5    NaN
6    8.0
dtype: float64

In [3]:
# Six consecutive daily timestamps starting on 2013-01-01; used as the row
# index of df.
dates = pd.date_range(start='20130101', periods=6, freq='D')

In [4]:
dates  # display the DatetimeIndex

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Draw the 6x4 standard-normal sample from a seeded generator so that
# Restart & Run All reproduces the same frame (the unseeded call produced
# different values on every run).
# NOTE(review): outputs stored in this notebook came from an unseeded run and
# will differ from the seeded values until the notebook is re-executed.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=list('ABCD'))

In [6]:
df  # display the frame: rows labelled by dates, columns A-D

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,-0.507431,-0.364324
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559
2013-01-06,-1.032612,-0.548388,1.006054,0.430436


In [7]:
# Build a DataFrame from a dict whose values are scalars or series-like
# objects; each column keeps its own dtype (see df2.dtypes further down).
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

Creating a DataFrame by passing a dict of objects that can be converted to a series-like structure:

In [8]:
df2  # display the frame: the scalar entries are repeated on all four rows

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes  # one dtype per column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [10]:
df2.A  # attribute-style access to column 'A'

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [11]:
df.head()  # Top rows of the frame (five are shown in the output)

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,-0.507431,-0.364324
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559


In [12]:
df.tail(3)  # Bottom three rows of the frame

Unnamed: 0,A,B,C,D
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559
2013-01-06,-1.032612,-0.548388,1.006054,0.430436


In [13]:
df.index  # row labels: the DatetimeIndex passed in as `dates`

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns  # column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
df.columns  # NOTE(review): identical to the previous cell; likely a leftover duplicate

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() gives a NumPy representation of the underlying data. Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.


In [16]:
df.to_numpy()  # NumPy representation of the values; the output is a 6x4 float array

array([[ 0.54435502,  1.23608052, -0.50743124, -0.3643243 ],
       [-0.32816598,  0.32483506, -1.21785975, -1.2766815 ],
       [ 0.01211701,  0.36430778, -0.98927537,  0.94455869],
       [ 0.90987968,  0.07952908,  1.57432263,  0.05203657],
       [-0.42692894,  0.0834763 , -1.66743125, -0.18558964],
       [-1.03261202, -0.54838778,  1.0060541 ,  0.43043599]])

For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is relatively expensive.

In [17]:
df2.to_numpy()  # mixed column dtypes: the output array has dtype=object

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

DataFrame.to_numpy() does not include the index or column labels in the output.

In [18]:
df.describe()  # per-column summary: count, mean, std, min, quartiles and max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.053559,0.25664,-0.30027,-0.066594
std,0.70191,0.580512,1.299948,0.75507
min,-1.032612,-0.548388,-1.667431,-1.276681
25%,-0.402238,0.080516,-1.160714,-0.319641
50%,-0.158024,0.204156,-0.748353,-0.066777
75%,0.411296,0.35444,0.627683,0.335836
max,0.90988,1.236081,1.574323,0.944559


In [19]:
df.T  # Transposing: the dates become columns and A-D become the row labels

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.544355,-0.328166,0.012117,0.90988,-0.426929,-1.032612
B,1.236081,0.324835,0.364308,0.079529,0.083476,-0.548388
C,-0.507431,-1.21786,-0.989275,1.574323,-1.667431,1.006054
D,-0.364324,-1.276681,0.944559,0.052037,-0.18559,0.430436


In [20]:
df.sort_index(axis=1, ascending=False)  # Sorting by axis: axis=1 orders the column labels, here descending (D, C, B, A)

Unnamed: 0,D,C,B,A
2013-01-01,-0.364324,-0.507431,1.236081,0.544355
2013-01-02,-1.276681,-1.21786,0.324835,-0.328166
2013-01-03,0.944559,-0.989275,0.364308,0.012117
2013-01-04,0.052037,1.574323,0.079529,0.90988
2013-01-05,-0.18559,-1.667431,0.083476,-0.426929
2013-01-06,0.430436,1.006054,-0.548388,-1.032612


In [21]:
df.sort_values(by='B')  # Sorting by values: rows ordered by column B, smallest first

Unnamed: 0,A,B,C,D
2013-01-06,-1.032612,-0.548388,1.006054,0.430436
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-01,0.544355,1.236081,-0.507431,-0.364324


Selection

In [22]:
df['A']  # selecting a single column yields a Series named after that column

2013-01-01    0.544355
2013-01-02   -0.328166
2013-01-03    0.012117
2013-01-04    0.909880
2013-01-05   -0.426929
2013-01-06   -1.032612
Freq: D, Name: A, dtype: float64

In [23]:
df[0:3]  # Selecting via [] with an integer slice slices the rows (the first three here)

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,-0.507431,-0.364324
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559


In [24]:
df['20130102':'20130104']  # [] with a label slice also selects rows; both endpoints appear in the output

Unnamed: 0,A,B,C,D
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037


In [25]:
df.loc[dates[0]]  # Cross section by label: the row for dates[0], returned as a Series over the columns

A    0.544355
B    1.236081
C   -0.507431
D   -0.364324
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
df.loc[:, ['A', 'B']]  # Selecting on both axes by label: every row, columns A and B

Unnamed: 0,A,B
2013-01-01,0.544355,1.236081
2013-01-02,-0.328166,0.324835
2013-01-03,0.012117,0.364308
2013-01-04,0.90988,0.079529
2013-01-05,-0.426929,0.083476
2013-01-06,-1.032612,-0.548388


In [27]:
df.loc['20130102':'20130104', ['A', 'B']]  # Label slicing: both endpoints are included (01-02 through 01-04)

Unnamed: 0,A,B
2013-01-02,-0.328166,0.324835
2013-01-03,0.012117,0.364308
2013-01-04,0.90988,0.079529


In [28]:
df.loc['20130102', ['A', 'B']]  # Reduction in dimensions: a single row label returns a Series instead of a DataFrame

A   -0.328166
B    0.324835
Name: 2013-01-02 00:00:00, dtype: float64

In [29]:
df.loc[dates[0], 'A']  # Getting a scalar value: one row label plus one column label

0.5443550199262591

In [30]:
df.at[dates[0],'A']  # Fast access to a scalar by label; same value as the .loc call in the previous cell

0.5443550199262591

In [31]:
df.iloc[3]  # Select by integer position: position 3 is the fourth row (2013-01-04)

A    0.909880
B    0.079529
C    1.574323
D    0.052037
Name: 2013-01-04 00:00:00, dtype: float64

In [35]:
df.iloc[3:5, 0:2]  # Integer slices act like numpy/python (end-exclusive): rows 3-4, columns 0-1

Unnamed: 0,A,B
2013-01-04,0.90988,0.079529
2013-01-05,-0.426929,0.083476


In [37]:
df.iloc[[1,2,4],[0,2]]  # Lists of integer positions, numpy/python style: rows 1, 2, 4 and columns 0, 2 (A and C)

Unnamed: 0,A,C
2013-01-02,-0.328166,-1.21786
2013-01-03,0.012117,-0.989275
2013-01-05,-0.426929,-1.667431


In [38]:
df.iloc[1:3,:]  # Slicing rows explicitly: rows 1-2, all columns

Unnamed: 0,A,B,C,D
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559


In [39]:
df.iloc[:,1:3]  # Slicing columns explicitly: all rows, columns 1-2 (B and C)

Unnamed: 0,B,C
2013-01-01,1.236081,-0.507431
2013-01-02,0.324835,-1.21786
2013-01-03,0.364308,-0.989275
2013-01-04,0.079529,1.574323
2013-01-05,0.083476,-1.667431
2013-01-06,-0.548388,1.006054


In [40]:
df  # full frame again, for reference before the scalar look-ups below

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,-0.507431,-0.364324
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559
2013-01-06,-1.032612,-0.548388,1.006054,0.430436


In [41]:
df.iloc[1,1]  # Getting a single value by position: row 1, column 1

0.32483506140357016

In [42]:
df.iat[1,1]  # Fast access to a scalar by position; same value as the .iloc call in the previous cell

0.32483506140357016

Boolean Indexing

In [43]:
df[df.A>0]  # use one column's values to select rows: keep the rows where A is positive

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,-0.507431,-0.364324
2013-01-03,0.012117,0.364308,-0.989275,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037


In [44]:
df[df>0]  # element-wise condition: entries that are not positive show up as missing in the output

Unnamed: 0,A,B,C,D
2013-01-01,0.544355,1.236081,,
2013-01-02,,0.324835,,
2013-01-03,0.012117,0.364308,,0.944559
2013-01-04,0.90988,0.079529,1.574323,0.052037
2013-01-05,,0.083476,,
2013-01-06,,,1.006054,0.430436


In [45]:
df2  # NOTE(review): df2 is not modified in the cells since its earlier display; this cell looks redundant

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [46]:
# Work on an independent copy so the extra column added next does not touch df.
df3 = df.copy(deep=True)

In [47]:
# Add a string column (one label per row) to filter on with isin().
df3['E'] = [
    'one',
    'one',
    'two',
    'three',
    'four',
    'three',
]

In [48]:
df3  # display the copy with its new column E

Unnamed: 0,A,B,C,D,E
2013-01-01,0.544355,1.236081,-0.507431,-0.364324,one
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681,one
2013-01-03,0.012117,0.364308,-0.989275,0.944559,two
2013-01-04,0.90988,0.079529,1.574323,0.052037,three
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559,four
2013-01-06,-1.032612,-0.548388,1.006054,0.430436,three


In [51]:
df3[df3['E'].isin(['two','four'])]  # Using the isin() method for filtering: keep rows whose E value is 'two' or 'four'

Unnamed: 0,A,B,C,D,E
2013-01-03,0.012117,0.364308,-0.989275,0.944559,two
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559,four


Setting

In [52]:
# Integers 1..6 indexed by six daily dates starting 2013-01-02, i.e. shifted
# one day relative to df's index.
s1 = pd.Series(range(1, 7), index=pd.date_range('20130102', periods=6))

In [53]:
s1  # indexed 2013-01-02..2013-01-07; setting it as a new column (next cell) automatically aligns the data by the indexes

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [54]:
df['F'] = s1  # Setting a new column automatically aligns the data by the indexes

In [55]:
df.at[dates[0], 'A'] = 0  # Setting a value by label: row dates[0], column A

In [56]:
df  # A is now 0 in the first row; F is empty for 2013-01-01 because s1's index starts on 2013-01-02

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,1.236081,-0.507431,-0.364324,
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681,1.0
2013-01-03,0.012117,0.364308,-0.989275,0.944559,2.0
2013-01-04,0.90988,0.079529,1.574323,0.052037,3.0
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559,4.0
2013-01-06,-1.032612,-0.548388,1.006054,0.430436,5.0


In [58]:
df.iat[0, 1] = 0  # Setting a value by position: first row, second column (B)

In [59]:
df  # B in the first row is now 0 as well

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.507431,-0.364324,
2013-01-02,-0.328166,0.324835,-1.21786,-1.276681,1.0
2013-01-03,0.012117,0.364308,-0.989275,0.944559,2.0
2013-01-04,0.90988,0.079529,1.574323,0.052037,3.0
2013-01-05,-0.426929,0.083476,-1.667431,-0.18559,4.0
2013-01-06,-1.032612,-0.548388,1.006054,0.430436,5.0


In [60]:
# Setting by assigning a NumPy array: one 5 per row replaces column D.
df.loc[:, 'D'] = np.full(len(df), 5)

In [61]:
df  # column D now holds 5 in every row

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.507431,5,
2013-01-02,-0.328166,0.324835,-1.21786,5,1.0
2013-01-03,0.012117,0.364308,-0.989275,5,2.0
2013-01-04,0.90988,0.079529,1.574323,5,3.0
2013-01-05,-0.426929,0.083476,-1.667431,5,4.0
2013-01-06,-1.032612,-0.548388,1.006054,5,5.0


In [62]:
# Copy df so the sign flip in the next cell leaves the original untouched.
df4 = df.copy(deep=True)

In [63]:
df4[df4 > 0] = -df4  # A where operation with setting: each positive entry is replaced by its negative

In [64]:
df4  # no positive values remain; zeros and the missing F entry are unchanged

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.507431,-5,
2013-01-02,-0.328166,-0.324835,-1.21786,-5,-1.0
2013-01-03,-0.012117,-0.364308,-0.989275,-5,-2.0
2013-01-04,-0.90988,-0.079529,-1.574323,-5,-3.0
2013-01-05,-0.426929,-0.083476,-1.667431,-5,-4.0
2013-01-06,-1.032612,-0.548388,-1.006054,-5,-5.0
