In [1]:
import numpy as np
import pandas as pd 

**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**

In [4]:
# A Series built from a 1-D NumPy array gets a default integer index (0..11)
# and one dtype shared by all of its values.
# NOTE(review): the int32 shown below is presumably NumPy's platform default
# integer on the machine that produced this output; other platforms may
# show int64 — confirm.
pd.Series(np.arange(12))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
dtype: int32

In [6]:
# With no labels supplied, the 4x3 array becomes a frame whose rows and
# columns are both labelled with default integers.
pd.DataFrame(
    np.arange(12).reshape(4, 3),
)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [7]:
# Column labels are supplied explicitly; the rows keep the default integer index.
pd.DataFrame(
    np.arange(12).reshape(4, 3),
    columns=list("ABC"),
)

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [8]:
# Both axes are labelled explicitly: columns A-C and rows e-h.
pd.DataFrame(
    np.arange(12).reshape(4, 3),
    columns=list("ABC"),
    index=list("efgh"),
)

Unnamed: 0,A,B,C
e,0,1,2
f,3,4,5
g,6,7,8
h,9,10,11


In [9]:
# Example (not run here): pd.date_range("20130101", periods=6) builds a six-entry DatetimeIndex.

In [20]:
# One frame, seven columns, each with its own dtype.  The scalar entries
# ("A", "B", "F") are repeated for every row of the four-row index that the
# Series/array/Categorical columns provide.
df2 = pd.DataFrame(
    {
        "A": 1.0,  # Python float scalar -> float64 column
        "B": pd.Timestamp("20130102"),  # timestamp scalar -> datetime column
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",  # string scalar
        # The dtype is pinned so that column G is int32 on every platform
        # (matching the recorded output) instead of following NumPy's
        # platform-dependent default integer.
        "G": pd.Series(np.arange(4, dtype="int32")),
    }
)
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [21]:
# Each column of the frame reports its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int32
dtype: object

In [23]:
# Show only the first two rows.
df2.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [24]:
# Show only the last two rows.
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [26]:
# Row labels of the frame.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [27]:
# Column labels of the frame.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [30]:
# to_numpy() returns one array with a single dtype, so the mixed column
# dtypes of df2 are up-cast to object (see dtype=object in the output);
# that conversion is what makes this call expensive.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

In [31]:
# Summary statistics; only the numeric columns (A, C, D, G) appear in the
# result shown below.
df2.describe()

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [33]:
# axis=0 sorts by the row index labels (descending here, since ascending=False).
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


In [34]:
# axis=1 sorts by the column labels (descending here, since ascending=False).
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


In [35]:
# Sort the rows by the values in column "B".  Every row holds the same
# timestamp in B, so the displayed order is unchanged.
df2.sort_values(by="B")

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


### Selection

In [36]:
# pandas' optimized data-access methods are .at, .iat, .loc and .iloc.

In [42]:
# Plain [] on a DataFrame does not accept NumPy-style multi-dimensional
# indexing such as df2[1:3, 2:5]; a slice inside [] selects rows only.
# Use .loc / .iloc to select rows and columns together.
df2[1:3]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


In [47]:
# .loc with a list of index labels returns those rows in the order requested.
df2.loc[[2,0,3]]

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [49]:
# .loc is label-based: the row slice 1:3 includes both end labels (rows 1, 2
# and 3 in the output), and the columns are picked by name.
df2.loc[1:3,["C","D"]]

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


In [51]:
# .iloc is positional: row positions 1-2 and column positions 2-4 (C, D, E);
# the end of each slice is excluded.
df2.iloc[1:3, 2:5]

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [54]:
# .iloc also accepts lists of integer positions: rows 1, 2, 3 and
# columns 0 and 2 (A and C in the output).
df2.iloc[[1, 2, 3], [0, 2]]

Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
