In [1]:
import numpy as np
import pandas as pd 

**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**

In [2]:
# Wrap a 1-D NumPy range in a Series; pandas attaches a default 0..n-1 index.
pd.Series(np.arange(12))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
dtype: int32

In [3]:
# A 2-D array becomes a DataFrame; with no labels given, rows and columns are numbered from 0.
pd.DataFrame(np.arange(12).reshape(4,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [4]:
# Same 4x3 data, now with explicit column labels A, B, C.
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'))

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [5]:
# Same data again, with both column labels (A-C) and row labels (e-h).
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'),index=list('efgh'))

Unnamed: 0,A,B,C
e,0,1,2
f,3,4,5
g,6,7,8
h,9,10,11


In [6]:
# Not executed here: pd.date_range("20130101", periods=6) builds the six-day date index used in the "Setting data" section.

In [7]:
# Build a small frame in which every column has a different dtype.
# Scalars (A, B, F) are broadcast to the length of the array-like columns.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
        G=pd.Series(np.arange(4)),
    )
)
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [8]:
# One dtype per column; the result is itself a Series indexed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int32
dtype: object

In [9]:
# First two rows.
df2.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [10]:
# Last two rows.
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [11]:
# Row labels of the frame.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [12]:
# Column labels of the frame.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [13]:
# A NumPy array has a single dtype, so a frame with mixed column dtypes has to
# be converted to one common dtype (object in the output below). That
# conversion is what makes this call expensive.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

In [14]:
# Summary statistics per column; the recorded output covers A, C, D and G only.
df2.describe()

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [15]:
# axis=0 sorts by the row labels; ascending=False puts the largest label first.
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


In [16]:
# axis=1 sorts by the column labels instead; ascending=False gives G..A.
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


In [17]:
# Sort rows by the values in column B. Every B value is identical here, so the
# row order in the output is unchanged.
df2.sort_values(by="B")

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


### Selection

In [18]:
# pandas provides optimized data access methods: .at / .iat for single scalars, and .loc / .iloc for label-based / position-based selection.

In [19]:
# Plain [] with a slice selects rows by position (the end position is excluded).
# [] itself does not accept NumPy-style [rows, columns] indexing; use
# .loc (labels) or .iloc (positions) for two-dimensional selection.
df2[1:3]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


In [20]:
df2.loc[[2,0,3]] # a list of row labels returns those rows in the order requested

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [21]:
df2.loc[1:3,["C","D"]] # label-based slice: both endpoints (1 and 3) are included; columns chosen by label

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


In [22]:
df2.iloc[1:3, 2:5] # position-based slices: the end position is excluded (rows 1-2, columns 2-4)

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [25]:
# Lists of integer positions pick specific rows and specific columns.
df2.iloc[[1, 2, 3], [0, 2]]

Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [29]:
df2.iloc[1,1] # a (row position, column position) pair returns a single scalar value

Timestamp('2013-01-02 00:00:00')

In [31]:
df2.iat[1, 1] # fast scalar accessor by position; returns the same value as df2.iloc[1, 1]

Timestamp('2013-01-02 00:00:00')

### Boolean Indexing

In [32]:
# A boolean Series acts as a row mask: keep only rows where column A is positive.
df2[df2["A"]>0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [39]:
df2[df2[["A","C"]]>0.0] # a boolean DataFrame mask keeps the frame's shape; cells not covered by a True value come back as missing (NaN/NaT)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,NaT,1.0,,,,
1,1.0,NaT,1.0,,,,
2,1.0,NaT,1.0,,,,
3,1.0,NaT,1.0,,,,


In [49]:
# 6x4 frame of standard-normal draws plus a string column E to filter on.
df = pd.DataFrame(np.random.randn(6, 4), columns=list("ABCD")).assign(
    E=list("ABABCB")
)
df

Unnamed: 0,A,B,C,D,E
0,-0.264296,0.340047,-0.752084,0.817208,A
1,-0.948043,-0.770316,1.846461,-0.673723,B
2,0.317445,0.827732,1.402679,0.936912,A
3,0.076284,1.130111,0.197522,0.53121,B
4,-1.679902,0.813568,0.975194,0.400388,C
5,-0.710278,-0.354388,-1.066032,0.986649,B


In [50]:
df[df["E"].isin(["A","C"])] # isin() builds a boolean mask: keep rows whose E value is "A" or "C"

Unnamed: 0,A,B,C,D,E
0,-0.264296,0.340047,-0.752084,0.817208,A
2,0.317445,0.827732,1.402679,0.936912,A
4,-1.679902,0.813568,0.975194,0.400388,C


### Setting data

In [51]:
# Six consecutive daily timestamps starting 2013-01-01, used as row labels.
dates = pd.date_range(start="20130101", periods=6)

# One row of four standard-normal draws per date.
df = pd.DataFrame(
    data=np.random.randn(len(dates), 4),
    columns=["A", "B", "C", "D"],
    index=dates,
)

# Mixed-dtype frame (columns A-F); scalars are broadcast to four rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)


In [52]:
# Keep the first four dates and add a new column E; reindex fills the cells it
# has no data for with missing values, as the output shows for E.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.661457,1.37866,-1.340781,0.135988,
2013-01-02,3.227292,-0.788488,-1.792152,-0.373822,
2013-01-03,-1.392239,1.66613,-0.700029,0.518276,
2013-01-04,0.905377,2.818927,-0.540818,0.248837,


In [53]:
# Label slices are inclusive, so both 2013-01-01 and 2013-01-02 get E = 1.
# This assignment modifies df1 in place.
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.661457,1.37866,-1.340781,0.135988,1.0
2013-01-02,3.227292,-0.788488,-1.792152,-0.373822,1.0
2013-01-03,-1.392239,1.66613,-0.700029,0.518276,
2013-01-04,0.905377,2.818927,-0.540818,0.248837,


In [54]:
# Drop every row that has at least one missing value; df1 itself is not modified.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.661457,1.37866,-1.340781,0.135988,1.0
2013-01-02,3.227292,-0.788488,-1.792152,-0.373822,1.0


In [55]:
# Return a new frame with missing values replaced by 5; df1 itself is not modified.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.661457,1.37866,-1.340781,0.135988,1.0
2013-01-02,3.227292,-0.788488,-1.792152,-0.373822,1.0
2013-01-03,-1.392239,1.66613,-0.700029,0.518276,5.0
2013-01-04,0.905377,2.818927,-0.540818,0.248837,5.0


In [56]:
# Boolean mask with the same shape as df1: True where a value is missing.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


### Operations

In [81]:
# 4x6 frame of random integers from 1 to 9 (the upper bound 10 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(1, 10, size=(4, 6)),
    columns=list("ABCDEF"),
)
df

Unnamed: 0,A,B,C,D,E,F
0,4,6,4,1,7,7
1,7,8,6,3,4,4
2,4,2,7,2,3,8
3,7,6,4,7,2,3


In [82]:
df.mean() # mean of each column; the result is a Series indexed by column label

A    5.50
B    5.50
C    5.25
D    3.25
E    4.00
F    5.50
dtype: float64

In [83]:
# axis=1 averages across the columns, giving one mean per row.
df.mean(axis=1)

0    4.833333
1    5.333333
2    4.333333
3    4.833333
dtype: float64

In [76]:
# NOTE(review): `random` is not used in this cell; confirm whether a later
# cell needs it, and if so move the import to the top import cell.
import random

# Ten employees: sequential ids 100-109, a random department, and a random
# salary in [1000, 10000). The dict values are evaluated top to bottom, so the
# department draw happens before the salary draw.
emp = pd.DataFrame(
    {
        "id": np.arange(100, 110),
        "dept": np.random.choice(["HR", "FIN", "MKT", "IT"], size=10),
        "sal": np.random.randint(1000, 10000, size=10),
    }
)
emp

Unnamed: 0,id,dept,sal
0,100,MKT,7142
1,101,MKT,7204
2,102,MKT,8247
3,103,MKT,6149
4,104,MKT,8899
5,105,IT,6996
6,106,MKT,5081
7,107,HR,1694
8,108,MKT,8596
9,109,HR,3573
