In [1]:
import numpy as np
import pandas as pd 

**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**

In [2]:
pd.Series(np.arange(12))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
dtype: int32

In [3]:
pd.DataFrame(np.arange(12).reshape(4,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [4]:
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'))

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [5]:
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'),index=list('efgh'))

Unnamed: 0,A,B,C
e,0,1,2
f,3,4,5
g,6,7,8
h,9,10,11


In [6]:
# pd.date_range("20130101", periods=6)

In [7]:
# Build a small frame whose columns deliberately carry different dtypes.
# The scalar entries ("A", "B", "F") are repeated for every row of the
# indexed columns.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
        G=pd.Series(np.arange(4)),
    )
)
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [8]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int32
dtype: object

In [9]:
df2.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [10]:
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [11]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [12]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [13]:
# to_numpy() has to return an array with a single dtype. df2 mixes floats,
# timestamps, categoricals and strings, so the result falls back to
# dtype=object -- that conversion is what makes this call expensive.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

In [14]:
df2.describe()

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [15]:
# axis="index" (the same as axis=0) sorts the rows by their index labels.
df2.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


In [16]:
# axis="columns" (the same as axis=1) sorts the columns by their labels.
df2.sort_index(axis="columns", ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


In [17]:
# Sort the rows by the values held in column "B".
df2.sort_values("B")

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


### Selection

In [18]:
# Optimized pandas data-access methods: .at, .iat, .loc and .iloc.

In [19]:
# Plain `[]` on a DataFrame is not NumPy-style multi-dimensional indexing:
# a slice inside `[]` selects rows (here the rows at positions 1 and 2),
# while a label inside `[]` selects a column. To pick rows and columns in
# one step, use .loc (labels) or .iloc (positions) as in the cells below.
df2[1:3]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


In [20]:
df2.loc[[2,0,3]] # fetching rows via indexes

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [21]:
df2.loc[1:3,["C","D"]] # fetching rows via index slice and column labels

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


In [22]:
df2.iloc[1:3, 2:5] # fetching rows via index slice and column position slice

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [23]:
df2.iloc[[1, 2, 3], [0, 2]]

Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [24]:
df2.iloc[1,1] # for getting values explicitly

Timestamp('2013-01-02 00:00:00')

In [25]:
df2.iat[1, 1] # Faster method to get scalar, similar to above iloc method

Timestamp('2013-01-02 00:00:00')

### Boolean Indexing

In [26]:
df2[df2["A"]>0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [27]:
df2[df2[["A","C"]]>0.0] # Selecting values from a DF where a boolean condition is met

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,NaT,1.0,,,,
1,1.0,NaT,1.0,,,,
2,1.0,NaT,1.0,,,,
3,1.0,NaT,1.0,,,,


In [28]:
# Six rows of random normal values in columns A-D, plus a label column "E"
# that the isin() filter in the next cell selects on.
df = pd.DataFrame(np.random.randn(6, 4), columns=["A", "B", "C", "D"])
df["E"] = ["A", "B", "A", "B", "C", "B"]
df

Unnamed: 0,A,B,C,D,E
0,-1.034123,0.570005,0.033957,0.486585,A
1,0.94003,3.433614,2.721074,-0.558165,B
2,-0.42764,-0.54578,1.013947,0.170706,A
3,-0.347618,-0.795466,0.033586,0.713436,B
4,-0.81325,-0.837781,0.81569,0.510607,C
5,0.157228,0.295347,0.303278,0.955162,B


In [29]:
df[df["E"].isin(["A","C"])] # isin for filtering in pandas

Unnamed: 0,A,B,C,D,E
0,-1.034123,0.570005,0.033957,0.486585,A
2,-0.42764,-0.54578,1.013947,0.170706,A
4,-0.81325,-0.837781,0.81569,0.510607,C


### Setting data

In [30]:
# Six consecutive daily timestamps starting 2013-01-01, used as the row index
# of a frame filled with random normal values.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(data=np.random.randn(6, 4), columns=list("ABCD"), index=dates)

# Rebuild df2 with columns A-F only (no "G" column this time).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)


In [31]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.032369,-1.166807,1.121107,0.49165,
2013-01-02,0.528902,1.675641,-1.025561,-1.619947,
2013-01-03,2.010107,-0.384627,0.438967,-1.314484,
2013-01-04,-0.53374,-0.833891,0.003388,-0.270227,


In [32]:
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.032369,-1.166807,1.121107,0.49165,1.0
2013-01-02,0.528902,1.675641,-1.025561,-1.619947,1.0
2013-01-03,2.010107,-0.384627,0.438967,-1.314484,
2013-01-04,-0.53374,-0.833891,0.003388,-0.270227,


In [33]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.032369,-1.166807,1.121107,0.49165,1.0
2013-01-02,0.528902,1.675641,-1.025561,-1.619947,1.0


In [34]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.032369,-1.166807,1.121107,0.49165,1.0
2013-01-02,0.528902,1.675641,-1.025561,-1.619947,1.0
2013-01-03,2.010107,-0.384627,0.438967,-1.314484,5.0
2013-01-04,-0.53374,-0.833891,0.003388,-0.270227,5.0


In [35]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


### Operations

In [36]:
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(4,6)),columns=list("ABCDEF"))
df

Unnamed: 0,A,B,C,D,E,F
0,8,9,7,3,9,6
1,4,9,1,4,1,3
2,1,4,7,3,1,9
3,9,7,7,6,4,5


In [37]:
df.mean() #mean of columns

A    5.50
B    7.25
C    5.50
D    4.00
E    3.75
F    5.75
dtype: float64

In [38]:
df.mean(1) # mean of rows, axis=1

0    7.000000
1    3.666667
2    4.166667
3    6.333333
dtype: float64

In [39]:
import random  # NOTE(review): not referenced by any visible cell

# Toy employee table: ten sequential ids, plus a randomly drawn department
# and a random integer salary in [1000, 10000) for each row.
emp = pd.DataFrame(
    {
        "id": np.arange(100, 110),
        "dept": np.random.choice(["HR", "FIN", "MKT", "IT"], size=(10,)),
        "sal": np.random.randint(low=1000, high=10000, size=(10,)),
    }
)
emp

Unnamed: 0,id,dept,sal
0,100,FIN,8175
1,101,HR,3473
2,102,HR,3694
3,103,MKT,4611
4,104,MKT,6812
5,105,MKT,4233
6,106,HR,6933
7,107,IT,4771
8,108,IT,2603
9,109,HR,4665


In [43]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### Shift and sub

In [58]:
s = pd.Series([1, 3, 5, np.nan, 6,8],index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [59]:
# DataFrame.sub(other) is the method form of `df - other`; unlike the operator
# it can take a fill_value to substitute for missing data in one of the
# inputs. rsub is the reversed version. Here `other` is the Series `s`, and
# axis=0 aligns it against the row labels of df.
# NOTE: df has the default integer index (0..3) while s is indexed by `dates`,
# so no row labels match and every cell of the result is NaN.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D,E,F
2013-01-01 00:00:00,,,,,,
2013-01-02 00:00:00,,,,,,
2013-01-03 00:00:00,,,,,,
2013-01-04 00:00:00,,,,,,
2013-01-05 00:00:00,,,,,,
2013-01-06 00:00:00,,,,,,
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,


### Apply

In [83]:
def avg(a):
    """Return the arithmetic mean of ``a`` as computed by ``np.mean``."""
    mean_value = np.mean(a)
    return mean_value

df = pd.DataFrame(np.arange(12).reshape(4,3))
df

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [84]:
#apply works on an entire row or column
df.apply(avg,axis=0)

0    4.5
1    5.5
2    6.5
dtype: float64

In [86]:
df.apply(lambda x:np.mean(x))

0    4.5
1    5.5
2    6.5
dtype: float64

In [88]:
# Each row/column is passed as pandas series
df.apply(lambda x:x.max()-x.min(),axis=1)

0    2
1    2
2    2
3    2
dtype: int64

In [101]:
def avg(a):
    """Print the type of ``a`` and return its mean as computed by ``np.mean``.

    The print makes visible what ``DataFrame.apply`` passes to the function.
    This definition replaces the plain ``avg`` defined earlier in the
    Apply section.
    """
    print(type(a))
    result = np.mean(a)
    return result

df = pd.DataFrame(np.arange(12).reshape(4,3),columns=list("abc"))
df.apply(avg)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


a    4.5
b    5.5
c    6.5
dtype: float64

### Applymap

In [102]:
# applymap works element-wise: the function is called once per cell value, not once per row or column

In [103]:
df = df.astype('str')

In [104]:
def MyApplyMapTest(x):
    """Print the type of ``x`` and return ``x * 2``.

    For the string cells used in this section, ``x * 2`` repeats the string
    (for example ``"10"`` becomes ``"1010"``).
    """
    print(type(x))
    doubled = x * 2
    return doubled
    
df.applymap(MyApplyMapTest)

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


Unnamed: 0,a,b,c
0,0,11,22
1,33,44,55
2,66,77,88
3,99,1010,1111


### Comparison with SQL

In [1]:
%reset -f

#### SELECT

In [2]:
import pandas as pd 
import numpy as np 

# Tips dataset from the pandas test-data directory. The URL uses the
# raw.githubusercontent.com host and the repository's `main` branch instead
# of the legacy raw.github.com host and the old `master` branch name.
url = (
    "https://raw.githubusercontent.com/pandas-dev"
    "/pandas/main/pandas/tests/io/data/csv/tips.csv"
)

tips = pd.read_csv(url)

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# selecting a subset of column with rowlimit
tips[["total_bill", "tip", "smoker", "time"]].head(5)

Unnamed: 0,total_bill,tip,smoker,time
0,16.99,1.01,No,Dinner
1,10.34,1.66,No,Dinner
2,21.01,3.5,No,Dinner
3,23.68,3.31,No,Dinner
4,24.59,3.61,No,Dinner


#### Calculated Column

In [4]:
# calculated column - without making change in the original data
tips.assign(tip_rate=tips["tip"] / tips["total_bill"]).head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [5]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


#### WHERE

In [6]:
tips[tips["time"] == "Dinner"].head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)].head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4


#### Null Checking

In [9]:
# Null checks are done with the notna() and isna() methods.

#### Group By

In [12]:
tips.groupby("time").size()

time
Dinner    176
Lunch      68
dtype: int64

In [13]:
tips.groupby("time").count()

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,176,176,176,176,176,176
Lunch,68,68,68,68,68,68


In [14]:
tips.groupby("time")['total_bill'].count()

time
Dinner    176
Lunch      68
Name: total_bill, dtype: int64

In [15]:
# SQL equivalent:
#   SELECT day, AVG(tip), COUNT(*)
#   FROM tips
#   GROUP BY day;
#
# Aggregations are given by name: passing NumPy callables such as np.mean or
# np.size to agg() is deprecated in recent pandas releases.
tips.groupby("day").agg({"tip": "mean", "day": "size"})

Unnamed: 0_level_0,tip,day
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.734737,19
Sat,2.993103,87
Sun,3.255132,76
Thur,2.771452,62


In [19]:
# Group size and mean tip for every (smoker, day) combination. Aggregations
# are given by name because passing np.size / np.mean to agg() is deprecated
# in recent pandas releases.
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2
No,Fri,4.0,2.8125
No,Sat,45.0,3.102889
No,Sun,57.0,3.167895
No,Thur,45.0,2.673778
Yes,Fri,15.0,2.714
Yes,Sat,42.0,2.875476
Yes,Sun,19.0,3.516842
Yes,Thur,17.0,3.03


#### JOIN

In [None]:
# JOINs can be performed with join() or merge()