# Learning the fundamentals of pandas
#### Author: Alphonse Brandon


In [3]:
# Importing libraries and modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h4>Creating Objects in pandas


Creating a series and passing a list of values

In [4]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns

In [5]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274
2013-01-05,-1.513,1.700117,1.897296,1.648903
2013-01-06,0.754513,-0.582074,0.681169,0.69561


Creating a DataFrame and passing a dict of objects that can be converted into series like

In [7]:
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype = 'float32'),
    'D': np.array([3]* 4, dtype = 'int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
    
})

df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


Creating specific data types for each column of the dataframe

In [8]:
# displaying the current data type

df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

Seeing the top and bottom of the frame

In [15]:
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255


In [16]:
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,-1.513,1.700117,1.897296,1.648903
2013-01-06,0.754513,-0.582074,0.681169,0.69561


Displaying the index, columns and uderlying numpy array

In [17]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
df.values

array([[-0.241879  ,  0.75986765, -0.89072198,  0.62020239],
       [ 0.14582237,  0.51374285, -0.62804437,  0.15925495],
       [ 2.16232875, -0.09918239, -0.24950794, -1.26843839],
       [-0.07047014,  1.5743156 , -2.60898932, -1.81273986],
       [-1.51300012,  1.70011715,  1.89729598,  1.64890344],
       [ 0.75451321, -0.58207398,  0.68116917,  0.69560976]])

Describe() shows a brief statistical summary of the dataframe

In [21]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.206219,0.644464,-0.2998,0.007132
std,1.213565,0.901869,1.52193,1.30441
min,-1.513,-0.582074,-2.608989,-1.81274
25%,-0.199027,0.054049,-0.825053,-0.911515
50%,0.037676,0.636805,-0.438776,0.389729
75%,0.602341,1.370704,0.4485,0.676758
max,2.162329,1.700117,1.897296,1.648903


Transposing the data

In [22]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.241879,0.145822,2.162329,-0.07047,-1.513,0.754513
B,0.759868,0.513743,-0.099182,1.574316,1.700117,-0.582074
C,-0.890722,-0.628044,-0.249508,-2.608989,1.897296,0.681169
D,0.620202,0.159255,-1.268438,-1.81274,1.648903,0.69561


Sorting my data by an axis

In [23]:
df.sort_values(by = 'B')

Unnamed: 0,A,B,C,D
2013-01-06,0.754513,-0.582074,0.681169,0.69561
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274
2013-01-05,-1.513,1.700117,1.897296,1.648903


Selecting and Getting data

In [24]:
df['A']

2013-01-01   -0.241879
2013-01-02    0.145822
2013-01-03    2.162329
2013-01-04   -0.070470
2013-01-05   -1.513000
2013-01-06    0.754513
Freq: D, Name: A, dtype: float64

Selecting a range of rows

In [25]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438


In [26]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274


Selection by label

In [28]:
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2013-01-01,-0.241879,0.759868,-0.890722,0.620202


In [29]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.241879,0.759868
2013-01-02,0.145822,0.513743
2013-01-03,2.162329,-0.099182
2013-01-04,-0.07047,1.574316
2013-01-05,-1.513,1.700117
2013-01-06,0.754513,-0.582074


Label slicing

In [30]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.145822,0.513743
2013-01-03,2.162329,-0.099182
2013-01-04,-0.07047,1.574316


Reduction in the dimension of returned objects

In [33]:
df.loc['20130102', ['A', 'B']]

A    0.145822
B    0.513743
Name: 2013-01-02 00:00:00, dtype: float64

Getting a specific value

In [35]:
df.loc[dates[0], 'A']

-0.2418790034500852

A faster method to getting a specific value

In [36]:
df.at[dates[0], 'A']

-0.2418790034500852

Selection by position

In [37]:
df.iloc[3]

A   -0.070470
B    1.574316
C   -2.608989
D   -1.812740
Name: 2013-01-04 00:00:00, dtype: float64

Performing integer slicing

In [38]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.07047,1.574316
2013-01-05,-1.513,1.700117


Selecting a list of integer position locations

In [39]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,0.145822,-0.628044
2013-01-03,2.162329,-0.249508
2013-01-05,-1.513,1.897296


Slicing rows explicitly

In [40]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438


Slicing columns explicitly

In [41]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.759868,-0.890722
2013-01-02,0.513743,-0.628044
2013-01-03,-0.099182,-0.249508
2013-01-04,1.574316,-2.608989
2013-01-05,1.700117,1.897296
2013-01-06,-0.582074,0.681169


Getting an explicit value

In [42]:
df.iloc[2,2]

-0.2495079405106601

An equip to the above method is 

In [43]:
df.iat[2,2]

-0.2495079405106601

<h4>Boolean Indexing

Using a single columns value to select data

In [44]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.145822,0.513743,-0.628044,0.159255
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438
2013-01-06,0.754513,-0.582074,0.681169,0.69561


A 'Where' operation for getting

In [46]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.759868,,0.620202
2013-01-02,0.145822,0.513743,,0.159255
2013-01-03,2.162329,,,
2013-01-04,,1.574316,,
2013-01-05,,1.700117,1.897296,1.648903
2013-01-06,0.754513,,0.681169,0.69561


Filetering through the 'isin' method

In [47]:
df2 = df.copy()

df2['E'] = ['One','One', 'Two', 'Three', 'Four', 'Three']

df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.241879,0.759868,-0.890722,0.620202,One
2013-01-02,0.145822,0.513743,-0.628044,0.159255,One
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438,Two
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274,Three
2013-01-05,-1.513,1.700117,1.897296,1.648903,Four
2013-01-06,0.754513,-0.582074,0.681169,0.69561,Three


In [48]:
df2[df2['E'].isin(['Two', 'Four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438,Two
2013-01-05,-1.513,1.700117,1.897296,1.648903,Four


<h4> Setting

Automatically align the data through the indexes by setting a new column

In [52]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

s1

df['F'] = s1

s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

Setting value by labels

In [57]:
df.at[dates[0], 'A'] = 0

df.iat[0,1] = 0

df



Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.890722,0.620202,
2013-01-02,0.145822,0.513743,-0.628044,0.159255,1.0
2013-01-03,2.162329,-0.099182,-0.249508,-1.268438,2.0
2013-01-04,-0.07047,1.574316,-2.608989,-1.81274,3.0
2013-01-05,-1.513,1.700117,1.897296,1.648903,4.0
2013-01-06,0.754513,-0.582074,0.681169,0.69561,5.0


Setting by assigning with a numpy array

In [59]:
df.loc[:, 'D'] = np.array([5] * len(df))

df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.890722,5,
2013-01-02,0.145822,0.513743,-0.628044,5,1.0
2013-01-03,2.162329,-0.099182,-0.249508,5,2.0
2013-01-04,-0.07047,1.574316,-2.608989,5,3.0
2013-01-05,-1.513,1.700117,1.897296,5,4.0
2013-01-06,0.754513,-0.582074,0.681169,5,5.0


Using the where operator with setting

In [61]:
df2 = df.copy()

df2[df2 > 0] = -df2

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.890722,-5,
2013-01-02,-0.145822,-0.513743,-0.628044,-5,-1.0
2013-01-03,-2.162329,-0.099182,-0.249508,-5,-2.0
2013-01-04,-0.07047,-1.574316,-2.608989,-5,-3.0
2013-01-05,-1.513,-1.700117,-1.897296,-5,-4.0
2013-01-06,-0.754513,-0.582074,-0.681169,-5,-5.0


### Missing Data

Changing the index of a specific index through reindexing

In [63]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.890722,5,,
2013-01-02,0.145822,0.513743,-0.628044,5,1.0,
2013-01-03,2.162329,-0.099182,-0.249508,5,2.0,
2013-01-04,-0.07047,1.574316,-2.608989,5,3.0,


In [64]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.890722,5,,1.0
2013-01-02,0.145822,0.513743,-0.628044,5,1.0,1.0
2013-01-03,2.162329,-0.099182,-0.249508,5,2.0,
2013-01-04,-0.07047,1.574316,-2.608989,5,3.0,


Dropping any rows with missing values

In [65]:
df1.dropna(how= 'any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.145822,0.513743,-0.628044,5,1.0,1.0


Filling missing values

In [67]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.890722,5,5.0,1.0
2013-01-02,0.145822,0.513743,-0.628044,5,1.0,1.0
2013-01-03,2.162329,-0.099182,-0.249508,5,2.0,5.0
2013-01-04,-0.07047,1.574316,-2.608989,5,3.0,5.0


Getting boolean mask where values are nan

In [68]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### Performing Operations

#### Stats


Operations generally will exclude missing data

Performing descriptive statistics on column axis is the default

In [69]:
df.mean()

A    0.246532
B    0.517820
C   -0.299800
D    5.000000
F    3.000000
dtype: float64

Same operation on the row axis

In [70]:
df.mean(1)

2013-01-01    1.027320
2013-01-02    1.206304
2013-01-03    1.762728
2013-01-04    1.378971
2013-01-05    2.216883
2013-01-06    2.170722
Freq: D, dtype: float64

Objects that have different dimensions and needs alignment

In [71]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [72]:
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [73]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-1.0,-1.890722,4.0,
2013-01-02,-2.854178,-2.486257,-3.628044,2.0,-2.0
2013-01-03,-2.837671,-5.099182,-5.249508,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-7.513,-4.299883,-4.102704,-1.0,-2.0
2013-01-06,-7.245487,-8.582074,-7.318831,-3.0,-3.0


### Using 'apply' to apply functions to data

In [74]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.890722,5,
2013-01-02,0.145822,0.513743,-0.628044,5,1.0
2013-01-03,2.162329,-0.099182,-0.249508,5,2.0
2013-01-04,-0.07047,1.574316,-2.608989,5,3.0
2013-01-05,-1.513,1.700117,1.897296,5,4.0
2013-01-06,0.754513,-0.582074,0.681169,5,5.0


In [75]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.890722,5,
2013-01-02,0.145822,0.513743,-1.518766,10,1.0
2013-01-03,2.308151,0.41456,-1.768274,15,3.0
2013-01-04,2.237681,1.988876,-4.377264,20,6.0
2013-01-05,0.724681,3.688993,-2.479968,25,10.0
2013-01-06,1.479194,3.106919,-1.798798,30,15.0


In [76]:
df.apply(lambda x: x.max() - x.min())

A    3.675329
B    2.282191
C    4.506285
D    0.000000
F    4.000000
dtype: float64

## Histogramming

In [77]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    0
1    5
2    1
3    2
4    3
5    6
6    5
7    2
8    3
9    5
dtype: int32

Counting the number of unique values generated

In [79]:
s.value_counts()

5    3
2    2
3    2
0    1
1    1
6    1
dtype: int64