In [1]:
import pandas as pd
import numpy as np
print(pd.__version__)

0.25.3


In [2]:
print(pd.__doc__)


pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.
  - Size mutability: columns can be inserted and deleted from DataFrame and
    higher dimensional objects
  - Automatic and explicit data alignment: objects can be explicitly aligned
    to a set of labels, or the user can simply ignore the labels and

# Pandas Series

In [3]:
data = pd.Series([1.0,2.5,3.9,4.2])
data

0    1.0
1    2.5
2    3.9
3    4.2
dtype: float64

In [4]:
data.values

array([1. , 2.5, 3.9, 4.2])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data.value_counts

<bound method IndexOpsMixin.value_counts of 0    1.0
1    2.5
2    3.9
3    4.2
dtype: float64>

In [7]:
data[0]

1.0

In [8]:
data[1:3]

1    2.5
2    3.9
dtype: float64

In [9]:
data[::-1]

3    4.2
2    3.9
1    2.5
0    1.0
dtype: float64

In [10]:
data.dtype

dtype('float64')

In [11]:
data = pd.Series([1. , 2.5, 3.9, 4.2], index=['a','b','c','d'])
data

a    1.0
b    2.5
c    3.9
d    4.2
dtype: float64

In [12]:
data['a']

1.0

In [13]:
data['a':'c']

a    1.0
b    2.5
c    3.9
dtype: float64

## Pandas Series From Dict

In [14]:
d = {'a':1,'b':2,'c':3,'d':4}
data = pd.Series(d)
data

a    1
b    2
c    3
d    4
dtype: int64

In [15]:
data['a':'c']

a    1
b    2
c    3
dtype: int64

In [16]:
a = pd.Series(d,index=['c','a'])
a

c    3
a    1
dtype: int64

## Pandas Series From List

In [17]:
d = pd.Series([1,2,3])
d

0    1
1    2
2    3
dtype: int64

In [18]:
d = pd.Series(3,[1,2,3])
d

1    3
2    3
3    3
dtype: int64

In [19]:
d = pd.Series(3,dtype=('f'))
d

0    3.0
dtype: float32

In [20]:
s = pd.Series(np.arange(1,10))
s

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32

## Dataframe

In [21]:
population_dict = {"India":1230000,"USA":7987868}
area_dict = {"India":908080,"USA":776657}
population = pd.Series(population_dict)
area = pd.Series(area_dict)
df = pd.DataFrame({'Population':population,'area':area})
df

Unnamed: 0,Population,area
India,1230000,908080
USA,7987868,776657


In [22]:
df1 = pd.DataFrame({'Population':population,'area':area,'x':pd.Series(np.arange(2),['USA','India'])})
df1

Unnamed: 0,Population,area,x
India,1230000,908080,1
USA,7987868,776657,0


In [23]:
df.columns

Index(['Population', 'area'], dtype='object')

In [24]:
pd.DataFrame(area,columns=['area'])

Unnamed: 0,area
India,908080
USA,776657


In [25]:
data = [{'a':1,'b':2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,1,0
1,1,2
2,1,4


In [26]:
pd.DataFrame([{'a':1,'b':2},{'b':1,'c':3}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,1,3.0


In [27]:
pd.DataFrame([1,2,3],index=['a','b','c'],columns=['Values'])

Unnamed: 0,Values
a,1
b,2
c,3


In [28]:
pd.DataFrame([[1,2],[3,4]],columns=['a','b'],index=[1,2])

Unnamed: 0,a,b
1,1,2
2,3,4


## dataframe form a numpy array

In [29]:
pd.DataFrame(np.random.rand(3,2),columns=['A','B'],index=[1,2,3])

Unnamed: 0,A,B
1,0.770019,0.680131
2,0.449494,0.800557
3,0.783522,0.778078


In [30]:
A = np.zeros(3,dtype=[('A','i8'),('B','f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [31]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### Selecting Data from Dataframe3

In [32]:
df = pd.DataFrame(np.random.rand(5,4),index=['A','B','C','D','E'],columns=['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,0.142901,0.223609,0.251744,0.973928
B,0.024198,0.454734,0.352985,0.144078
C,0.365066,0.596264,0.553256,0.966719
D,0.59083,0.672203,0.483075,0.016721
E,0.244469,0.634405,0.045014,0.218126


In [33]:
df.W

A    0.142901
B    0.024198
C    0.365066
D    0.590830
E    0.244469
Name: W, dtype: float64

In [34]:
df['X']

A    0.223609
B    0.454734
C    0.596264
D    0.672203
E    0.634405
Name: X, dtype: float64

In [35]:
df[['X','Y']]

Unnamed: 0,X,Y
A,0.223609,0.251744
B,0.454734,0.352985
C,0.596264,0.553256
D,0.672203,0.483075
E,0.634405,0.045014


In [36]:
type(df.W)

pandas.core.series.Series

### Adding and Removing Column

In [37]:
df['new'] = df['W']+df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,0.142901,0.223609,0.251744,0.973928,0.36651
B,0.024198,0.454734,0.352985,0.144078,0.478932
C,0.365066,0.596264,0.553256,0.966719,0.961331
D,0.59083,0.672203,0.483075,0.016721,1.263033
E,0.244469,0.634405,0.045014,0.218126,0.878874


In [38]:
df = df.drop('new',axis=1)
# df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.142901,0.223609,0.251744,0.973928
B,0.024198,0.454734,0.352985,0.144078
C,0.365066,0.596264,0.553256,0.966719
D,0.59083,0.672203,0.483075,0.016721
E,0.244469,0.634405,0.045014,0.218126


In [39]:
df['W * Y'] = df['W']*df['Y']
df

Unnamed: 0,W,X,Y,Z,W * Y
A,0.142901,0.223609,0.251744,0.973928,0.035975
B,0.024198,0.454734,0.352985,0.144078,0.008541
C,0.365066,0.596264,0.553256,0.966719,0.201975
D,0.59083,0.672203,0.483075,0.016721,0.285415
E,0.244469,0.634405,0.045014,0.218126,0.011005


### Selecting Rows

In [40]:
df.loc['A']

W        0.142901
X        0.223609
Y        0.251744
Z        0.973928
W * Y    0.035975
Name: A, dtype: float64

In [41]:
df.iloc[0]

W        0.142901
X        0.223609
Y        0.251744
Z        0.973928
W * Y    0.035975
Name: A, dtype: float64

### Subset of Rows and column

In [42]:
df.loc[['A','B'],['X','Y']]

Unnamed: 0,X,Y
A,0.223609,0.251744
B,0.454734,0.352985


In [43]:
df.loc['A','X']

0.22360890249248544

In [44]:
df.head(2)

Unnamed: 0,W,X,Y,Z,W * Y
A,0.142901,0.223609,0.251744,0.973928,0.035975
B,0.024198,0.454734,0.352985,0.144078,0.008541


In [45]:
df.tail(2)

Unnamed: 0,W,X,Y,Z,W * Y
D,0.59083,0.672203,0.483075,0.016721,0.285415
E,0.244469,0.634405,0.045014,0.218126,0.011005


In [46]:
pd.DataFrame(np.arange(20).reshape(10,2),index=[ pd.date_range(start='1/1/2020', end='1/10/2020')],columns=list('AB'))

Unnamed: 0,A,B
2020-01-01,0,1
2020-01-02,2,3
2020-01-03,4,5
2020-01-04,6,7
2020-01-05,8,9
2020-01-06,10,11
2020-01-07,12,13
2020-01-08,14,15
2020-01-09,16,17
2020-01-10,18,19


In [47]:
pd.date_range(start='1/1/2020', periods=10,freq='2 D')

DatetimeIndex(['2020-01-01', '2020-01-03', '2020-01-05', '2020-01-07',
               '2020-01-09', '2020-01-11', '2020-01-13', '2020-01-15',
               '2020-01-17', '2020-01-19'],
              dtype='datetime64[ns]', freq='2D')

In [48]:
pd.timedelta_range('1 day','2 day',freq='3H')

TimedeltaIndex(['1 days 00:00:00', '1 days 03:00:00', '1 days 06:00:00',
                '1 days 09:00:00', '1 days 12:00:00', '1 days 15:00:00',
                '1 days 18:00:00', '1 days 21:00:00', '2 days 00:00:00'],
               dtype='timedelta64[ns]', freq='3H')

In [49]:
s 

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32

In [50]:
x = pd.DataFrame([{'a':1,'b':2},{'b':1,'c':3}])
print(x)
x.isnull()

     a  b    c
0  1.0  2  NaN
1  NaN  1  3.0


Unnamed: 0,a,b,c
0,False,False,True
1,True,False,False


In [51]:
s1 = pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
print(s2-s1)
print(s2+s1)
print(s2*s1)
print(s2/s1)

0    3
1    3
2    3
dtype: int64
0    5
1    7
2    9
dtype: int64
0     4
1    10
2    18
dtype: int64
0    4.0
1    2.5
2    2.0
dtype: float64


In [54]:
df.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [55]:
i = pd.Index([1,3,2])
print(i)

Int64Index([1, 3, 2], dtype='int64')


## Conditional Selection

In [56]:
df

Unnamed: 0,W,X,Y,Z,W * Y
A,0.142901,0.223609,0.251744,0.973928,0.035975
B,0.024198,0.454734,0.352985,0.144078,0.008541
C,0.365066,0.596264,0.553256,0.966719,0.201975
D,0.59083,0.672203,0.483075,0.016721,0.285415
E,0.244469,0.634405,0.045014,0.218126,0.011005


In [57]:
df > 0.5

Unnamed: 0,W,X,Y,Z,W * Y
A,False,False,False,True,False
B,False,False,False,False,False
C,False,True,True,True,False
D,True,True,False,False,False
E,False,True,False,False,False


In [59]:
df[df>0.5]

Unnamed: 0,W,X,Y,Z,W * Y
A,,,,0.973928,
B,,,,,
C,,0.596264,0.553256,0.966719,
D,0.59083,0.672203,,,
E,,0.634405,,,


In [60]:
df[df['W'] > 0.5]

Unnamed: 0,W,X,Y,Z,W * Y
D,0.59083,0.672203,0.483075,0.016721,0.285415


In [61]:
df[df['W'] < 0.5]

Unnamed: 0,W,X,Y,Z,W * Y
A,0.142901,0.223609,0.251744,0.973928,0.035975
B,0.024198,0.454734,0.352985,0.144078,0.008541
C,0.365066,0.596264,0.553256,0.966719,0.201975
E,0.244469,0.634405,0.045014,0.218126,0.011005


In [62]:
df[df['W'] < 0.5]['Y']

A    0.251744
B    0.352985
C    0.553256
E    0.045014
Name: Y, dtype: float64

In [65]:
df[df['W'] > 0.5][['X','Y']]

Unnamed: 0,X,Y
D,0.672203,0.483075


In [75]:
df[(df['W'] < 0.5) & (df['Y'] > 0.5)]

Unnamed: 0,W,X,Y,Z,W * Y
C,0.365066,0.596264,0.553256,0.966719,0.201975


In [76]:
df['Y'] > 0.5

A    False
B    False
C     True
D    False
E    False
Name: Y, dtype: bool

In [152]:
np.random.seed(1)
d = pd.DataFrame(np.random.randn(5,5),index=list('ABCDE'),columns=list('VWXYZ'))
print(d)
d[(d['V'] <0) | (d['X'] > 0)][['W','Z']]

          V         W         X         Y         Z
A  1.624345 -0.611756 -0.528172 -1.072969  0.865408
B -2.301539  1.744812 -0.761207  0.319039 -0.249370
C  1.462108 -2.060141 -0.322417 -0.384054  1.133769
D -1.099891 -0.172428 -0.877858  0.042214  0.582815
E -1.100619  1.144724  0.901591  0.502494  0.900856


Unnamed: 0,W,Z
B,1.744812,-0.24937
D,-0.172428,0.582815
E,1.144724,0.900856


In [153]:
a = np.arange(10)
d = pd.Series(a)
d

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

### Indexing

In [None]:
df.drop('W * Y',axis=1,inplace=True)
df

In [154]:
df

Unnamed: 0,W,X,Y,Z,W * Y
A,0.142901,0.223609,0.251744,0.973928,0.035975
B,0.024198,0.454734,0.352985,0.144078,0.008541
C,0.365066,0.596264,0.553256,0.966719,0.201975
D,0.59083,0.672203,0.483075,0.016721,0.285415
E,0.244469,0.634405,0.045014,0.218126,0.011005


In [155]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,W * Y
0,A,0.142901,0.223609,0.251744,0.973928,0.035975
1,B,0.024198,0.454734,0.352985,0.144078,0.008541
2,C,0.365066,0.596264,0.553256,0.966719,0.201975
3,D,0.59083,0.672203,0.483075,0.016721,0.285415
4,E,0.244469,0.634405,0.045014,0.218126,0.011005


In [166]:
newind = 'AA BB CC DD EE'.split()
df['myIndex'] = newind
df

Unnamed: 0_level_0,W,X,Y,Z,W * Y,myIndex
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AA,0.142901,0.223609,0.251744,0.973928,0.035975,AA
BB,0.024198,0.454734,0.352985,0.144078,0.008541,BB
CC,0.365066,0.596264,0.553256,0.966719,0.201975,CC
DD,0.59083,0.672203,0.483075,0.016721,0.285415,DD
EE,0.244469,0.634405,0.045014,0.218126,0.011005,EE


In [167]:
df.set_index('myIndex')

Unnamed: 0_level_0,W,X,Y,Z,W * Y
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,0.142901,0.223609,0.251744,0.973928,0.035975
BB,0.024198,0.454734,0.352985,0.144078,0.008541
CC,0.365066,0.596264,0.553256,0.966719,0.201975
DD,0.59083,0.672203,0.483075,0.016721,0.285415
EE,0.244469,0.634405,0.045014,0.218126,0.011005


In [168]:
df

Unnamed: 0_level_0,W,X,Y,Z,W * Y,myIndex
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AA,0.142901,0.223609,0.251744,0.973928,0.035975,AA
BB,0.024198,0.454734,0.352985,0.144078,0.008541,BB
CC,0.365066,0.596264,0.553256,0.966719,0.201975,CC
DD,0.59083,0.672203,0.483075,0.016721,0.285415,DD
EE,0.244469,0.634405,0.045014,0.218126,0.011005,EE


In [169]:
df.set_index('myIndex',inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z,W * Y
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,0.142901,0.223609,0.251744,0.973928,0.035975
BB,0.024198,0.454734,0.352985,0.144078,0.008541
CC,0.365066,0.596264,0.553256,0.966719,0.201975
DD,0.59083,0.672203,0.483075,0.016721,0.285415
EE,0.244469,0.634405,0.045014,0.218126,0.011005


In [173]:
df

Unnamed: 0_level_0,W,X,Y,Z
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,0.142901,0.223609,0.251744,0.973928
BB,0.024198,0.454734,0.352985,0.144078
CC,0.365066,0.596264,0.553256,0.966719
DD,0.59083,0.672203,0.483075,0.016721
EE,0.244469,0.634405,0.045014,0.218126
