In [67]:
import pandas as pd
import numpy as np

# Series

In [68]:
obj = pd.Series([4,7,5,8])

In [69]:
obj

0    4
1    7
2    5
3    8
dtype: int64

In [70]:
obj.values

array([4, 7, 5, 8], dtype=int64)

In [71]:
obj.index

RangeIndex(start=0, stop=4, step=1)

#### Updating Index

In [72]:
obj = pd.Series([4,7,-5,3],index = ['a','b','c','d'])

In [73]:
obj

a    4
b    7
c   -5
d    3
dtype: int64

In [74]:
obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [75]:
obj['a']

4

In [76]:
obj[["a","b","c"]]

a    4
b    7
c   -5
dtype: int64

### Applying numpy function

In [77]:
obj2 = pd.Series([1,2,-3,4,5],index=['a','b','c','d','e'])

In [78]:
obj2

a    1
b    2
c   -3
d    4
e    5
dtype: int64

In [79]:
obj2[obj2>0]

a    1
b    2
d    4
e    5
dtype: int64

In [80]:
obj2*2

a     2
b     4
c    -6
d     8
e    10
dtype: int64

In [81]:
np.exp(obj2)

a      2.718282
b      7.389056
c      0.049787
d     54.598150
e    148.413159
dtype: float64

### Creating serie from dict

In [82]:
data = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [83]:
obj3 = pd.Series(data) 

In [84]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

#### Index overriding

In [85]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [86]:
obj4 = pd.Series(data, index=states)

In [87]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [88]:
obj4.name = 'population'

In [89]:
obj4.index.name = 'state'

In [90]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [91]:
obj5 = pd.Series([1,2,3,4])

In [92]:
obj5

0    1
1    2
2    3
3    4
dtype: int64

In [93]:
obj5.index = ['Bob','Steve','Jeff','Ryan']

In [94]:
obj5

Bob      1
Steve    2
Jeff     3
Ryan     4
dtype: int64

# DataFrame

In [95]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [96]:
frame = pd.DataFrame(data)

In [97]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [98]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [99]:
# Specify a sequence of columns 
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [100]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],index=['one', 'two', 'three', 'four','five', 'six'])

In [101]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [102]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [103]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [104]:
frame2['debt'] = 16.5

In [105]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [106]:
 frame2['debt'] = np.arange(6.)

In [107]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [108]:
frame2['eastern'] = frame2.state == 'Ohio'

In [109]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0.0,True
two,2001,Ohio,1.7,1.0,True
three,2002,Ohio,3.6,2.0,True
four,2001,Nevada,2.4,3.0,False
five,2002,Nevada,2.9,4.0,False
six,2003,Nevada,3.2,5.0,False


### Transpose

In [110]:
frame2.T

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,0.0,1.0,2.0,3.0,4.0,5.0
eastern,True,True,True,False,False,False


In [111]:
mydata = np.random.randint(0,101,(4,3))

In [112]:
mydata

array([[40, 60, 92],
       [64,  5, 12],
       [93, 40, 49],
       [83,  8, 29]])

In [113]:
myindex = ['CA','NY','AZ','TX']
mycolumn = ['Jan','Feb','Mar']

In [114]:
df = pd.DataFrame(data=mydata)

In [115]:
df

Unnamed: 0,0,1,2
0,40,60,92
1,64,5,12
2,93,40,49
3,83,8,29


In [116]:
df = pd.DataFrame(data=mydata,index=myindex,columns=mycolumn)
df

Unnamed: 0,Jan,Feb,Mar
CA,40,60,92
NY,64,5,12
AZ,93,40,49
TX,83,8,29


#### Dropping entries

In [122]:
d = df.drop('NY')
d

Unnamed: 0,Jan,Feb,Mar
CA,40,60,92
AZ,93,40,49
TX,83,8,29


In [124]:
d1 = df.drop(['Mar'],axis = 1)
d1

Unnamed: 0,Jan,Feb
CA,40,60
NY,64,5
AZ,93,40
TX,83,8


In [125]:
df[:2]

Unnamed: 0,Jan,Feb,Mar
CA,40,60,92
NY,64,5,12


In [132]:
df.iloc[1]

Jan    64
Feb     5
Mar    12
Name: NY, dtype: int32

In [133]:
df.loc['NY']

Jan    64
Feb     5
Mar    12
Name: NY, dtype: int32

## Applying function to a dataframe

In [138]:
frame = pd.DataFrame(np.random.randn(4, 3),columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [139]:
frame

Unnamed: 0,b,d,e
Utah,1.431259,0.423347,0.062083
Ohio,0.265864,-0.095195,0.633114
Texas,-0.214138,1.436661,-0.285115
Oregon,0.263288,1.510803,1.698778


In [140]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.431259,0.423347,0.062083
Ohio,0.265864,0.095195,0.633114
Texas,0.214138,1.436661,0.285115
Oregon,0.263288,1.510803,1.698778


In [143]:
f = lambda x : x.max()

In [146]:
frame.apply(f)

b    1.431259
d    1.510803
e    1.698778
dtype: float64

In [148]:
frame.apply(f,axis=0)

b    1.431259
d    1.510803
e    1.698778
dtype: float64

In [149]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [150]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.214138,-0.095195,-0.285115
max,1.431259,1.510803,1.698778


## Sorting

In [154]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),index=[1,0,2],columns=['d', 'a', 'b'])

In [155]:
frame

Unnamed: 0,d,a,b
1,0,1,2
0,3,4,5
2,6,7,8


In [156]:
frame.sort_index()

Unnamed: 0,d,a,b
0,3,4,5
1,0,1,2
2,6,7,8


In [157]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,d
1,1,2,0
0,4,5,3
2,7,8,6


In [158]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,b,a
1,0,2,1
0,3,5,4
2,6,8,7


In [161]:
frame.sort_values(by='b')

Unnamed: 0,d,a,b
1,0,1,2
0,3,4,5
2,6,7,8
