In [1]:
import pandas as pd
import numpy as np

In [2]:
sdata = pd.Series([0.25, 0.50, 0.75, 1.0])

In [4]:
print(sdata)
print(type(sdata))

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [5]:
sdata[1]

0.5

In [7]:
print(sdata.values)
print(sdata.index)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [8]:
sdata = pd.Series([0.25, 0.50, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [9]:
sdata

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [10]:
sdata['b']

0.5

In [11]:
sdata = pd.Series([0.25, 0.50, 0.75, 1.0], index=[2, 5, 3, 7])

In [12]:
sdata

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [13]:
sdata[5]

0.5

# Series as a specialized dictionary

In [15]:
# State name -> population figure.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

In [16]:
population_dict

{'California': 38332521,
 'Texas': 26448193,
 'New York': 19651127,
 'Florida': 19552860,
 'Illinois': 12882135}

In [17]:
population = pd.Series(population_dict)

In [18]:
type(population)

pandas.core.series.Series

In [19]:
population['California']

38332521

In [20]:
population['Texas':'Florida']

Texas       26448193
New York    19651127
Florida     19552860
dtype: int64

In [21]:
population.sort_values()

Illinois      12882135
Florida       19552860
New York      19651127
Texas         26448193
California    38332521
dtype: int64

In [22]:
# Create Series object using Scalar value

pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [23]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3,2])

3    c
2    a
dtype: object

In [24]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [26]:
# State name -> area figure (the unit is not stated in this notebook).
area_dict = {
    'California': 423967,
    'Texas': 695692,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

In [27]:
area = pd.Series(area_dict)

In [28]:
# Each dictionary key becomes a column label; the two Series supply the values.
states = pd.DataFrame(
    {
        'population': population,
        'area': area,
    }
)

In [29]:
print(states)

            population    area
California    38332521  423967
Texas         26448193  695692
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995


In [30]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [31]:
states.columns

Index(['population', 'area'], dtype='object')

In [32]:
states['area']

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

# Constructing DataFrame Objects

- From a single Series object
- From a list of dictionaries
- From a dictionary of Series objects
- From a two-dimensional NumPy array
- From a NumPy structured array

In [33]:
print(type(population))
print(type(states))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [36]:
# Promote the Series to a one-column DataFrame. to_frame(name=...) labels the
# column directly, so the result does not depend on how the DataFrame
# constructor reconciles the Series' own name with a `columns` list.
populationDF = population.to_frame(name='population')

In [38]:
populationDF

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [37]:
print(type(populationDF))

<class 'pandas.core.frame.DataFrame'>


In [39]:
# Build a list of record dictionaries to create a DataFrame from

data = [dict(a=n, b=2 * n) for n in range(3)]

In [42]:
print(data)
print(type(data))

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]
<class 'list'>


In [43]:
# Build the frame once and reuse it for both the table and the type check,
# instead of constructing two identical DataFrames.
records_frame = pd.DataFrame(data)
print(records_frame)
print(type(records_frame))

   a  b
0  0  0
1  1  2
2  2  4
<class 'pandas.core.frame.DataFrame'>


In [44]:
# DataFrame can handle missing values

pd.DataFrame([{'a':1, 'b':2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [45]:
# Create a DataFrame from a two-dimensional NumPy array, with explicit
# column and index labels.
# NOTE: np.random.rand is called without a seed being set in this cell, so the
# values are not expected to repeat between runs unless the global NumPy seed
# is fixed elsewhere.

pd.DataFrame(np.random.rand(3,2),
            columns=['foo', 'bar'],
            index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.486869,0.209735
b,0.039043,0.855666
c,0.209693,0.269317


In [46]:
# Create an Index from a list of integers

ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [47]:
ind[1]

3

In [48]:
# An Index is immutable, so item assignment raises TypeError. Catch it and
# print the message so the demonstration does not abort a top-to-bottom run.
try:
    ind[1] = 0
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

# Index as ordered set

In [50]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [51]:
# Set intersection. The `&` operator on Index objects was deprecated as a set
# operation and newer pandas treats it as an element-wise operation, so the
# explicit method is used.
indA.intersection(indB)

  indA & indB


Int64Index([3, 5, 7], dtype='int64')

In [53]:
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [54]:
# Set union. The `|` operator on Index objects was deprecated as a set
# operation and newer pandas treats it as an element-wise operation, so the
# explicit method is used.
indA.union(indB)

  indA | indB


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [55]:
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [56]:
# Symmetric difference: labels present in exactly one of the two indexes.
# The method is used because `^` on Index objects is no longer a set
# operation in newer pandas.

indA.symmetric_difference(indB)

  indA^indB


Int64Index([1, 2, 9, 11], dtype='int64')

In [57]:
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

# Data Indexing and Selection

- Series as dictionary

In [58]:
data = pd.Series([0.25,0.50,0.75,1.0],
                index=['a', 'b', 'c', 'd'])

In [59]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [60]:
'a' in data

True

In [61]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [63]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [64]:
data['e'] = 1.25

In [65]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [66]:
# Slicing using explicit indexing

data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [67]:
# Slicing using implicit indexing

data[0:2]

a    0.25
b    0.50
dtype: float64

In [68]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [70]:
# Masking

data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [71]:
# Fancy indexing: select several labels at once by passing a list of them

data[['a', 'd']]

a    0.25
d    1.00
dtype: float64

# Indexer loc and iloc

In [73]:
data = pd.Series(['a', 'b', 'c'],
                index=[1,3,5])

In [74]:
data

1    a
3    b
5    c
dtype: object

In [75]:
# Indexing with a single integer uses the explicit index (the labels 1, 3, 5)
data[1]

'a'

In [76]:
# Slicing with integers uses the implicit, position-based index instead
data[1:3]

3    b
5    c
dtype: object

In [77]:
# loc uses explicit indexing

data.loc[1]

'a'

In [78]:
# iloc uses implicit indexing

data.iloc[1]

'b'

In [79]:
data

1    a
3    b
5    c
dtype: object

# Data Selection in DataFrame

## DataFrame as a dictionary

In [81]:
# Population figure for each state, keyed by state name.
pop = pd.Series(
    {
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)

In [82]:
# Area figure for each state, keyed by state name (unit not stated here).
area = pd.Series(
    {
        'California': 423967,
        'Texas': 695692,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)

In [83]:
data = pd.DataFrame({'area': area,
                    'pop': pop})

In [84]:
print(data)

              area       pop
California  423967  38332521
Texas       695692  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


In [85]:
# dictionary style indexing

data['area']

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [86]:
# attribute style

data.area

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [87]:
# Do attribute-style and dictionary-style access give the same data?
# Compare contents with .equals(): an `is` identity test only passes when
# pandas happens to hand back the very same Series object for both accesses,
# which is an implementation detail rather than a guarantee.

data.area.equals(data['area'])

True

In [88]:
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [91]:
# `pop` is also the name of a DataFrame method, so attribute access returns
# that method rather than the 'pop' column; the identity check is therefore False.
data.pop is data['pop']

False

In [92]:
# Avoid attribute-style assignment such as `data.pop = ...` to modify values;
# use dictionary-style indexing instead
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [93]:
data['density'] = data['pop']/data['area']

In [94]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [96]:
# Underlying values of the DataFrame as a NumPy array
data.to_numpy()

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95692000e+05, 2.64481930e+07, 3.80171010e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [97]:
# First row of the DataFrame, taken from its NumPy array representation

data.to_numpy()[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [98]:
# Slicing using implicit indexing which exclude the final index

data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127


In [100]:
# Explicit index includes the final index

data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [101]:
data.loc[:'Florida',:'density']

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [102]:
# loc and iloc can combine masking and fancy indexing

data.loc[data.density>100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [103]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [104]:
# Modify Dataframe using iloc

data.iloc[0,2] = 90

In [105]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


# Operating on Data in Pandas

In [106]:
# Series: four random integers in [0, 10) drawn from a generator seeded with 42

rng = np.random.RandomState(seed=42)

ser = pd.Series(rng.randint(low=0, high=10, size=4))
print(ser)


0    6
1    3
2    7
3    4
dtype: int32


In [107]:
# DataFrame: a 3x4 grid of random integers in [0, 10) with labelled columns

df = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 4)),
    columns=list('abcd'),
)

In [108]:
print(df)

   a  b  c  d
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


In [109]:
# The index is preserved when a NumPy ufunc is applied to a Series

np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [110]:
# The index and column labels are preserved when a ufunc is applied to a DataFrame

np.sin(df * np.pi/4)

Unnamed: 0,a,b,c,d
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


# Index Alignment in Series

In [113]:
# Area figures for three states; only some of these labels also appear in
# the population Series used for the alignment example.
area = pd.Series(
    {
        'Alaska': 1723337,
        'Texas': 695662,
        'California': 423967,
    }
)
print(area)

Alaska        1723337
Texas          695662
California     423967
dtype: int64


In [115]:
# Population figures for three states, keyed by state name.
population = pd.Series(
    {
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
    }
)

print(population)

California    38332521
Texas         26448193
New York      19651127
dtype: int64


In [116]:
area/population

Alaska             NaN
California    0.011060
New York           NaN
Texas         0.026303
dtype: float64

In [118]:
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

# Index Alignment in DataFrame

In [120]:
A = pd.DataFrame(rng.randint(0,20,(2,2)),
                columns=list('xy'))

print(A)

    x   y
0   0  11
1  11  16


In [122]:
B = pd.DataFrame(rng.randint(0,10,(3,3)),
                columns=list('xyz'))
print(B)

   x  y  z
0  4  8  6
1  1  3  8
2  1  9  8


In [123]:
A+B

Unnamed: 0,x,y,z
0,4.0,19.0,
1,12.0,19.0,
2,,,


In [125]:
# Mean of all values in A: stack() reshapes the frame into a single Series,
# and mean() averages it. The result is passed as fill_value in the next cell.
fill = A.stack().mean()

In [126]:
A.add(B, fill_value=fill)

Unnamed: 0,x,y,z
0,4.0,19.0,15.5
1,12.0,19.0,17.5
2,10.5,18.5,17.5


In [127]:
B

Unnamed: 0,x,y,z
0,4,8,6
1,1,3,8
2,1,9,8


# Operations Between Series and DataFrame

- Operations between a Series and a DataFrame behave like operations between a one-dimensional and a two-dimensional NumPy array

In [4]:
# Seeded generator, then a 3x4 array of random integers in [0, 10).
rng = np.random.RandomState(seed=42)
A = rng.randint(0, 10, size=(3, 4))
print(A)

[[6 3 7 4]
 [6 9 2 6]
 [7 4 3 7]]


In [5]:
print(A[0])

[6 3 7 4]


In [6]:
A-A[0]

array([[ 0,  0,  0,  0],
       [ 0,  6, -5,  2],
       [ 1,  1, -4,  3]])

In [7]:
# Create DataFrame

df = pd.DataFrame(A, columns=list('QRST'))
print(df)

   Q  R  S  T
0  6  3  7  4
1  6  9  2  6
2  7  4  3  7


In [8]:
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [9]:
# By default the Series is aligned on the column labels and subtracted from every row
df-df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [11]:
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [10]:
# Subtract column 'R' from every column; axis=0 aligns the Series on the row index
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


In [14]:
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [12]:
# Take every second entry of the first row (columns Q and S)
halfrow = df.iloc[0,::2]
print(halfrow)

Q    6
S    7
Name: 0, dtype: int32


In [13]:
df-halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,-5.0,
2,1.0,,-4.0,


# Operating on Null Values
- Pandas provides several useful functions for detecting, removing, and replacing null values
- isnull() # generates a boolean mask indicating missing values
- notnull() # opposite of isnull()
- dropna() # returns a copy with missing values removed
- fillna() # returns a copy with missing values filled in

## Detecting Null Values

In [16]:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

- filter out the null values

In [18]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [20]:
# Removal of NA values for series

ser1 = data.dropna()
print(ser1)

0        1
2    hello
dtype: object


## Operating on DataFrames

In [22]:
# 3x3 frame with one missing value in each of the first two columns
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

print(df)

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [23]:
df.dropna() # by default, every row containing at least one NaN is dropped

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [25]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [24]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [26]:
# Add a column (label 3) that contains only NaN
df[3] = np.nan

In [27]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [28]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [29]:
# thresh sets the minimum number of non-null values a row needs in order to be kept
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


# Filling Null Values

In [30]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64


In [31]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [32]:
# Forward fill: each gap takes the last valid value before it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [33]:
# Backward fill: each gap takes the next valid value after it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [34]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [35]:
# Forward fill along each row (axis=1): a gap takes the value from the column
# to its left. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
