In [3]:
import pandas as pd
import numpy as np

# Operating on Data in Pandas

# Ufuncs: Index Preservation

In [4]:
#Because Pandas is designed to work with NumPy, any NumPy ufunc will work on
#Pandas Series and DataFrame objects. Let’s start by defining a simple Series and
#DataFrame on which to demonstrate this

In [5]:
# Four random integers drawn from [0, 10), held in a Series with the default integer index.
ser = pd.Series(np.random.randint(low=0, high=10, size=4))

In [6]:
# 3x4 frame of random integers in [0, 10) with the column labels A, B, C and D.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)

In [7]:
ser

0    1
1    0
2    0
3    3
dtype: int64

In [8]:
df

Unnamed: 0,A,B,C,D
0,0,4,3,0
1,0,6,0,8
2,6,1,1,1


In [9]:
#If we apply a NumPy ufunc on either of these objects, the result will be another
#Pandas object with the indices preserved

In [10]:
# Apply NumPy's exponential ufunc element-wise; the result is a Series that keeps ser's index.
np.exp(ser)

0     2.718282
1     1.000000
2     1.000000
3    20.085537
dtype: float64

In [11]:
# Multiply every entry by pi/4 and take the sine; the result keeps df's row and column labels.
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.0,1.224647e-16,0.707107,0.0
1,0.0,-1.0,0.0,-2.449294e-16
2,-1.0,0.7071068,0.707107,0.7071068


# Ufuncs: Index Alignment

In [12]:
#Index alignment in Series

In [13]:
# Land area per state, keyed by state name.
area = pd.Series(
    data={
        'Alaska': 1723337,
        'Texas': 695662,
        'California': 423967,
    },
    name='area',
)

# Population per state; the set of states only partly overlaps with `area`.
population = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
    },
    name='population',
)

In [15]:
#Let’s see what happens when we divide these to compute the population density

In [16]:
# Divide population by area label by label; a state present in only one Series yields NaN.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [17]:
#The resulting array contains the union of indices of the two input arrays, which we
#could determine by taking the set union of these indices

In [18]:
# Compute the set union of the two indices with the explicit Index.union method.
# Using `|` between Index objects as a set operation is deprecated in pandas 1.x
# and is an element-wise logical OR from pandas 2.0 onward.
area.index.union(population.index)

  area.index | population.index


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [19]:
#Any item for which one or the other does not have an entry is marked with NaN, or
#“Not a Number,” which is how Pandas marks missing data

In [20]:
# Two short Series whose integer labels overlap only on 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

# Labels 0 and 3 exist on one side only, so the sum is NaN there.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [21]:
#If using NaN values is not the desired behavior, we can modify the fill value using
#appropriate object methods in place of the operators. For example, calling A.add(B)
#is equivalent to calling A + B, but allows optional explicit specification of the fill value
#for any elements in A or B that might be missing

In [22]:
# Same sum as A + B, except a label missing from one Series is treated as 0 instead of producing NaN.
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

# Index alignment in DataFrame

In [23]:
# 2x2 frame of random integers in [0, 20) under the column labels A and B.
A = pd.DataFrame(
    data=np.random.randint(low=0, high=20, size=(2, 2)),
    columns=['A', 'B'],
)

In [24]:
A

Unnamed: 0,A,B
0,9,5
1,18,18


In [25]:
# 3x3 frame of random integers in [0, 10); its columns are ordered B, A, C.
B = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(3, 3)),
    columns=['B', 'A', 'C'],
)
B

Unnamed: 0,B,A,C
0,6,2,9
1,7,0,4
2,3,7,9


In [26]:
# Add the frames after aligning on both row and column labels;
# any cell that does not exist in both frames comes out as NaN.
A+B

Unnamed: 0,A,B,C
0,11.0,11.0,
1,18.0,25.0,
2,,,


In [27]:
#Notice that indices are aligned correctly irrespective of their order in the two objects,
#and indices in the result are sorted.

In [28]:
#As was the case with Series, we can use the associated object’s arithmetic method
#and pass any desired fill_value to be used in place of missing entries. Here we’ll fill
#with the mean of all values in A (which we compute by first stacking the rows of A)

In [29]:
# Stack A's columns into a single Series indexed by (row label, column label).
fill=A.stack()

In [30]:
fill

0  A     9
   B     5
1  A    18
   B    18
dtype: int64

In [31]:
# Mean over every cell of A, taken from its stacked values.
fill = A.stack().mean()

In [32]:
fill

12.5

In [33]:
# Add the frames, substituting `fill` for any cell that is missing from one of them.
A.add(B,fill_value=fill)

Unnamed: 0,A,B,C
0,11.0,11.0,21.5
1,18.0,25.0,16.5
2,19.5,15.5,21.5


In [34]:
#Mapping between Python operators and Pandas methods
#Python operator Pandas method(s)
#+ add()
#- sub() , subtract()
#* mul() , multiply()
#/ truediv() , div() , divide()
#// floordiv()
#% mod()
#** pow()

In [35]:
#When you are performing operations between a DataFrame and a Series, the index
#and column alignment is similarly maintained. Operations between a DataFrame and
#a Series are similar to operations between a two-dimensional and a one-dimensional
#NumPy array. Consider one common operation, where we find the difference of a
#two-dimensional array and one of its rows

In [36]:
# 3x4 NumPy array of random integers in [0, 10).
A = np.random.randint(0, 10, size=(3, 4))
A

array([[4, 1, 8, 2],
       [9, 7, 8, 4],
       [8, 7, 8, 6]])

In [37]:
# Subtract the first row from every row; NumPy broadcasts the 1-D row across the 2-D array.
A-A[0]

array([[0, 0, 0, 0],
       [5, 6, 0, 2],
       [4, 6, 0, 4]])

In [38]:
#According to NumPy’s broadcasting rules, subtraction between a two-dimensional array and one of its rows is
#applied row-wise.

In [39]:
#In Pandas, the convention similarly operates row-wise by default

In [40]:
# Wrap the NumPy array A in a DataFrame with the column labels Q, R, S and T.
df = pd.DataFrame(data=A, columns=['Q', 'R', 'S', 'T'])

In [41]:
# Subtract the first row from every row of df.
df-df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,5,6,0,2
2,4,6,0,4


In [42]:
#If you would instead like to operate column-wise, you can use the object methods
#mentioned earlier, while specifying the axis keyword

In [45]:
# With axis=0 the Series is matched against the row index, so column R is subtracted from every column.
df.subtract(df['R'],axis=0)

Unnamed: 0,Q,R,S,T
0,3,0,7,1
1,2,0,1,-3
2,1,0,1,-1


In [46]:
#Note that these DataFrame / Series operations, like the operations discussed before,
#will automatically align indices between the two elements

In [47]:
# First row of df, keeping every second column starting from the first (Q and S).
halfrow = df.iloc[0, ::2]
halfrow

Q    4
S    8
Name: 0, dtype: int64

In [49]:
df

Unnamed: 0,Q,R,S,T
0,4,1,8,2
1,9,7,8,4
2,8,7,8,6


In [50]:
# halfrow only carries Q and S, so after aligning on column labels R and T come out as NaN.
df-halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,5.0,,0.0,
2,4.0,,0.0,


# Thank You