In [4]:
import pandas as pd 
import numpy as np 

In [53]:
# Seeded legacy generator, so the random draws in this notebook are repeatable
rng = np.random.RandomState(seed=42)
# Series holding four random integers drawn from [0, 10)
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [8]:
# Build a 3x4 DataFrame of random integers in [0, 10) with columns A-D

df = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [10]:
# Note: Pandas can perform basic arithmetic operations like +, -, *, /, but NumPy
# is still needed for more advanced operations such as trigonometric, exponential
# or logarithmic functions.

# All NumPy ufuncs are available for use on Pandas Series and DataFrame objects

In [5]:
# Apply NumPy's exponential ufunc to every element; the result is a Series
# with the same index as ser
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [6]:
# Scale every entry by pi/4 and take the sine element-wise; the result is a
# DataFrame with the same row and column labels as df
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,0.7071068,-0.707107,-0.707107,0.707107
1,1.224647e-16,0.0,0.707107,-0.707107
2,-2.449294e-16,0.0,0.707107,1.0


In [34]:
# Index Alignment in Series object 

In [41]:
# Two Series keyed by state name; their indexes only partly overlap
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)
# Show both Series one after the other, each on its own lines
print(area, population, sep='\n')

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64
California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64


In [32]:
# Addition aligns the two Series on their index labels. Any label that is
# missing from one of them is marked with NaN ("Not a Number") in the result.

population + area


Alaska               NaN
California    38756488.0
New York             NaN
Texas         27143855.0
dtype: float64

In [None]:
# Comparing the indexes of two series using set operations

In [18]:
# Every label that appears in at least one of the two indexes
area.index.union(population.index) 

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [20]:
# Only the labels that the two indexes have in common
area.index.intersection(population.index)

Index(['Texas', 'California'], dtype='object')

In [24]:
# Labels of area that do not occur in population
area.index.difference(population.index) 

Index(['Alaska'], dtype='object')

In [22]:
# Labels of population that do not occur in area
population.index.difference(area.index)

Index(['New York'], dtype='object')

In [33]:
# In order to avoid NaN in the output, use the arithmetic method (here .add)
# instead of the + operator and pass the 'fill_value=' parameter; that value
# stands in for an entry that is missing from one of the two Series

population.add(area, fill_value=0)

Alaska         1723337.0
California    38756488.0
New York      19651127.0
Texas         27143855.0
dtype: float64

In [35]:
# Index alignment in DataFrame 

In [46]:
# 2x2 DataFrame of random integers in [0, 20) with columns A and B
A = pd.DataFrame(data=rng.randint(low=0, high=20, size=(2, 2)), columns=['A', 'B'])
A

Unnamed: 0,A,B
0,1,19
1,14,6


In [39]:
# 3x3 DataFrame of random integers in [0, 10); columns are deliberately given
# in the order B, A, C
B = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 3)), columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,4,8,6
1,1,3,8
2,1,9,8


In [44]:
# Labels are aligned on both rows and columns regardless of their order;
# positions that are missing from either frame come out as NaN
A + B

Unnamed: 0,A,B,C
0,8.0,15.0,
1,14.0,17.0,
2,,,


In [50]:
# Mean of all values in A (stack() collapses the frame into a single Series)
fill = A.stack().mean()
# Use that mean in place of entries that are missing from one of the operands
A.add(B,fill_value=fill)

Unnamed: 0,A,B,C
0,9.0,23.0,16.0
1,17.0,7.0,18.0
2,19.0,11.0,18.0


In [51]:
# Operations between DataFrame and Series

In [54]:
# 3x4 NumPy array of random integers in [0, 10)
# NOTE(review): this rebinds A, which held a DataFrame in an earlier cell,
# to a plain NumPy array
A = rng.randint(10, size=(3,4))
A

array([[6, 9, 2, 6],
       [7, 4, 3, 7],
       [7, 2, 5, 4]])

In [55]:
# NumPy broadcasting: the first row is subtracted from every row of the array
A - A[0]

array([[ 0,  0,  0,  0],
       [ 1, -5,  1,  1],
       [ 1, -7,  3, -2]])

In [81]:
# Subtracting a Series from a DataFrame operates row-wise by default: the
# Series labels are matched to the column labels and the Series is subtracted
# from every row

df = pd.DataFrame(A, columns=list('QRST'))
print(df.iloc[0])  # first row as a Series labelled Q, R, S, T
df - df.iloc[0]

Q    6
R    9
S    2
T    6
Name: 0, dtype: int32


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,-5,1,1
2,1,-7,3,-2


In [89]:
# For a column-wise operation, use the method form and pass the keyword
# "axis=0" so the Series is matched against the row index instead of the
# column labels
print(df)
df.subtract(df['R'], axis=0)

   Q  R  S  T
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


Unnamed: 0,Q,R,S,T
0,-3,0,-7,-3
1,3,0,-1,3
2,5,0,3,2


In [94]:
# When performing operations between a DataFrame and a Series, the labels are
# aligned automatically: columns that are absent from the Series come out as NaN

half_row =  df.iloc[0, ::2]  # first row, every second column
print(half_row)
df -half_row

Q    6
S    2
Name: 0, dtype: int32


Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,1.0,,1.0,
2,1.0,,3.0,


In [None]:
# Handling Missing Data 

In [3]:
# When None is present, NumPy falls back to dtype=object for the whole array:
# every element is treated as a generic Python object, which rules out the
# fast native-typed numerical computations.
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [11]:
for dtype in ['object', 'int']:
    print("dtype=", dtype) 
    sth = %timeit np.arange(1E6, dtype=dtype)
    print()

dtype= object
82.3 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype= int
2.93 ms ± 200 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [12]:
# Also, it would not be possible to perform aggregations on an array containing a None value

In [3]:
# The other value used for representing missing data is "NaN" value
# meaning "Not a Number"

In [7]:
# Arithmetic operations can be carried out on an array that contains NaN
# because NumPy gives the array a native dtype (here float64) rather than
# the object dtype

vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [12]:
# The regular aggregations do not fail on an array containing NaN, but the NaN
# propagates and the result is nan. NumPy provides NaN-aware counterparts
# (nansum, nanmin, nanmax) that ignore the missing values instead.

print(np.sum(vals2), np.min(vals2), np.max(vals2), sep='\n')
print()
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2), sep='\n')

nan
nan
nan

8.0
1.0
4.0
