In [1]:
# Removing NA values
# Demo: with its default arguments, DataFrame.dropna() discards every row that
# holds at least one missing marker (None, np.nan or pd.NaT in this sample).
import pandas as pd
import numpy as np

d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', 'Lisa'],
    ID=[1, 2, 3, 4],
    Salary=[100, 200, np.nan, pd.NaT],
    Role=['CEO', None, pd.NaT, pd.NaT],
)

df = pd.DataFrame(data=d1)

# Show the raw frame first so the effect of dropna() can be compared.
print(df)

# axis=0 / how='any' are dropna()'s defaults, spelled out for the reader:
# a row is removed as soon as any one of its cells is missing.
df1 = df.dropna(axis=0, how='any')
print(df1)

     Name  ID Salary  Role
0  Pankaj   1    100   CEO
1  Meghna   2    200  None
2   David   3    NaN   NaT
3    Lisa   4    NaT   NaT
     Name  ID Salary Role
0  Pankaj   1    100  CEO


In [2]:
# Dropping rows with NA values
# Demo: dropna(inplace=True) removes the incomplete rows from `df` itself
# rather than handing back a new frame.
import pandas as pd

d1 = dict(
    Name=['Pankaj', 'Meghna'],
    ID=[1, 2],
    Salary=[100, pd.NaT],
)

df = pd.DataFrame(data=d1)

# Frame before the drop.
print(df)

# Row 1 has NaT in Salary, so it is removed; `df` is modified in place
# (axis=0 / how='any' are the defaults, written out explicitly).
df.dropna(axis=0, how='any', inplace=True)
print(df)

     Name  ID Salary
0  Pankaj   1    100
1  Meghna   2    NaT
     Name  ID Salary
0  Pankaj   1    100


In [3]:
# Drop Row/Column Only if All the Values are Null
# Demo: how='all' only removes a row (or column) when every cell in it is missing.
import pandas as pd
import numpy as np

d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', pd.NaT],
    ID=[1, 2, 3, pd.NaT],
    Salary=[100, 200, np.nan, pd.NaT],
    Role=[np.nan, np.nan, pd.NaT, pd.NaT],
)

df = pd.DataFrame(data=d1)

# Original frame: row 3 and column 'Role' contain nothing but missing values.
print(df)

# Row-wise: only the fully-empty row is removed.
df1 = df.dropna(axis=0, how='all')
print(df1)

# Column-wise: only the fully-empty column is removed (df1 is rebound here).
df1 = df.dropna(axis=1, how='all')
print(df1)

     Name   ID Salary Role
0  Pankaj    1    100  NaT
1  Meghna    2    200  NaT
2   David    3    NaN  NaT
3     NaT  NaT    NaT  NaT
     Name ID Salary Role
0  Pankaj  1    100  NaT
1  Meghna  2    200  NaT
2   David  3    NaN  NaT
     Name   ID Salary
0  Pankaj    1    100
1  Meghna    2    200
2   David    3    NaN
3     NaT  NaT    NaT


In [5]:
# Imputing example
# Build the `boston` frame (13 feature columns + 'target') used by the
# imputation cells that follow.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    # Available in scikit-learn < 1.2 (emits a FutureWarning from 1.0 onwards).
    from sklearn.datasets import load_boston

    boston_bunch = load_boston()
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same arrays from
    # the original source, following the recipe given in scikit-learn's
    # deprecation notice. NOTE: this branch needs network access.
    from sklearn.utils import Bunch

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record is spread over two physical lines in the raw file:
    # even lines hold 11 values, odd lines hold 2 more features and the target.
    boston_bunch = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

dfx = pd.DataFrame(boston_bunch.data, columns = boston_bunch.feature_names) #independent variables
dfy = pd.DataFrame(boston_bunch.target, columns = ['target']) #dependent variables
boston = dfx.join(dfy)

In [6]:
# Count the missing entries in each column of `boston`
boston.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

In [7]:
# Changing 20% values to NaN
# Blank out a random 20% of the cells while never emptying a whole row:
# at least one value is always left in every row.
import collections
import random

# Dedicated, seeded generator so the injected-NaN pattern is the same on every
# run and the module-level `random` state is left untouched.
rng = random.Random(42)

# NOTE: `df` is an alias of `boston`, not a copy - the NaNs are written into
# `boston` itself, which is what the following null-count cell inspects.
df = boston
replaced = collections.defaultdict(set)  # row position -> column positions already blanked
ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1])]
rng.shuffle(ix)
to_replace = int(round(.2*len(ix)))  # number of cells still to blank
for row, col in ix:
    # Check the budget before touching anything, so a budget of 0 (empty or
    # very small frame) blanks nothing instead of running through every cell.
    if to_replace <= 0:
        break
    # Skip the cell if blanking it would leave this row without any value.
    if len(replaced[row]) < df.shape[1] - 1:
        df.iloc[row, col] = np.nan
        to_replace -= 1
        replaced[row].add(col)

In [8]:
# Per-column count of missing entries, checked again now that values have been blanked out
boston.isna().sum()

CRIM        92
ZN         102
INDUS      105
CHAS       112
NOX        104
RM         106
AGE         92
DIS         91
RAD         99
TAX        105
PTRATIO    108
B          114
LSTAT      106
target      81
dtype: int64

In [None]:
# Fill NA with mean() of each column in boston dataset
def _fill_with_column_mean(column):
    """Return `column` with its missing entries replaced by the column's mean."""
    return column.fillna(column.mean())


# axis=0 hands the helper one column at a time; `df` is rebound to the result.
df = df.apply(_fill_with_column_mean, axis=0)

In [9]:
# Z-Score
# Demo of scipy.stats.zscore along each axis of a small 2x5 sample.
import numpy as np
from scipy import stats

arr1 = [
    [20, 2, 7, 1, 34],
    [50, 12, 12, 34, 4],
]

# arr2 is only printed below; both z-score calls operate on arr1.
arr2 = [
    [50, 12, 12, 34, 4],
    [12, 11, 10, 34, 21],
]

print("\narr1 : ", arr1)
print("\narr2 : ", arr2)

# axis=0 (the default): each column is standardised on its own.
zscore_by_column = stats.zscore(arr1, axis=0)
print("\nZ-score for arr1 : \n", zscore_by_column)

# axis=1: each row is standardised on its own.
zscore_by_row = stats.zscore(arr1, axis=1)
print("\nZ-score for arr1 : \n", zscore_by_row)


arr1 :  [[20, 2, 7, 1, 34], [50, 12, 12, 34, 4]]

arr2 :  [[50, 12, 12, 34, 4], [12, 11, 10, 34, 21]]

Z-score for arr1 : 
 [[-1. -1. -1. -1.  1.]
 [ 1.  1.  1.  1. -1.]]

Z-score for arr1 : 
 [[ 0.57251144 -0.85876716 -0.46118977 -0.93828264  1.68572813]
 [ 1.62005758 -0.61045648 -0.61045648  0.68089376 -1.08003838]]


In [10]:
# Min-Max Scaler
# Demo: rescale each column of a small sample with MinMaxScaler.
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

# Sample data: two columns on very different scales.
data = asarray([
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
])
print(data)

# Fit the scaler on the sample, then apply the fitted scaling to the same sample.
scaler = MinMaxScaler()
scaler.fit(data)
scaled = scaler.transform(data)
print(scaled)

[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


In [11]:
# Standardization
# Demo: standardise each column of a small sample with StandardScaler.
from numpy import asarray
from sklearn.preprocessing import StandardScaler

# Sample data: two columns on very different scales.
data = asarray([
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
])
print(data)

# Fit the scaler on the sample, then apply the fitted scaling to the same sample.
scaler = StandardScaler()
scaler.fit(data)
scaled = scaler.transform(data)
print(scaled)

[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[ 1.26398112 -1.16389967]
 [-1.06174414  0.12639634]
 [ 0.         -1.05856939]
 [ 0.96062565  0.65304778]
 [-1.16286263  1.44302493]]


In [26]:
# Label Encoding
# Demo: map each distinct category string to an integer code with LabelEncoder.

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Source categories, one row per bridge type.
bridge_types = (
    'Arch',
    'Beam',
    'Truss',
    'Cantilever',
    'Tied Arch',
    'Suspension',
    'Cable',
)
bridge_df = pd.DataFrame({'Bridge_Types': list(bridge_types)})

# Learn the set of labels first, then encode the column with the fitted encoder.
labelencoder = LabelEncoder()
labelencoder.fit(bridge_df['Bridge_Types'])

# Keep the original strings and store the integer codes in a second column.
bridge_df['Bridge_Types_Cat'] = labelencoder.transform(bridge_df['Bridge_Types'])
bridge_df

Unnamed: 0,Bridge_Types,Bridge_Types_Cat
0,Arch,0
1,Beam,1
2,Truss,6
3,Cantilever,3
4,Tied Arch,5
5,Suspension,4
6,Cable,2
