In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

In [5]:
# Small in-memory CSV used to demonstrate missing-value handling:
# the second data row has an empty C field and the third an empty D field.
csv_data = '\n'.join([
    'A,B,C,D',
    '    1.0,2.0,3.0,4.0',
    '    5.0,6.0,,7.0',
    '    10.0,20.0,11.0,',
])

In [7]:
# Parse the CSV text from an in-memory buffer; the two empty fields are read as NaN.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,7.0
2,10.0,20.0,11.0,


In [8]:
# Eliminate training examples (rows) that contain at least one missing value.
# The result is only displayed, not assigned, so df itself keeps all rows.
df.dropna(axis = 0) # axis=0 removes rows

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
df.dropna(axis = 1) # axis=1 removes every column that contains at least one NaN

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,20.0


In [12]:
# Only drop rows in which every column is NaN
# (no such row exists in this frame, so nothing is removed).
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,7.0
2,10.0,20.0,11.0,


Removing rows or columns may seem convenient, but in many cases it leads to a loss of information. For example, we may remove so many rows that the model can no longer learn well from the dataset, or we may delete so many columns that the model loses features that are critical for the task.

An alternative is to impute the missing values. The simplest approach is mean imputation, which replaces each missing value with the mean of its feature column.

In [13]:
from sklearn.impute import SimpleImputer

In [16]:
# Mean imputation: every NaN is replaced by the mean of its own column,
# so the mean is calculated separately for each feature.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the per-column means and applies them in a single call.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7. ,  7. ],
       [10. , 20. , 11. ,  5.5]])

In [17]:
# Alternatively, do the same mean imputation directly in pandas:
# df.mean() gives one mean per column, and fillna fills each column's NaNs with it.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,7.0
2,10.0,20.0,11.0,5.5


In [61]:
# Categorical data encoding with pandas.
# Toy frame with one column each for color, size, price and the class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'class_labels'],
)

In [62]:
df
# Feature types in this toy dataset:
#   nominal feature (categories without an order) - color
#   ordinal feature (categories with an order)    - size

Unnamed: 0,color,size,price,class_labels
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [63]:
# Ordinal encoding: the sizes have a natural order (M < L < XL), so each label
# is mapped to an integer that preserves that order for the model.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,class_labels
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [64]:
# To transform the integer codes back into their original size labels
# we need the reverse mapping (code -> label).
inv_size_mapping = {code: label for label, code in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [65]:
# Encode the class labels: enumerate the distinct labels returned by np.unique
# and use each label's position as its integer code.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df['class_labels']))
}

In [66]:
# Inspect the label -> integer code mapping.
class_mapping

{'class1': 0, 'class2': 1}

In [67]:
# Replace each class-label string with its integer code from class_mapping.
df['class_labels'] = df['class_labels'].map(class_mapping)
df

Unnamed: 0,color,size,price,class_labels
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [68]:
# We can also create an inverse mapping (integer code -> class label)
# to revert the class encoding.
inv_class_map = {v:k for k,v in class_mapping.items()}

In [69]:
# Inspect the integer code -> label mapping.
inv_class_map

{0: 'class1', 1: 'class2'}

In [70]:
# Map the integer codes back to the original class-label strings.
df['class_labels'] = df['class_labels'].map(inv_class_map)

In [71]:
# Display the frame: class_labels holds the original strings again.
df

Unnamed: 0,color,size,price,class_labels
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [72]:
#Alternatively, we can use scikit-learn's LabelEncoder to encode the class labels
from sklearn.preprocessing import LabelEncoder

In [74]:
# LabelEncoder assigns an integer code to each distinct class label.
class_le = LabelEncoder()
# fit_transform fits the encoder on the labels and returns their integer codes in one call.
y = class_le.fit_transform(df['class_labels'].values)
y

array([1, 0, 1])

In [75]:
# The inverse_transform method converts the integer codes back to the original string labels.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)