In [1]:
from io import StringIO
import pandas as pd

# Small in-memory CSV sample with two deliberately empty fields:
# column C in the second data row and column D in the third.
csv_data = '''A,B,C,D
 1.0,2.0,3.0,4.0
 5.0,6.0,,8.0
 10.0,11.0,12.0,'''

# Wrap the text in a file-like object so read_csv can parse it as if it
# were a file; the empty fields are loaded as NaN.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [2]:
# Convert the DataFrame into its underlying NumPy array (missing entries show up as nan).
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# Dropping missing values

In [3]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [4]:
# axis=0: drop every row that contains at least one NaN.
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# axis=1: drop every column that contains at least one NaN.
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
# how='all': drop only rows in which every value is NaN.
# No such row exists here, so the frame comes back unchanged.
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# thresh=4: keep only rows that have at least 4 non-NaN values.
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# subset=['C']: drop rows only when the NaN is in column 'C'
# (NaN in other columns, e.g. 'D', is tolerated).
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# Imputing missing values

In [9]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: every NaN is replaced by the mean of its column.
# (The pandas counterpart is df.fillna(df.mean()).)
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the column means and applies them in a single call;
# `imr` is left fitted, exactly as after a separate fit() + transform().
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [10]:
# pandas-only alternative: fill each NaN with the mean of its own column.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


# Categorical data encoding with pandas 

In [53]:
 import pandas as pd
df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']

 df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [54]:
# Map each size string to an integer so that M < L < XL is preserved.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# Replace the size strings in the frame with their integer codes.
encoded_sizes = df['size'].map(size_mapping)
df['size'] = encoded_sizes
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [55]:
# Swap keys and values of size_mapping (integer code -> size string) ...
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))

# ... and use it to turn the integer codes back into size strings.
decoded_sizes = df['size'].map(inv_size_mapping)
df['size'] = decoded_sizes
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [56]:
import numpy as np

# np.unique yields the distinct class labels in sorted order;
# number them consecutively starting at 0.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))

class_mapping

{'class1': 0, 'class2': 1}

In [57]:
# Replace each class-label string with its integer code from class_mapping.
df['classlabel'] = df['classlabel'].map(class_mapping) 
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1


In [58]:
# Reverse lookup: integer code -> original class-label string.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

# Restore the string labels in the frame.
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [59]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder turns the class-label strings into integer codes,
# replacing the hand-written class_mapping dictionary.
class_le = LabelEncoder()
class_labels = df['classlabel'].values
y = class_le.fit_transform(class_labels)

y

array([1, 0, 1])

In [60]:
# Map the integer codes in y back to the original string labels.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

# Performing one-hot encoding on nominal features

In [62]:
# Pull the three feature columns out as a NumPy array.
X = df[['color', 'size', 'price']].values

# Overwrite the color strings in column 0 with integer codes.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes

X

array([[1, 'M', 10.1],
       [2, 'L', 13.5],
       [0, 'XL', 15.3]], dtype=object)

In [64]:
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values

color_ohe = OneHotEncoder()

# Reshape the color column to (n_samples, 1) before encoding.
color_column = X[:, 0].reshape(-1, 1)

# sklearn returns a sparse matrix here (only the positions of the non-zero
# values are stored); .toarray() converts it to a dense array.
color_ohe.fit_transform(color_column).toarray()



array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [69]:
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values

# Each entry is (step name, transformer, indices of the columns it acts on):
# one-hot encode columns 0 and 1, pass column 2 through untouched.
transformer_steps = [
    ('onehot', OneHotEncoder(), [0, 1]),
    ('nothing', 'passthrough', [2]),
]
c_transf = ColumnTransformer(transformer_steps)

c_transf.fit_transform(X).astype(float)



array([[ 0. ,  1. ,  0. ,  0. ,  1. ,  0. , 10.1],
       [ 0. ,  0. ,  1. ,  1. ,  0. ,  0. , 13.5],
       [ 1. ,  0. ,  0. ,  0. ,  0. ,  1. , 15.3]])

In [78]:
# Rebuild the toy dataset from scratch, naming the columns at construction.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

# get_dummies one-hot encodes the string columns (color, size) and leaves the
# numeric price column as it is; astype(float) shows the result as 0.0 / 1.0.
feature_frame = df[['price', 'color', 'size']]
pd.get_dummies(feature_frame).astype(float)

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0.0,1.0,0.0,0.0,1.0,0.0
1,13.5,0.0,0.0,1.0,1.0,0.0,0.0
2,15.3,1.0,0.0,0.0,0.0,0.0,1.0


In [81]:
# drop='first' omits the first category of every encoded feature, so each
# feature with k categories yields k-1 indicator columns.
color_ohe = OneHotEncoder(categories='auto', drop='first')

# X is the feature array built in an earlier cell (color, size, price).
transformer_steps = [
    ('onehot', color_ohe, [0, 1]),
    ('nothing', 'passthrough', [2]),
]
c_transf = ColumnTransformer(transformers=transformer_steps)

c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. ,  0. , 10.1],
       [ 0. ,  1. ,  0. ,  0. , 13.5],
       [ 0. ,  0. ,  0. ,  1. , 15.3]])