In [334]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [335]:
## modules to use:
# preprocessing
# impute - to fill missing values
# feature_selection
# decomposition

In [336]:
###  zero-centered or mean-subtraction and scaling
## preprocessing.MinMaxScaler -> [0,1]
from sklearn.preprocessing import MinMaxScaler

In [337]:
data = [[-1,2],[-0.5,6],[0,10],[1,18]]
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [338]:
scaler = MinMaxScaler(feature_range=[5,10]) #by default, feature_range=[0,1]
scaler.fit(data)
#sometimes .fit doesn't work because of enoumous amount of features
# in such cases, we can use .partial_fit
# scaler = scaler.partial_fit(data)
result = scaler.transform(data) 
result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

In [339]:
scaler.inverse_transform(result)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [340]:
# or we can use numpy to do all of the above
datanp = np.array(data)
datanp

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [341]:
datanp_nor = (datanp-datanp.min(axis=0))/(datanp.max(axis=0)-datanp.min(axis=0))
datanp_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [342]:
datanp_returned = datanp_nor * (datanp.max(axis=0)-datanp.min(axis=0)) + datanp.min(axis=0)
datanp_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [343]:
## standard scaling
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
scaler.fit(data)
print(scaler.mean_)
print(scaler.var_)
x_std = scaler.transform(data)
x_std

[-0.125  9.   ]
[ 0.546875 35.      ]


array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [344]:
scaler.fit_transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [345]:
scaler.inverse_transform(x_std)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [346]:
### fill missing values
data = pd.read_csv(r"E:\machine learning\decision tree\titanic\train.csv"
                  ,index_col=0)

In [347]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [348]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [349]:
##.impute.SimpleImputer
#parameters:
#missing_values: np.nan by default
#strategy: mean (number), median (number), most_frequent (number and character), constant (number and character)
#fill_value: for strategy = 'constant'
#copy: True by default
from sklearn.impute import SimpleImputer

In [350]:
age = data.loc[:,'Age'].values.reshape(-1,1)

In [351]:
imp_mean = SimpleImputer(strategy='mean')
imp_median = SimpleImputer(strategy='median')
imp_0 = SimpleImputer(strategy='constant',fill_value=0)

In [352]:
age_imp_mean = imp_mean.fit_transform(age)
age_imp_median = imp_median.fit_transform(age)
age_imp_0 = imp_0.fit_transform(age)

In [353]:
data.loc[:,'Age'] = age_imp_median
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [354]:
# or we can use numpy and pandas to fill missing values
data_ = pd.read_csv(r"E:\machine learning\decision tree\titanic\train.csv"
                  ,index_col=0)

In [355]:
age = data_.loc[:,'Age']
data_.loc[:,'Age'] = age.fillna(age.median())
data_.loc[:,'Age'][:20]

PassengerId
1     22.0
2     38.0
3     26.0
4     35.0
5     35.0
6     28.0
7     54.0
8      2.0
9     27.0
10    14.0
11     4.0
12    58.0
13    20.0
14    39.0
15    14.0
16    55.0
17     2.0
18    28.0
19    31.0
20    28.0
Name: Age, dtype: float64

In [356]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [357]:
data_.drop(['Cabin'],axis=1
           ,inplace=True
          )
data_.dropna(axis=0,inplace=True)

In [358]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 10 columns):
Survived    889 non-null int64
Pclass      889 non-null int64
Name        889 non-null object
Sex         889 non-null object
Age         889 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Ticket      889 non-null object
Fare        889 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 76.4+ KB


In [359]:
### preprocess characters -- encoding characters and dummy variables
##.preprocess.LabelEncoder : characters -> numbers
from sklearn.preprocessing import LabelEncoder

In [360]:
y = data.loc[:,'Sex']
y[:5]
# note that LabelEncoder accept one-dimension ndarray 

PassengerId
1      male
2    female
3    female
4    female
5      male
Name: Sex, dtype: object

In [361]:
le = LabelEncoder()
le.fit(y)
sex_le = le.transform(y)
sex_le[:5]

array([1, 0, 0, 0, 1])

In [362]:
le.classes_

array(['female', 'male'], dtype=object)

In [363]:
le.fit_transform(y)
le.inverse_transform(sex_le)

array(['male', 'female', 'female', 'female', 'male', 'male', 'male',
       'male', 'female', 'female', 'female', 'female', 'male', 'male',
       'female', 'female', 'male', 'male', 'female', 'female', 'male',
       'male', 'female', 'male', 'female', 'female', 'male', 'male',
       'female', 'male', 'male', 'female', 'female', 'male', 'male',
       'male', 'male', 'male', 'female', 'female', 'female', 'female',
       'male', 'female', 'female', 'male', 'male', 'female', 'male',
       'female', 'male', 'male', 'female', 'female', 'male', 'male',
       'female', 'male', 'female', 'male', 'male', 'female', 'male',
       'male', 'male', 'male', 'female', 'male', 'female', 'male', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'female', 'male', 'male', 'female', 'male', 'female', 'female',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'female', 'male', 'female', 'male',
      

In [364]:
data.loc[:,'Sex'] = LabelEncoder().fit_transform(data.loc[:,'Sex'])
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [365]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int32(1), int64(4), object(4)
memory usage: 80.1+ KB


In [366]:
data.drop(['Cabin'], axis=1, inplace=True)
data.dropna(inplace=True)
data.loc[:,'Embarked'] = LabelEncoder().fit_transform(data.loc[:,'Embarked'].values)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 10 columns):
Survived    889 non-null int64
Pclass      889 non-null int64
Name        889 non-null object
Sex         889 non-null int32
Age         889 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Ticket      889 non-null object
Fare        889 non-null float64
Embarked    889 non-null int32
dtypes: float64(2), int32(2), int64(4), object(2)
memory usage: 69.5+ KB


In [367]:
data.loc[:,'Embarked'].unique()

array([2, 0, 1], dtype=int64)

In [368]:
data.loc[:,'Embarked'][:20]

PassengerId
1     2
2     0
3     2
4     2
5     2
6     1
7     2
8     2
9     2
10    0
11    2
12    2
13    2
14    2
15    2
16    2
17    1
18    2
19    2
20    0
Name: Embarked, dtype: int32

In [369]:
##.OrdinalEncoder() works exactly like LabelEncoder()
## except that it only accept 2d array
## the method .classes_ in LabelEncoder() becomes 
## .categories_ in OrdinalEncoder() 

In [370]:
## preprocessing.OneHotEncoder()
from sklearn.preprocessing import OneHotEncoder
data = pd.read_csv(r"E:\machine learning\decision tree\titanic\train.csv"
                  ,index_col=0)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [371]:
x = data.loc[:,['Sex','Embarked']]
x.dropna(how='any',inplace=True)
x.isnull().sum()

Sex         0
Embarked    0
dtype: int64

In [372]:
ohe = OneHotEncoder(categories='auto')
ohe.fit(x)
x_ohe = ohe.transform(x).toarray()
x_ohe.shape

(889, 5)

In [393]:
ohe.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [373]:
ohe.inverse_transform(x_ohe)

array([['male', 'S'],
       ['female', 'C'],
       ['female', 'S'],
       ...,
       ['female', 'S'],
       ['male', 'C'],
       ['male', 'Q']], dtype=object)

In [374]:
ohe.get_feature_names().tolist()

['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S']

In [375]:
data.drop(['Cabin'],axis=1,inplace=True)

In [376]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 76.6+ KB


In [377]:
data.loc[:,'Age'] = data.loc[:,'Age'].fillna(data.loc[:,'Age'].median())

In [378]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 76.6+ KB


In [379]:
data.dropna(how='any',inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 10 columns):
Survived    889 non-null int64
Pclass      889 non-null int64
Name        889 non-null object
Sex         889 non-null object
Age         889 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Ticket      889 non-null object
Fare        889 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 76.4+ KB


In [384]:
data.drop(['Sex','Embarked'],axis=1,inplace=True)

In [388]:
data_new = pd.concat([data,pd.DataFrame(x_ohe)],axis=1)

In [390]:
data_new.head()

Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,0,1,2,3,4
0,,,,,,,,,0.0,1.0,0.0,0.0,1.0
1,0.0,3.0,"Braund, Mr. Owen Harris",22.0,1.0,0.0,A/5 21171,7.25,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1.0,0.0,PC 17599,71.2833,1.0,0.0,0.0,0.0,1.0
3,1.0,3.0,"Heikkinen, Miss. Laina",26.0,0.0,0.0,STON/O2. 3101282,7.925,1.0,0.0,0.0,0.0,1.0
4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1.0,0.0,113803,53.1,0.0,1.0,0.0,0.0,1.0


In [391]:
data_new.columns = data.columns.tolist() + ohe.get_feature_names().tolist()

In [395]:
data_new.dropna(how='any',inplace=True)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833
3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1
5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05


In [394]:
### to encode the labels as dummy variables 
### we can use sklearn.preprocessing.LabelBinarizer

In [396]:
#### how to catogarize continous features 
### .preprocessing.Binarizer
from sklearn.preprocessing import Binarizer
data2 = data_new.copy()

In [397]:
#binarize Age
age = data2.loc[:,'Age'].values.reshape(-1,1)

In [398]:
age = Binarizer(threshold=40).fit_transform(age)
age[:20]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [401]:
## binning continuous data, age for example
# KBinsDiscretizer
#parameters:
#n_bins - number of bins, 5 by default
#encode:'onehot'(by default), 'ordinal'
#strategy - width of bins: 'uniform', 'quantile'(by default), 'kmeans'
from sklearn.preprocessing import KBinsDiscretizer

In [402]:
age = data2.loc[:,'Age'].values.reshape(-1,1)

In [403]:
kbd = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform')
kbd.fit_transform(age) 

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [2.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],

In [404]:
set(kbd.fit_transform(age).ravel())

{0.0, 1.0, 2.0}

In [405]:
kbd2 = kbd = KBinsDiscretizer(n_bins=3,encode='onehot',strategy='uniform')
kbd.fit_transform(age).toarray() 

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])