In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [13]:
X = [[1, 2, np.nan, 'Male'], [3, 4, 3, np.NaN], [np.nan, 6, 5, 'Female'], [8, 8, 7, np.NaN]]
X = pd.DataFrame(data=X, columns=['A', 'B', 'C', 'D'])
X.head()

Unnamed: 0,A,B,C,D
0,1.0,2,,Male
1,3.0,4,3.0,
2,,6,5.0,Female
3,8.0,8,7.0,


In [14]:
details = pd.DataFrame({'missing_count' : X.isnull().sum(), 'data_type':X.dtypes , 'missing_percent': ((X.isnull().sum()/X.isnull().count())*100)})
missing = details.query('missing_count > 0')
missing

Unnamed: 0,missing_count,data_type,missing_percent
A,1,float64,25.0
C,1,float64,25.0
D,2,object,50.0


## Each sample’s missing values are imputed using the mean value from n_neighbors nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.

In [15]:
from sklearn.impute import KNNImputer

In [17]:
imputer = KNNImputer(n_neighbors=2, weights="uniform")
res = imputer.fit_transform(X.drop(columns=['D']))

In [18]:
resDF=pd.DataFrame(data=res, columns=['A', 'B', 'C'])

In [19]:
resDF.isnull().sum()

A    0
B    0
C    0
dtype: int64

In [20]:
resDF.head()

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,3.0,4.0,3.0
2,5.5,6.0,5.0
3,8.0,8.0,7.0


# Linear value imputation

In [21]:
dataset = pd.read_csv(filepath_or_buffer=r'./titanic/train.csv')

In [27]:
X = dataset
details = pd.DataFrame({'missing_count' : X.isnull().sum(), 'data_type':X.dtypes , 'missing_percent': ((X.isnull().sum()/X.isnull().count())*100)})
missing = details.query('missing_count > 0')
missing

Unnamed: 0,missing_count,data_type,missing_percent
Age,177,float64,19.86532
Cabin,687,object,77.104377
Embarked,2,object,0.224467


In [22]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
data1 = dataset[['Pclass','SibSp','Parch','Fare','Age']]

x_train = data1[data1['Age'].notnull()].drop(columns='Age')
y_train = data1[data1['Age'].notnull()]['Age']
x_test = data1[data1['Age'].isnull()].drop(columns='Age')
y_test = data1[data1['Age'].isnull()]['Age']

In [25]:
linreg.fit(x_train, y_train)
predicted = linreg.predict(x_test)
data1.Age[dataset.Age.isnull()] = predicted

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
data1.isnull().sum()

Pclass    0
SibSp     0
Parch     0
Fare      0
Age       0
dtype: int64

## Multivariate imputer that estimates each feature from all the others.

## A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.

In [28]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [29]:
details = pd.DataFrame({'missing_count' : X.isnull().sum(), 'data_type':X.dtypes , 'missing_percent': ((X.isnull().sum()/X.isnull().count())*100)})
missing = details.query('missing_count > 0')
missing

Unnamed: 0,missing_count,data_type,missing_percent
Age,177,float64,19.86532
Cabin,687,object,77.104377
Embarked,2,object,0.224467


In [46]:
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(dataset[['Pclass','SibSp','Parch','Fare','Age']][dataset.Age.notnull()])

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [47]:
IterativeImputer(random_state=0)
arr = imp.transform(dataset[['Pclass','SibSp','Parch','Fare','Age']][dataset.Age.isnull()])
resDF1=pd.DataFrame(data=arr, columns=['Pclass','SibSp','Parch','Fare','Age'])

In [48]:
X = resDF1
details = pd.DataFrame({'missing_count' : X.isnull().sum(), 'data_type':X.dtypes , 'missing_percent': ((X.isnull().sum()/X.isnull().count())*100)})
missing = details.query('missing_count > 0')
missing

Unnamed: 0,missing_count,data_type,missing_percent


## Note that categorical varriable have to encoded for any of above imputation