# Script to load the example data and test the iterative imputer from sklearn

## Looks like you can't use the iterative_imputer on categorical data. Numberical only

### Load libraries

In [9]:
# The iterative imputer is still an experimental thing, so you've got to enable it explicitly
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import pandas as pd
import numpy as np

### Load data

In [10]:
df = pd.read_csv('data.csv').iloc[:,1:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   boatClass    1000 non-null   object 
 1   boatLength   1000 non-null   int64  
 2   boatEngines  800 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ KB


In [11]:
imputer = IterativeImputer(missing_values=np.nan
                          , tol=1e-3
                          , add_indicator=True
                          , n_nearest_features=2
                          , initial_strategy='most_frequent')
imputer.fit(df[['boatLength', 'boatEngines']])

IterativeImputer(add_indicator=True, initial_strategy='most_frequent',
                 n_nearest_features=2)

In [18]:
imputer.imputation_sequence_

[_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1], dtype=int64), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0], dtype=int64), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1], dtype=int64), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0], dtype=int64), estimator=BayesianRidge())]

In [24]:
df2 = pd.DataFrame(imputer.transform(df[['boatLength', 'boatEngines']]))
df2.columns = ['boatLength', 'boatEngines', 'boatEngines_na']
df2.loc[df2['boatEngines_na'] == 1]

Unnamed: 0,boatLength,boatEngines,boatEngines_na
12,18.0,1.989446,1.0
16,53.0,1.858218,1.0
19,16.0,1.996945,1.0
22,54.0,1.854469,1.0
23,59.0,1.835722,1.0
...,...,...,...
975,21.0,1.978198,1.0
984,57.0,1.843221,1.0
986,26.0,1.959451,1.0
990,21.0,1.978198,1.0


In [40]:
for length in list(sorted(df['boatLength'].unique()))[1:]:
    try:
        most_common_engines = df.loc[df['boatLength'] == length, 'boatEngines'].value_counts()[0]
    except KeyError:
        most_common_engines = np.nan
        
    print(length, most_common_engines)

1 3
2 1
3 2
4 nan
5 6
6 5
7 nan
8 1
9 7
10 2
11 1
12 2
13 3
14 2
15 2
16 4
17 3
18 5
19 2
20 3
21 3
22 nan
23 2
24 6
25 1
26 1
27 3
28 4
29 4
30 5
31 4
32 1
33 2
34 5
35 3
36 nan
37 4
38 6
39 5
40 1
41 2
42 2
43 2
44 6
45 1
46 2
47 4
48 2
49 2
50 2
51 7
52 7
53 6
54 5
55 5
56 3
57 1
58 1
59 1
