## Imputer function from sklearn

In [46]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [30]:
# Small 6x3 demo matrix with a few missing entries (NaN) in every column.
a = np.array(
    [
        [2, 7, 45],
        [4, 3, 33],
        [np.nan, 1, np.nan],
        [6, np.nan, 56],
        [np.nan, 7, 67],
        [5, 4, 87],
    ]
)

# Mean imputation: every NaN is replaced by the mean of its column.
imputer = SimpleImputer(strategy="mean")

# fit() computes the per-column means and keeps them in `statistics_`.
imputer.fit(a)

# These are the values that will be substituted for NaN, one per column.
print(imputer.statistics_)

# transform() applies the stored means. The result is not displayed here
# because only the last expression of a cell is rendered.
imputer.transform(a)

# fit_transform() performs both steps in a single call; as the last
# expression of the cell, its result is what gets displayed.
imputer.fit_transform(a)

[ 4.25  4.4  57.6 ]


array([[ 2.  ,  7.  , 45.  ],
       [ 4.  ,  3.  , 33.  ],
       [ 4.25,  1.  , 57.6 ],
       [ 6.  ,  4.4 , 56.  ],
       [ 4.25,  7.  , 67.  ],
       [ 5.  ,  4.  , 87.  ]])

### Chemical Process Dataset

In [23]:
# Location of the Chemical Process dataset.
# NOTE(review): this is an absolute, machine-specific path - adjust it (or make
# it relative to a data directory) before running on another machine.
csv_path = "/home/darkstar/Documents/pg-dbda/module7_statistics/Daywise Study Material/datasets/ChemicalProcess.csv"

# Read the CSV into a DataFrame and display it as the cell output.
chem = pd.read_csv(csv_path)
chem

Unnamed: 0,Yield,BiologicalMaterial01,BiologicalMaterial02,BiologicalMaterial03,BiologicalMaterial04,BiologicalMaterial05,BiologicalMaterial06,BiologicalMaterial07,BiologicalMaterial08,BiologicalMaterial09,...,ManufacturingProcess36,ManufacturingProcess37,ManufacturingProcess38,ManufacturingProcess39,ManufacturingProcess40,ManufacturingProcess41,ManufacturingProcess42,ManufacturingProcess43,ManufacturingProcess44,ManufacturingProcess45
0,38.00,6.25,49.58,56.97,12.74,19.51,43.73,100.0,16.66,11.44,...,0.019,0.5,3,7.2,,,11.6,3.0,1.8,2.4
1,42.44,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.019,2.0,2,7.2,0.1,0.15,11.1,0.9,1.9,2.2
2,42.03,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,0.7,2,7.2,0.0,0.00,12.0,1.0,1.8,2.3
3,41.42,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,1.2,2,7.2,0.0,0.00,10.6,1.1,1.8,2.1
4,42.49,7.47,63.33,72.25,14.02,17.91,54.66,100.0,18.22,12.80,...,0.017,0.2,2,7.3,0.0,0.00,11.0,1.1,1.7,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,39.66,6.71,56.32,66.19,12.35,20.02,50.26,100.0,17.54,12.50,...,,2.3,0,0.0,0.0,0.00,0.0,0.6,0.0,0.0
172,39.68,6.87,56.74,66.61,12.55,20.18,50.80,100.0,17.48,12.41,...,,1.0,0,0.0,0.0,0.00,0.0,0.6,0.0,0.0
173,42.23,7.50,58.41,68.30,13.33,20.81,52.96,100.0,17.23,12.04,...,,1.3,0,0.0,0.0,0.00,0.0,0.6,0.0,0.0
174,38.48,7.53,58.36,69.25,14.35,20.57,51.31,100.0,17.87,12.77,...,,2.3,0,0.0,0.0,0.00,0.0,0.5,0.0,0.0


In [27]:
# Target: the column we want to predict.
y = chem['Yield']

# Features: every column except the target. 'Unnamed: 0' is the row index that
# was saved into the CSV (0, 1, 2, ...), not a measurement, so it must not be
# fed to the model as a predictor. errors='ignore' keeps this cell working if
# the frame was loaded with index_col=0 and the column is therefore absent.
X = chem.drop(columns=['Yield', 'Unnamed: 0'], errors='ignore')

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

By fitting the imputer only on the training data and then using those training-set statistics to transform both the training and test data, we keep the imputation consistent across both sets and prevent information from the test set from leaking into preprocessing. This gives a fair evaluation of the model's performance on unseen data.

In [32]:
# Learn the imputation values from the training rows only
# (SimpleImputer's default strategy is the column mean).
imputer = SimpleImputer().fit(X_train)

# Apply the same training-derived values to both splits.
imp_X_train = imputer.transform(X_train)
imp_X_test = imputer.transform(X_test)

In [33]:
# Expand the imputed features with all degree-2 polynomial and interaction terms.
poly = PolynomialFeatures(degree=2)

# Fit the transformer on the training data only...
X_train_poly = poly.fit_transform(imp_X_train)

# ...and reuse the already-fitted transformer on the test data. Calling
# fit_transform here would re-fit on the test set, which breaks the
# "fit on train, transform on test" rule applied to the imputer as well.
X_test_poly = poly.transform(imp_X_test)

In [37]:
# Fit ordinary least squares on the polynomial-expanded training features.
lr = LinearRegression().fit(X_train_poly, y_train)

# Score on the held-out split: mean squared error of the predictions.
y_pred = lr.predict(X_test_poly)
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

43.00838650014057


## Pipelines

In [42]:
# Same three steps as the manual workflow (impute -> polynomial features ->
# linear regression), chained in a Pipeline. pipe.fit() fits every step on the
# training data only, and pipe.predict() merely transforms the test data with
# the fitted steps, so the train/test separation is handled automatically.
# (Pipeline is imported with the other sklearn imports at the top of the notebook.)
imputer = SimpleImputer()
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()

# Step names are upper-case to match the names used for grid-search
# parameters such as 'IMPUTER__strategy' and 'POLY__degree'.
pipe = Pipeline([('IMPUTER', imputer), ('POLY', poly), ('LR', lr)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(mean_squared_error(y_test, y_pred))


43.00838650014057


In [47]:
# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=23)

# Pipeline to tune: impute -> polynomial features -> linear regression.
imputer = SimpleImputer()
poly = PolynomialFeatures()
lr = LinearRegression()
pipe = Pipeline(
    steps=[
        ('IMPUTER', imputer),
        ('POLY', poly),
        ('LR', lr),
    ]
)

# List every tunable parameter; grid keys use the '<STEP>__<param>' naming.
print(pipe.get_params())

# Candidate grid: 2 imputation strategies x 3 polynomial degrees = 6 combinations.
params = {
    'IMPUTER__strategy': ['mean', 'median'],
    'POLY__degree': [1, 2, 3],
}

# Scoring is negated MSE, so a score closer to zero is better.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    verbose=3,
    cv=kfold,
    scoring='neg_mean_squared_error',
)
gcv.fit(X, y)

# Best parameter combination and its mean cross-validated score.
print(gcv.best_params_)
print(gcv.best_score_)

# Save the full per-candidate results table for later inspection.
gcv_res = pd.DataFrame(gcv.cv_results_)
gcv_res.to_csv("GridResults.csv")


{'memory': None, 'steps': [('IMPUTER', SimpleImputer()), ('POLY', PolynomialFeatures()), ('LR', LinearRegression())], 'verbose': False, 'IMPUTER': SimpleImputer(), 'POLY': PolynomialFeatures(), 'LR': LinearRegression(), 'IMPUTER__add_indicator': False, 'IMPUTER__copy': True, 'IMPUTER__fill_value': None, 'IMPUTER__keep_empty_features': False, 'IMPUTER__missing_values': nan, 'IMPUTER__strategy': 'mean', 'POLY__degree': 2, 'POLY__include_bias': True, 'POLY__interaction_only': False, 'POLY__order': 'C', 'LR__copy_X': True, 'LR__fit_intercept': True, 'LR__n_jobs': None, 'LR__positive': False}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END IMPUTER__strategy=mean, POLY__degree=1;, score=-5.551 total time=   0.0s
[CV 2/5] END IMPUTER__strategy=mean, POLY__degree=1;, score=-1.957 total time=   0.0s
[CV 3/5] END IMPUTER__strategy=mean, POLY__degree=1;, score=-84.655 total time=   0.0s
[CV 4/5] END IMPUTER__strategy=mean, POLY__degree=1;, score=-5.076 total time=   0.0s
