In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures # for polynomial features
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the chemical-process CSV. Forward slashes keep this relative path
# valid on Windows, Linux and macOS; the backslash spelling only resolves
# on Windows.
chem = pd.read_csv("../Cases/Chemical Process Data/ChemicalProcess.csv")
chem.head()

Unnamed: 0,Yield,BiologicalMaterial01,BiologicalMaterial02,BiologicalMaterial03,BiologicalMaterial04,BiologicalMaterial05,BiologicalMaterial06,BiologicalMaterial07,BiologicalMaterial08,BiologicalMaterial09,...,ManufacturingProcess36,ManufacturingProcess37,ManufacturingProcess38,ManufacturingProcess39,ManufacturingProcess40,ManufacturingProcess41,ManufacturingProcess42,ManufacturingProcess43,ManufacturingProcess44,ManufacturingProcess45
0,38.0,6.25,49.58,56.97,12.74,19.51,43.73,100.0,16.66,11.44,...,0.019,0.5,3,7.2,,,11.6,3.0,1.8,2.4
1,42.44,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.019,2.0,2,7.2,0.1,0.15,11.1,0.9,1.9,2.2
2,42.03,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,0.7,2,7.2,0.0,0.0,12.0,1.0,1.8,2.3
3,41.42,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,1.2,2,7.2,0.0,0.0,10.6,1.1,1.8,2.1
4,42.49,7.47,63.33,72.25,14.02,17.91,54.66,100.0,18.22,12.8,...,0.017,0.2,2,7.3,0.0,0.0,11.0,1.1,1.7,2.1


In [3]:
# Separate the response column from the predictor columns.
X = chem.drop(columns='Yield')
y = chem['Yield']


In [4]:
# Number of missing entries in each column.
chem.isna().sum(axis=0)

Yield                      0
BiologicalMaterial01       0
BiologicalMaterial02       0
BiologicalMaterial03       0
BiologicalMaterial04       0
BiologicalMaterial05       0
BiologicalMaterial06       0
BiologicalMaterial07       0
BiologicalMaterial08       0
BiologicalMaterial09       0
BiologicalMaterial10       0
BiologicalMaterial11       0
BiologicalMaterial12       0
ManufacturingProcess01     1
ManufacturingProcess02     3
ManufacturingProcess03    15
ManufacturingProcess04     1
ManufacturingProcess05     1
ManufacturingProcess06     2
ManufacturingProcess07     1
ManufacturingProcess08     1
ManufacturingProcess09     0
ManufacturingProcess10     9
ManufacturingProcess11    10
ManufacturingProcess12     1
ManufacturingProcess13     0
ManufacturingProcess14     1
ManufacturingProcess15     0
ManufacturingProcess16     0
ManufacturingProcess17     0
ManufacturingProcess18     0
ManufacturingProcess19     0
ManufacturingProcess20     0
ManufacturingProcess21     0
ManufacturingP

In [5]:
# Total number of missing cells: per-column counts first, then their sum.
na_per_column = chem.isna().sum()
na_per_column.sum()

106

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24
)

In [7]:
# Mean imputation: the column means are learned from the training split
# only and then applied unchanged to the test split.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.set_output(transform='pandas')  # return DataFrames instead of ndarrays
X_imputed_trn = imp.fit_transform(X_train)
X_imputed_tst = imp.transform(X_test)

In [8]:
# Peek at the first rows of the imputed training frame.
X_imputed_trn.head(n=5)

Unnamed: 0,BiologicalMaterial01,BiologicalMaterial02,BiologicalMaterial03,BiologicalMaterial04,BiologicalMaterial05,BiologicalMaterial06,BiologicalMaterial07,BiologicalMaterial08,BiologicalMaterial09,BiologicalMaterial10,...,ManufacturingProcess36,ManufacturingProcess37,ManufacturingProcess38,ManufacturingProcess39,ManufacturingProcess40,ManufacturingProcess41,ManufacturingProcess42,ManufacturingProcess43,ManufacturingProcess44,ManufacturingProcess45
126,6.23,52.95,66.71,12.75,16.31,45.84,100.0,16.7,12.75,2.98,...,0.02,1.0,3.0,7.2,0.0,0.0,11.4,0.6,1.9,2.2
64,6.37,52.67,64.34,12.02,17.4,46.52,100.0,17.38,12.48,2.75,...,0.019,1.0,2.0,7.0,0.0,0.0,11.6,0.9,2.0,2.3
53,6.87,57.33,71.52,13.22,15.62,50.85,100.0,17.74,13.16,2.91,...,0.02,1.4,3.0,7.3,0.0,0.0,11.6,1.2,1.9,2.2
141,6.25,54.57,67.56,12.1,17.66,47.8,100.0,17.32,12.89,2.73,...,0.02,1.0,3.0,7.2,0.0,0.0,11.7,0.6,1.8,2.3
9,6.94,63.6,72.06,15.7,19.42,54.72,100.0,18.85,13.13,3.85,...,0.019,1.8,3.0,7.1,0.0,0.0,11.3,0.8,1.9,2.4


In [9]:
# Peek at the first rows of the imputed test frame.
X_imputed_tst.head(n=5)

Unnamed: 0,BiologicalMaterial01,BiologicalMaterial02,BiologicalMaterial03,BiologicalMaterial04,BiologicalMaterial05,BiologicalMaterial06,BiologicalMaterial07,BiologicalMaterial08,BiologicalMaterial09,BiologicalMaterial10,...,ManufacturingProcess36,ManufacturingProcess37,ManufacturingProcess38,ManufacturingProcess39,ManufacturingProcess40,ManufacturingProcess41,ManufacturingProcess42,ManufacturingProcess43,ManufacturingProcess44,ManufacturingProcess45
108,5.79,53.96,66.53,10.4,18.26,47.57,100.0,17.24,12.99,2.16,...,0.02,1.5,2.0,7.1,0.0,0.1,11.7,0.8,1.9,1.9
159,5.7,52.77,66.25,10.5,15.18,47.07,100.0,16.67,12.84,2.17,...,0.019,1.3,2.0,7.2,0.0,0.0,12.1,0.7,1.8,2.1
93,7.22,57.32,71.02,14.05,19.16,49.21,100.0,18.02,13.43,3.52,...,0.019,0.9,2.0,7.2,0.0,0.0,11.9,0.9,1.9,2.5
151,6.25,52.68,65.12,11.64,18.11,45.42,100.0,17.51,13.01,2.88,...,0.02,0.8,3.0,7.4,0.0,0.0,11.6,0.7,1.9,1.9
57,6.65,55.61,68.93,12.72,15.91,48.64,100.0,17.87,13.31,2.98,...,0.02,0.5,3.0,7.3,0.0,0.0,11.3,0.0,2.0,2.2


In [10]:
# Baseline: linear regression trained on the imputed training split and
# scored with R^2 on the imputed test split.
lr = LinearRegression().fit(X_imputed_trn, y_train)
y_pred = lr.predict(X_imputed_tst)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.23930185859427278


In [11]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

# Same model as the manual impute-then-fit steps, expressed as one pipeline.
# Fresh estimators are used so the pipeline does not share (and refit)
# objects that were already fitted in earlier cells.
pipe = Pipeline([('IMP', SimpleImputer(strategy='mean')),
                 ('LR', LinearRegression())])

print(pipe.get_params())

# Feed the RAW splits: the pipeline's own imputer learns the column means
# from X_train during fit and reuses them on X_test during predict.
# Passing pre-imputed frames would leave the 'IMP' step with nothing to do.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test, y_pred))


{'memory': None, 'steps': [('IMP', SimpleImputer()), ('LR', LinearRegression())], 'verbose': False, 'IMP': SimpleImputer(), 'LR': LinearRegression(), 'IMP__add_indicator': False, 'IMP__copy': True, 'IMP__fill_value': None, 'IMP__keep_empty_features': False, 'IMP__missing_values': nan, 'IMP__strategy': 'mean', 'LR__copy_X': True, 'LR__fit_intercept': True, 'LR__n_jobs': None, 'LR__positive': False}
0.23930185859427278


In [17]:
# Cross-validated comparison of mean vs. median imputation ahead of a
# linear regression, using 5 shuffled folds with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
imp = SimpleImputer()
lr = LinearRegression()
pipe = Pipeline(steps=[('IMP', imp), ('LR', lr)])

print(pipe.get_params())

# Step parameters are addressed as <step name>__<parameter name>.
params = {'IMP__strategy': ['mean', 'median']}

gcv = GridSearchCV(estimator=pipe, param_grid=params, cv=kfold)
gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)

{'memory': None, 'steps': [('IMP', SimpleImputer()), ('LR', LinearRegression())], 'verbose': False, 'IMP': SimpleImputer(), 'LR': LinearRegression(), 'IMP__add_indicator': False, 'IMP__copy': True, 'IMP__fill_value': None, 'IMP__keep_empty_features': False, 'IMP__missing_values': nan, 'IMP__strategy': 'mean', 'LR__copy_X': True, 'LR__fit_intercept': True, 'LR__n_jobs': None, 'LR__positive': False}
{'IMP__strategy': 'mean'}
-8.218967417875016


Imputation, Standard Scaler, KNN


In [19]:
# Grid search over imputation strategy, scaler choice and neighbour count
# for a KNN regressor, using 5 shuffled folds with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
imp = SimpleImputer()
lr = LinearRegression()
std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
knn = KNeighborsRegressor()

# 'SCL' starts as None (no scaling); the grid swaps in each scaler in turn.
pipe = Pipeline(steps=[('IMP', imp), ('SCL', None), ('KNN', knn)])
print(pipe)
print(pipe.get_params())

params = {
    'IMP__strategy': ['mean', 'median'],
    'KNN__n_neighbors': list(range(1, 11)),
    'SCL': [std_scaler, mm_scaler, None],
}

gcv = GridSearchCV(estimator=pipe, param_grid=params, cv=kfold, verbose=1)
gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)

Pipeline(steps=[('IMP', SimpleImputer()), ('SCL', None),
                ('KNN', KNeighborsRegressor())])
{'memory': None, 'steps': [('IMP', SimpleImputer()), ('SCL', None), ('KNN', KNeighborsRegressor())], 'verbose': False, 'IMP': SimpleImputer(), 'SCL': None, 'KNN': KNeighborsRegressor(), 'IMP__add_indicator': False, 'IMP__copy': True, 'IMP__fill_value': None, 'IMP__keep_empty_features': False, 'IMP__missing_values': nan, 'IMP__strategy': 'mean', 'KNN__algorithm': 'auto', 'KNN__leaf_size': 30, 'KNN__metric': 'minkowski', 'KNN__metric_params': None, 'KNN__n_jobs': None, 'KNN__n_neighbors': 5, 'KNN__p': 2, 'KNN__weights': 'uniform'}
Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'IMP__strategy': 'mean', 'KNN__n_neighbors': 3, 'SCL': StandardScaler()}
0.4835363219733173


In [14]:
from sklearn import set_config

# Ask scikit-learn to display estimators as diagrams.
set_config(display="diagram")