In [1]:
# grid search kernel for gaussian process classifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
# define dataset
X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=1)
# define model
model = GaussianProcessClassifier()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['kernel'] = [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize best
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
# summarize all
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

Best Mean Accuracy: 0.913
Best Config: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.790 with: {'kernel': 1**2 * RBF(length_scale=1)}
>0.800 with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
>0.830 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.913 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.510 with: {'kernel': 1**2 * WhiteKernel(noise_level=1)}


In [7]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

In [8]:
data = pd.read_csv('Structured_features.csv')
data

Unnamed: 0,Longitudinal modulus,Shear modulus,Bulk modulus,Young modulus,Poisson's ratio,Hardness,fractal bond connectivity,acoustic impedance,Erbium Concentration
0,60.505571,20.125896,33.671042,50.346610,0.250792,3.343693,2.390885,17.829646,0.01
1,61.749309,20.539600,34.363176,51.381524,0.250792,3.412425,2.390885,18.196148,0.01
2,60.206152,20.026301,33.504417,50.097465,0.250792,3.327146,2.390885,17.741414,0.01
3,62.014179,20.627703,34.510575,51.601922,0.250792,3.427063,2.390885,18.274199,0.01
4,62.831822,20.899675,34.965589,52.282281,0.250792,3.472248,2.390885,18.515140,0.01
...,...,...,...,...,...,...,...,...,...
245,58.273308,19.835527,31.825938,49.270607,0.241979,3.411991,2.493001,17.663388,0.05
246,57.141365,19.450228,31.207728,48.313539,0.241979,3.345714,2.493001,17.320282,0.05
247,61.777979,21.028475,33.740012,52.233838,0.241979,3.617195,2.493001,18.725699,0.05
248,58.251540,19.828118,31.814050,49.252202,0.241979,3.410717,2.493001,17.656790,0.05


In [10]:
train_data = data.sample(frac=0.8, random_state=0)
test_data = data.drop(train_data.index)

print('Shape of training data :',train_data.shape)
print('Shape of testing data :',test_data.shape)

Shape of training data : (200, 9)
Shape of testing data : (50, 9)


In [11]:
train_x = train_data.drop(columns=['Erbium Concentration'],axis=1)
train_y = train_data['Erbium Concentration'].astype(str)

test_x = test_data.drop(columns=['Erbium Concentration'],axis=1)
test_y = test_data['Erbium Concentration'].astype(str)

In [22]:
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(
        kernel=kernel,
        random_state=0).fit(train_x, train_y)

In [23]:
gpc.score(train_x, train_y)

0.59

In [33]:
gpc.predict_proba(test_x)

array([[0.33680619, 0.00508841, 0.33680621, 0.3130319 , 0.0082673 ],
       [0.32102101, 0.01515159, 0.32102107, 0.29836101, 0.04444533],
       [0.26514881, 0.11007239, 0.26514888, 0.24643271, 0.11319721],
       [0.32028912, 0.02707239, 0.32028917, 0.29768076, 0.03466857],
       [0.33733857, 0.00599614, 0.33733858, 0.31352669, 0.00580001],
       [0.33403826, 0.00569341, 0.33403827, 0.31045933, 0.01577072],
       [0.33545313, 0.00577474, 0.33545314, 0.31177434, 0.01154465],
       [0.23484634, 0.29226306, 0.23484632, 0.21826909, 0.01977518],
       [0.19892281, 0.27309959, 0.19892281, 0.18488134, 0.14417345],
       [0.12792753, 0.46080245, 0.12792755, 0.11889746, 0.164445  ],
       [0.13606308, 0.50166794, 0.13606312, 0.12645878, 0.09974709],
       [0.13193966, 0.4977069 , 0.1319397 , 0.12262641, 0.11578733],
       [0.14676274, 0.41321446, 0.14676277, 0.13640316, 0.15685687],
       [0.12968836, 0.46086763, 0.12968838, 0.120534  , 0.15922163],
       [0.15057936, 0.53744787, 0.

In [34]:
prediction = gpc.predict(test_x)

In [35]:
dt_test_accuracy = accuracy_score(test_y,prediction)
print('test accuracy: ', dt_test_accuracy)

test accuracy:  0.58


In [36]:
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['kernel'] = [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
# define search
search = GridSearchCV(gpc, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(train_x, train_y)
# summarize best
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
# summarize all
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

Best Mean Accuracy: 0.627
Best Config: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.540 with: {'kernel': 1**2 * RBF(length_scale=1)}
>nan with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
>0.493 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.627 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.190 with: {'kernel': 1**2 * WhiteKernel(noise_level=1)}


In [None]:
DotProduct() + WhiteKernel()