In [None]:
# Load feature vector csv
import os

import numpy as np
import pandas as pd

# Silence the "unique classes > 50% of samples" warning (it appears to come from scikit-learn's target-type check rather than from pandas)
import warnings
from sklearn.utils.multiclass import type_of_target

# Ignore every warning whose message starts with the text below.
warnings.filterwarnings(
    "ignore",
    message="The number of unique classes is greater than 50% of the number of samples."
)

# Location of the feature-vector CSV.
# The path can be overridden without editing the notebook by setting the
# FEATURE_CSV_PATH environment variable; when it is unset, the original
# machine-specific location is used, so existing runs behave as before.
DEFAULT_CSV_PATH = r"C:\Users\david\OneDrive\Documenti\Tesi_BehavBio\Programs\Feature_csv\feature_vector.csv"
csv_path = os.environ.get("FEATURE_CSV_PATH", DEFAULT_CSV_PATH)

# Load the full table and show it (pandas truncates the rendering of large frames).
dataset = pd.read_csv(csv_path)
display(dataset)


Unnamed: 0,file_key,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82
0,T1_S1_TRY1_VB_SL_LIT,57,0.04688,0.60925,0.251515,0.228373,0.246580,0.108956,0.071780,0.900959,...,0.18165,0.18165,1.079605,1.610681,2.843415,3.024820,33,23,41,15
1,T1_S1_TRY1_HS_SL_LIT,45,0.11426,0.77112,0.322219,0.282340,0.267210,0.172174,0.099730,0.966898,...,0.12931,0.17533,0.508646,0.626663,0.901792,0.438980,22,22,23,21
2,T1_S1_TRY1_VB_SL_BIG,59,0.03345,0.93579,0.239702,0.215981,0.226560,0.128417,0.041380,3.328440,...,0.18513,0.18513,1.060574,1.026440,5.407974,5.173056,37,21,43,15
3,T1_S1_TRY1_HS_SL_BIG,90,0.01978,0.72949,0.154105,0.132798,0.120730,0.106873,0.032840,3.084167,...,0.15181,0.21207,2.847437,3.061352,2.888738,2.369702,45,43,46,42
4,T1_S1_TRY1_VB_FA_LIT,65,0.02087,1.28003,0.239979,0.207705,0.221190,0.159702,0.063050,4.316584,...,0.00000,0.00000,1.624835,1.000478,4.037489,3.748897,44,19,48,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,T33_S3_TRY3_HS_SL_BIG,42,0.09625,0.99805,0.373094,0.319370,0.300230,0.225034,0.104915,1.419665,...,0.00000,0.00000,0.475639,0.557531,0.921234,0.387021,23,18,24,17
2300,T33_S3_TRY3_VB_FA_LIT,57,0.02625,0.62970,0.261618,0.234139,0.247500,0.116631,0.061700,0.879215,...,0.00000,0.00000,1.291447,0.896257,2.576642,2.272187,38,18,38,18
2301,T33_S3_TRY3_HS_FA_LIT,55,0.06659,1.94910,0.291847,0.253836,0.261050,0.242696,0.059450,5.758985,...,0.06423,0.06423,0.668593,0.677116,1.471649,1.645241,23,31,18,36
2302,T33_S3_TRY3_VB_FA_BIG,58,0.09448,0.51691,0.247106,0.227703,0.245025,0.094850,0.080200,0.305149,...,0.00000,0.00000,1.755014,1.697530,2.751862,2.710229,39,18,36,21


In [6]:
#split the dataset with random split (80-20) with stratification
from sklearn.model_selection import train_test_split

# The tester identifier is the first underscore-separated token of file_key
# (e.g. "T1" in "T1_S1_TRY1_VB_SL_LIT").
dataset['person_id'] = dataset['file_key'].map(lambda key: key.split('_')[0])

# Features: every column from f0 through f82 (label-based slice, both ends included).
feature_columns = slice('f0', 'f82')
X = dataset.loc[:, feature_columns]

# Target: the tester identifier, i.e. the class to recognise.
y = dataset['person_id']

# 80/20 split, stratified on y so that both parts keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)


In [10]:
# Pipeline using Gaussian naive Bayes as classifier and a stratified random split (80-20)

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

# Mean-impute missing values, scale, then classify with Gaussian naive Bayes.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('nb', GaussianNB()),
])

# The only searched setting is which scaler to use.
param_grid = dict(scaler=[MinMaxScaler(), StandardScaler()])

# 5-fold cross-validated search scored by accuracy (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))



Best params: {'scaler': MinMaxScaler()}
Best cross-validation accuracy: 0.6131156474608224
Best scaler: MinMaxScaler()
Train set accuracy: 0.6972327726532827
Test set accuracy: 0.6442516268980477


In [14]:
#Pipeline using knn as classifier and random split(80-20) with stratification

from sklearn.neighbors import KNeighborsClassifier

# Mean-impute missing values, scale, then classify with k-nearest neighbours.
pipeline = Pipeline([
    # Replace each NaN with the mean of its column
    ('imputer', SimpleImputer(strategy='mean')),
    # Placeholder only: the grid search swaps in each scaler candidate
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    # Scaler candidates
    'scaler': [MinMaxScaler(), StandardScaler()],
    # Number of neighbours that take part in the vote
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    # Weight of each neighbour in the vote:
    #   'uniform'  = every neighbour counts the same
    #   'distance' = neighbours are weighted by the inverse of their distance
    'knn__weights': ['uniform', 'distance'],
    # Distance metric. 'minkowski' is intentionally not listed: p is left at its
    # default of 2 here, which makes it the same distance as 'euclidean', so
    # keeping both only repeated identical fits.
    'knn__metric': ['euclidean', 'manhattan'],
}

# 5-fold cross-validated grid search scored by accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_scaler = grid_search.best_estimator_.named_steps['scaler']
print("Best scaler:", best_scaler)

# best_estimator_ is the pipeline refitted on the full training set
print("Train set accuracy:", grid_search.best_estimator_.score(X_train, y_train))
print("Test set accuracy:", grid_search.best_estimator_.score(X_test, y_test))

Best params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'scaler': StandardScaler()}
Best cross-validation accuracy: 0.6988232001885235
Best scaler: StandardScaler()
Train set accuracy: 1.0
Test set accuracy: 0.7266811279826464


In [None]:
#Pipeline using logistic regression as classifier and random split(80-20) with stratification

from sklearn.linear_model import LogisticRegression

# Mean-impute missing values, scale, then classify with logistic regression.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # Placeholder only: the grid search swaps in each scaler candidate
    ('scaler', MinMaxScaler()),
    ('logreg', LogisticRegression(max_iter = 1000, random_state = 0))
])

scaler_candidates = [MinMaxScaler(), StandardScaler()]
c_candidates = [0.001, 0.01, 0.1, 1, 10, 100]

# The grid is a list of sub-grids so that only valid penalty/solver pairs are
# tried: 'lbfgs' and 'newton-cg' do not support penalty='l1', so crossing every
# penalty with every solver produced candidates that could only fail to fit.
# All previously valid combinations are still searched.
# NOTE(review): recent scikit-learn releases reportedly deprecate 'liblinear'
# for multiclass targets - confirm against the installed version.
param_grid = [
    {
        'scaler': scaler_candidates,
        'logreg__penalty': ['l1'],
        'logreg__C': c_candidates,
        'logreg__solver': ['liblinear', 'saga'],
    },
    {
        'scaler': scaler_candidates,
        'logreg__penalty': ['l2'],
        'logreg__C': c_candidates,
        'logreg__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
    },
]

# 5-fold cross-validated grid search scored by accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_scaler = grid_search.best_estimator_.named_steps['scaler']
print("Best scaler:", best_scaler)

# best_estimator_ is the pipeline refitted on the full training set
print("Train set accuracy:", grid_search.best_estimator_.score(X_train, y_train))
print("Test set accuracy:", grid_search.best_estimator_.score(X_test, y_test))


Best params: {'logreg__C': 1, 'scaler': StandardScaler()}
Best cross-validation accuracy: 0.7096912925650997
Best scaler: StandardScaler()
Train set accuracy: 0.9186109603906674
Test set accuracy: 0.7245119305856833


In [34]:
#Pipeline using NuSVC as classifier and random split(80-20) with stratification

from sklearn.svm import NuSVC

# Mean-impute missing values, scale, then classify with NuSVC.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('svc', NuSVC()),
])

# Searched settings: scaler, nu, kernel and gamma.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    svc__nu=[0.25, 0.5, 0.75],
    svc__kernel=['rbf', 'poly', 'sigmoid'],
    svc__gamma=['scale', 'auto'],
)

# 5-fold cross-validated search scored by accuracy (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'scaler': StandardScaler(), 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__nu': 0.25}
Best cross-validation accuracy: 0.7596117591610698
Best scaler: StandardScaler()
Train set accuracy: 0.9744981009224091
Test set accuracy: 0.7787418655097614


In [31]:
#Pipeline using random forest as classifier and random split(80-20) with stratification

from sklearn.ensemble import RandomForestClassifier

# Mean-impute missing values, scale, then classify with a random forest
# (seeded so repeated runs build the same forest).
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state = 0)),
])

param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    # Number of trees in the forest
    rf__n_estimators=[20, 30, 50, 100, 200],
    # Features considered at each split: 'sqrt' means sqrt(n_features)
    rf__max_features=['sqrt'],
    # Maximum depth of each tree
    rf__max_depth=[5, 10, 20, 30],
)

# 5-fold cross-validated search scored by accuracy (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'rf__max_depth': 20, 'rf__max_features': 'sqrt', 'rf__n_estimators': 200, 'scaler': StandardScaler()}
Best cross-validation accuracy: 0.7721088134794393
Best scaler: StandardScaler()
Train set accuracy: 1.0
Test set accuracy: 0.8047722342733189


In [33]:
#Pipeline using SVC as classifier and random split(80-20) with stratification

from sklearn.svm import SVC

# Mean-impute missing values, scale, then classify with an SVC.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('svc', SVC()),
])

# Searched settings: scaler, C, gamma (both on a log-spaced range) and kernel.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    svc__C=[0.001, 0.01, 0.1, 1, 10, 100],
    svc__gamma=[0.001, 0.01, 0.1, 1, 10, 100],
    svc__kernel=['rbf', 'poly'],
)

# 5-fold cross-validated search scored by accuracy (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'scaler': StandardScaler(), 'svc__C': 100, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Best cross-validation accuracy: 0.7813111228938376
Best scaler: StandardScaler()
Train set accuracy: 1.0
Test set accuracy: 0.7830802603036876


In [None]:
#Pipeline using MLP as classifier and random split(80-20) with stratification

from sklearn.neural_network import MLPClassifier

# Mean-impute missing values, scale, then classify with a multi-layer perceptron
# (seeded, up to 2000 training iterations).
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('mlp', MLPClassifier(max_iter=2000, random_state = 0)),
])

# Searched settings: scaler, layer layout, activation, L2 term (alpha) and
# initial learning rate; the solver is fixed to adam.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    mlp__hidden_layer_sizes=[(100,), (100, 50), (150, 100, 50)],
    mlp__activation=['tanh', 'relu'],
    mlp__alpha=[0.0001, 0.001, 0.01],
    mlp__learning_rate_init=[0.001, 0.01],
    mlp__solver=['adam'],
)

# 5-fold cross-validated search scored by accuracy (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))




In [36]:
#Split the dataset into train and test set.
#This time I used S1 and S2 as train and s3 as test.

# file_key looks like "T1_S1_TRY1_VB_SL_LIT": token 0 is the tester, token 1 the session.
dataset['tester_id'] = dataset['file_key'].map(lambda key: key.split('_')[0])
dataset['session_id'] = dataset['file_key'].map(lambda key: key.split('_')[1])

# Boolean row masks: sessions S1 and S2 form the training part, S3 the test part.
in_train_sessions = dataset['session_id'].isin(['S1', 'S2'])
in_test_session = dataset['session_id'].eq('S3')
train_subset = dataset.loc[in_train_sessions]
test_subset = dataset.loc[in_test_session]

# Features are the columns f0 through f82 (label slice, both ends included);
# the target is the tester identifier.
feature_columns = slice('f0', 'f82')

X_train = train_subset.loc[:, feature_columns]
X_test = test_subset.loc[:, feature_columns]

y_train = train_subset['tester_id']
y_test = test_subset['tester_id']


In [37]:
# Pipeline using Gaussian naive Bayes as classifier, with S1 and S2 as train set and S3 as test set

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV

# Mean-impute missing values, scale, then classify with Gaussian naive Bayes.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('nb', GaussianNB()),
])

# The only searched setting is which scaler to use.
param_grid = dict(scaler=[MinMaxScaler(), StandardScaler()])

# 5-fold cross-validated search on the training rows, scored by accuracy
# (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))

Best params: {'scaler': MinMaxScaler()}
Best cross-validation accuracy: 0.5677503278480478
Best scaler: MinMaxScaler()
Train set accuracy: 0.7037760416666666
Test set accuracy: 0.5651041666666666


In [38]:
#Pipeline using KNN as classifier and S1, S2 as train and s3 as test set

from sklearn.neighbors import KNeighborsClassifier

# Mean-impute missing values, scale, then classify with k-nearest neighbours.
pipeline = Pipeline([
    # Replace each NaN with the mean of its column
    ('imputer', SimpleImputer(strategy='mean')),
    # Placeholder only: the grid search swaps in each scaler candidate
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    # Scaler candidates
    'scaler': [MinMaxScaler(), StandardScaler()],
    # Number of neighbours that take part in the vote
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    # Weight of each neighbour in the vote:
    #   'uniform'  = every neighbour counts the same
    #   'distance' = neighbours are weighted by the inverse of their distance
    'knn__weights': ['uniform', 'distance'],
    # Distance metric. 'minkowski' is intentionally not listed: p is left at its
    # default of 2 here, which makes it the same distance as 'euclidean', so
    # keeping both only repeated identical fits.
    'knn__metric': ['euclidean', 'manhattan'],
}

# 5-fold cross-validated grid search on the training rows, scored by accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_scaler = grid_search.best_estimator_.named_steps['scaler']
print("Best scaler:", best_scaler)

# best_estimator_ is the pipeline refitted on the full training set
print("Train set accuracy:", grid_search.best_estimator_.score(X_train, y_train))
print("Test set accuracy:", grid_search.best_estimator_.score(X_test, y_test))


Best params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__weights': 'distance', 'scaler': StandardScaler()}
Best cross-validation accuracy: 0.6843182875756165
Best scaler: StandardScaler()
Train set accuracy: 1.0
Test set accuracy: 0.63671875


In [39]:
#Pipeline using Logistic Regression as classifier and S1, S2 as train and s3 as test set

from sklearn.linear_model import LogisticRegression

# Mean-impute missing values, scale, then classify with logistic regression
# (seeded, up to 1000 solver iterations).
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('logreg', LogisticRegression(max_iter = 1000, random_state = 0)),
])

# Searched settings: scaler and C on a log-spaced range.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    logreg__C=[0.001, 0.01, 0.1, 1, 10, 100],
)

# 5-fold cross-validated search on the training rows, scored by accuracy
# (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'logreg__C': 1, 'scaler': StandardScaler()}
Best cross-validation accuracy: 0.669315537882313
Best scaler: StandardScaler()
Train set accuracy: 0.943359375
Test set accuracy: 0.6966145833333334


In [40]:
# Pipeline using NuSVC as classifier, with S1 and S2 as train set and S3 as test set

from sklearn.svm import NuSVC

# Mean-impute missing values, scale, then classify with NuSVC.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('svc', NuSVC()),
])

# Searched settings: scaler, nu, kernel and gamma.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    svc__nu=[0.25, 0.5, 0.75],
    svc__kernel=['rbf', 'poly', 'sigmoid'],
    svc__gamma=['scale', 'auto'],
)

# 5-fold cross-validated search on the training rows, scored by accuracy
# (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'scaler': StandardScaler(), 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__nu': 0.25}
Best cross-validation accuracy: 0.7253119844325056
Best scaler: StandardScaler()
Train set accuracy: 0.9837239583333334
Test set accuracy: 0.6783854166666666


In [42]:
#Pipeline using Random Forest as classifier and S1, S2 as train and s3 as test set

from sklearn.ensemble import RandomForestClassifier

# Mean-impute missing values, scale, then classify with a random forest
# (seeded so repeated runs build the same forest).
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state = 0)),
])

param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    # Number of trees in the forest
    rf__n_estimators=[20, 30, 50, 100, 200],
    # Features considered at each split: 'sqrt' means sqrt(n_features)
    rf__max_features=['sqrt'],
    # Maximum depth of each tree
    rf__max_depth=[5, 10, 20, 30],
)

# 5-fold cross-validated search on the training rows, scored by accuracy
# (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'rf__max_depth': 30, 'rf__max_features': 'sqrt', 'rf__n_estimators': 200, 'scaler': MinMaxScaler()}
Best cross-validation accuracy: 0.7318372181564363
Best scaler: MinMaxScaler()
Train set accuracy: 1.0
Test set accuracy: 0.6614583333333334


In [41]:
#Pipeline using SVC as classifier and S1, S2 as train and s3 as test set

from sklearn.svm import SVC

# Mean-impute missing values, scale, then classify with an SVC.
# The scaler given here is only a placeholder: the grid below replaces it.
pipeline = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('svc', SVC()),
])

# Searched settings: scaler, C, gamma (both on a log-spaced range) and kernel.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    svc__C=[0.001, 0.01, 0.1, 1, 10, 100],
    svc__gamma=[0.001, 0.01, 0.1, 1, 10, 100],
    svc__kernel=['rbf', 'poly'],
)

# 5-fold cross-validated search on the training rows, scored by accuracy
# (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))


Best params: {'scaler': StandardScaler(), 'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Best cross-validation accuracy: 0.7324802233597022
Best scaler: StandardScaler()
Train set accuracy: 0.9986979166666666
Test set accuracy: 0.671875


In [None]:
#Pipeline using MLP as classifier and S1, S2 as train and s3 as test set

from sklearn.neural_network import MLPClassifier

# Mean-impute missing values, scale, then classify with a multi-layer perceptron.
# The MLP is seeded and uses early stopping: 10% of the rows it is fitted on are
# held out for validation and training stops after 10 iterations without
# improvement (or after 500 iterations at most).
mlp_classifier = MLPClassifier(
    max_iter=500,
    early_stopping=True,
    n_iter_no_change=10,
    validation_fraction=0.1,
    random_state=0,
)
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('mlp', mlp_classifier),
])

# Searched settings: scaler, layer layout, L2 term (alpha) and initial learning
# rate; activation and solver are fixed to a single value each.
param_grid = dict(
    scaler=[MinMaxScaler(), StandardScaler()],
    mlp__hidden_layer_sizes=[(100,), (100, 50)],
    mlp__activation=['relu'],
    mlp__alpha=[0.0001, 0.001],
    mlp__learning_rate_init=[0.001, 0.005],
    mlp__solver=['adam'],
)

# 5-fold cross-validated search scored by accuracy, with per-fit logging
# (verbose=3) and all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning settings.
best_model = grid_search.best_estimator_
best_scaler = best_model.named_steps['scaler']

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
print("Best scaler:", best_scaler)
print("Train set accuracy:", best_model.score(X_train, y_train))
print("Test set accuracy:", best_model.score(X_test, y_test))