In [1]:
import sys
import pickle
import pandas as pd
import numpy as np
path = 'C:/Users/vincent/Desktop/Udacity/Data analyst/P5 - Identify Fraud from Enron Email/ud120-projects-master/'
sys.path.append(path + "tools/")
from feature_format import featureFormat, targetFeatureSplit
sys.path.append(path + "tools/")

sys.path.append(path + "final_project/")
from tester import dump_classifier_and_data, test_classifier


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.pipeline import Pipeline



In [2]:
# load dataset
with open(path + "final_project/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
dataDF = pd.DataFrame(data_dict).transpose()
dataDF.drop(['LOCKHART EUGENE E', 'TOTAL', 'THE TRAVEL AGENCY IN THE PARK'], inplace=True)
del dataDF['email_address']

# add new feature
dataDF.replace('NaN', np.nan, inplace=True)
dataDF['BtoN_ratio'] = dataDF['bonus'] / dataDF['salary']
dataDF.replace(np.nan, 'NaN', inplace=True)

temp = list(dataDF.columns.values)
try:
    temp.remove('poi')
except:
    pass
temp.insert(0, 'poi')
features_list = temp

my_dataset = dataDF.transpose().to_dict()
data = featureFormat(my_dataset, features_list, sort_keys = True)
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
labels, features = targetFeatureSplit(data)

In [3]:
def clfResult(featurs, labels, clf):
    print 'result'
    print '--------'
    pred = clf.predict(featurs)
    print 'best parameters: ', clf.best_params_
    print clf.best_score_
    print clf.best_estimator_
    print 'accuracy: ', accuracy_score(labels, pred)
    print 'recall: ', recall_score(labels, pred)
    print 'precision: ', precision_score(labels, pred)
    print 'f1_score: ', f1_score(labels, pred)
    print classification_report(labels, pred)
    print '--------\n'

### Classifiers without using SKB and PCA, and use GridSearchCV to tune its parameters

In [14]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=41)

clf = GridSearchCV(GaussianNB(), n_jobs=-1, scoring= 'f1', cv = cv, param_grid= {})
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {}
0.224263233499
GaussianNB(priors=None)
accuracy:  0.398601398601
recall:  1.0
precision:  0.173076923077
f1_score:  0.295081967213
             precision    recall  f1-score   support

        0.0       1.00      0.31      0.48       125
        1.0       0.17      1.00      0.30        18

avg / total       0.90      0.40      0.45       143

--------



In [16]:
pred = clf.predict(features)
print pred
print classification_report(labels, pred)

[ 0.  1.  0.  1.  0.  0.  1.  1.  0.  1.  1.  0.  1.  1.  0.  1.  1.  1.
  1.  1.  1.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.
  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  0.  1.
  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  0.
  0.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  1.  1.  1.
  1.  0.  1.  0.  0.  1.  1.  1.  1.  0.  1.  1.  1.  0.  1.  0.  1.  0.
  0.  1.  0.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.]
             precision    recall  f1-score   support

        0.0       1.00      0.31      0.48       125
        1.0       0.17      1.00      0.30        18

avg / total       0.90      0.40      0.45       143



In [14]:
clf = GaussianNB()
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
accuracy:  0.398601398601
recall:  1.0
precision:  0.173076923077
f1_score:  0.295081967213
             precision    recall  f1-score   support

        0.0       1.00      0.31      0.48       125
        1.0       0.17      1.00      0.30        18

avg / total       0.90      0.40      0.45       143

--------



In [4]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
              'C':range(7,15,1), 
              'tol': [x*0.1 for x in range(1, 20)]}

clf = GridSearchCV(svm.SVC(), n_jobs= -1, scoring= 'f1', cv = cv, param_grid= parameters)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'kernel': 'linear', 'C': 11, 'tol': 1.8}
0.227380952381
SVC(C=11, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1.8, verbose=False)
accuracy:  0.916083916084
recall:  0.388888888889
precision:  0.875
f1_score:  0.538461538462
--------



In [None]:
svmClf = svm.SVC(kernel= 'linear', C= 11, tol= 1.8)
print svmClf
test_classifier(svmClf, my_dataset, features_list, folds = 10)


SVC(C=11, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1.8, verbose=False)


In [7]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'criterion': ('gini', 'entropy'), 
              'class_weight': ('balanced', None), 
              'min_samples_leaf': range(1,13)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid= parameters, n_jobs= -1, scoring= 'precision', cv= cv)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'min_samples_leaf': 5, 'criterion': 'gini', 'class_weight': None}
0.279090909091
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
accuracy:  0.916083916084
recall:  0.555555555556
precision:  0.714285714286
f1_score:  0.625
--------



In [47]:
cv = StratifiedShuffleSplit(n_splits= 10, test_size=0.3, random_state=42)

parameters = {'C':[10**x for x in range(0,7,1)], 
              'tol':[10**x for x in range(-130,-120,1)],
              'class_weight': ('balanced', None)}

clf = GridSearchCV(LogisticRegression(), param_grid= parameters, n_jobs= -1, scoring= 'f1', cv = cv)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'C': 1000000, 'tol': 1e-130, 'class_weight': 'balanced'}
0.331993430546
LogisticRegression(C=1000000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-130, verbose=0, warm_start=False)
accuracy:  0.909090909091
recall:  0.944444444444
precision:  0.586206896552
f1_score:  0.723404255319
             precision    recall  f1-score   support

        0.0       0.99      0.90      0.95       125
        1.0       0.59      0.94      0.72        18

avg / total       0.94      0.91      0.92       143

--------



In [48]:
Log = LogisticRegression.set_params(clf.best_estimator_)
test_classifier(Log, my_dataset, features_list, folds = 1000)

LogisticRegression(C=1000000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-130, verbose=0, warm_start=False)
	Accuracy: 0.62127	Precision: 0.18349	Recall: 0.53350	F1: 0.27306	F2: 0.38617
	Total predictions: 15000	True positives: 1067	False positives: 4748	False negatives:  933	True negatives: 8252



In [32]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'criterion': ('gini', 'entropy'), 
              'class_weight': ('balanced', None), 
              'min_samples_leaf': range(1,11)}

clf = GridSearchCV(RandomForestClassifier(), param_grid= parameters, n_jobs= -1, scoring= 'f1', cv = cv)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'min_samples_leaf': 5, 'criterion': 'gini', 'class_weight': 'balanced'}
0.343611111111
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
accuracy:  0.937062937063
recall:  0.833333333333
precision:  0.714285714286
f1_score:  0.769230769231
--------



In [26]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'n_estimators': range(40, 100, 10), 
              'learning_rate': [x*0.1 for x in range(1, 21)],
              'algorithm' : ('SAMME', 'SAMME.R')}

clf = GridSearchCV(AdaBoostClassifier(), param_grid= parameters, n_jobs= -1, scoring= 'precision', cv = cv)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'n_estimators': 60, 'learning_rate': 0.6000000000000001, 'algorithm': 'SAMME.R'}
0.386904761905
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.6, n_estimators=60, random_state=None)
accuracy:  1.0
recall:  1.0
precision:  1.0
f1_score:  1.0
--------



In [25]:
temp = {'importances' : clf.best_estimator_.feature_importances_}
temp = pd.DataFrame(data= temp, index= features_list[1:])
temp.sort_values(by= 'importances', ascending= False)
pd.DataFrame(data= temp, index= features_list[1:]).sort_values(by= 'importances', ascending= False)

Unnamed: 0,importances
other,0.191295
expenses,0.138526
shared_receipt_with_poi,0.11223
BtoN_ratio,0.092154
restricted_stock,0.063514
total_stock_value,0.06237
from_poi_to_this_person,0.053273
bonus,0.051287
deferred_income,0.046471
salary,0.044054


### Classifiers with PCA, and use GridSearchCV and Pipeline to tune its parameters

In [27]:
# for logistic regression, in this case, C is between 100 to 10000, tol is close to zero,
# the more pca__n_components the better
parameters = {'Logistic__C':[10**x for x in range(0,7,1)], 
              'Logistic__tol':[10**x for x in range(-120,-100,1)],
              'Logistic__class_weight': ('balanced', None),
              'pca__n_components': range(9, 15)}
pca = PCA()
log = LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('Logistic', log)])
clf = GridSearchCV(pipe, param_grid= parameters, scoring= 'f1', cv = cv, n_jobs= -1)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'Logistic__C': 1000, 'Logistic__tol': 1e-120, 'pca__n_components': 14, 'Logistic__class_weight': 'balanced'}
0.336612627139
Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=14, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('Logistic', LogisticRegression(C=1000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-120, verbose=0, warm_start=False))])
accuracy:  0.853146853147
recall:  0.888888888889
precision:  0.457142857143
f1_score:  0.603773584906
--------



In [20]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)


parameters = {'pca__n_components': range(2, 10),
              'RandomForest__criterion': ('gini', 'entropy'), 
              'RandomForest__class_weight': ('balanced', None), 
              'RandomForest__min_samples_leaf': range(1,11)}
pca = PCA()
ran = RandomForestClassifier()
pipe = Pipeline(steps=[('pca', pca), ('RandomForest', ran)])
clf = GridSearchCV(pipe, param_grid= parameters, scoring= 'precision', cv = cv, n_jobs= -1)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'RandomForest__class_weight': 'balanced', 'RandomForest__criterion': 'entropy', 'pca__n_components': 5, 'RandomForest__min_samples_leaf': 5}
0.331465201465
Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('RandomForest', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
accuracy:  0.895104895105
recall:  0.888888888889
precision:  0.551724137931
f1_score:  0.68085106383
             precision    recall  f1-score   support

        0.0       0.98      0.90      0.94       125
        1.0       0.55      0.89      0.68        18

avg / total       0.93      0.90      0.90       143

--------



In [21]:
pca = PCA(n_components=9)
ran = RandomForestClassifier(class_weight= None, criterion = 'gini', min_samples_leaf= 1)
clf = Pipeline(steps=[('pca', pca), ('RandomForest', ran)])

test_classifier(clf, my_dataset, features_list, folds = 1000)


Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('RandomForest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
   ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])
	Accuracy: 0.85133	Precision: 0.35065	Recall: 0.13500	F1: 0.19495	F2: 0.15393
	Total predictions: 15000	True positives:  270	False positives:  500	False negatives: 1730	True negatives: 12500



### Classifiers SKB, and use GridSearchCV and Pipeline to tune its parameters

In [49]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'Logistic__C':[10**x for x in range(0,9,1)], 
              'Logistic__tol':[10**x for x in range(-130,-120,1)],
              'Logistic__class_weight': ('balanced', None),
              'skb__k': range(6, 15)}
skb = SelectKBest(f_classif)
log = LogisticRegression()
pipe = Pipeline(steps=[('skb', skb), ('Logistic', log)])
clf = GridSearchCV(pipe, param_grid= parameters, scoring= 'f1', cv = cv, n_jobs= -1)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'Logistic__C': 1000000, 'Logistic__tol': 1e-130, 'Logistic__class_weight': 'balanced', 'skb__k': 11}
0.328049289891
Pipeline(steps=[('skb', SelectKBest(k=11, score_func=<function f_classif at 0x000000000A9940B8>)), ('Logistic', LogisticRegression(C=1000000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-130, verbose=0, warm_start=False))])
accuracy:  0.699300699301
recall:  0.888888888889
precision:  0.280701754386
f1_score:  0.426666666667
             precision    recall  f1-score   support

        0.0       0.98      0.67      0.80       125
        1.0       0.28      0.89      0.43        18

avg / total       0.89      0.70      0.75       143

--------



In [59]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

parameters = {'Ada__n_estimators': range(40, 110, 10), 
              'Ada__learning_rate': [x*0.1 for x in range(1, 21)],
              'Ada__algorithm' : ('SAMME', 'SAMME.R'),
              'skb__k': range(14, 19)}
skb = SelectKBest(f_classif)
ada = AdaBoostClassifier()
pipe = Pipeline(steps=[('skb', skb), ('Ada', ada)])

clf = GridSearchCV(pipe, param_grid= parameters, n_jobs= -1, scoring= 'f1', cv = cv)
clf.fit(features, labels)
clfResult(features, labels, clf)

# print out best k features (you can also rifit skb, I tried couple of times, had the same result)
skb_indices = clf.best_estimator_.named_steps['skb'].get_support(indices = True)
skb_score =clf.best_estimator_.named_steps['skb'].scores_

features_selected=[features_list[i+1] for i in skb_indices]
features_score = [skb_score[i] for i in skb_indices]
tempS = pd.DataFrame(data={'SKB_score': features_score}, index= features_selected).sort_values('SKB_score', ascending = False)
print tempS

temp = {'Ada_importances' :  clf.best_estimator_.named_steps['Ada'].feature_importances_}
tempR = pd.DataFrame(data= temp, index= features_selected).sort_values('Ada_importances', ascending = False)

print tempR

pd.concat([tempS, tempR], axis=1)

result
--------
best parameters:  {'Ada__algorithm': 'SAMME.R', 'Ada__learning_rate': 0.6000000000000001, 'Ada__n_estimators': 60, 'skb__k': 16}
0.320411255411
Pipeline(steps=[('skb', SelectKBest(k=16, score_func=<function f_classif at 0x000000000AF33F98>)), ('Ada', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.6, n_estimators=60, random_state=None))])
accuracy:  1.0
recall:  1.0
precision:  1.0
f1_score:  1.0
--------

                         SKB_score
exercised_stock_options  24.815080
total_stock_value        24.182899
bonus                    20.792252
salary                   18.289684
deferred_income          11.458477
BtoN_ratio               10.783585
long_term_incentive       9.922186
restricted_stock          9.212811
total_payments            8.772778
shared_receipt_with_poi   8.589421
loan_advances             7.184056
expenses                  6.094173
from_poi_to_this_person   5.243450
other                     4.187478
from_this_

Unnamed: 0,SKB_score,Ada_importances
BtoN_ratio,10.783585,0.033333
bonus,20.792252,0.05
deferred_income,11.458477,0.083333
director_fees,2.126328,0.0
exercised_stock_options,24.81508,0.1
expenses,6.094173,0.166667
from_poi_to_this_person,5.24345,0.016667
from_this_person_to_poi,2.382612,0.133333
loan_advances,7.184056,0.0
long_term_incentive,9.922186,0.0


In [60]:

ada= AdaBoostClassifier.set_params(clf.best_estimator_.named_steps['Ada'])

tempList = features_selected[:]
tempList.insert(0, 'poi')



test_classifier(ada, my_dataset, tempList, folds = 100)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.6, n_estimators=60, random_state=None)
	Accuracy: 0.86333	Precision: 0.48062	Recall: 0.31000	F1: 0.37690	F2: 0.33369
	Total predictions: 1500	True positives:   62	False positives:   67	False negatives:  138	True negatives: 1233



In [17]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)


parameters = {'skb__k': range(6, 15),
              'RandomForest__criterion': ('gini', 'entropy'), 
              'RandomForest__class_weight': ('balanced', None), 
              'RandomForest__min_samples_leaf': range(1,11)}
skb = SelectKBest(f_classif)
ran = RandomForestClassifier()
pipe = Pipeline(steps=[('skb', skb), ('RandomForest', ran)])
clf = GridSearchCV(pipe, param_grid= parameters, scoring= 'precision', cv = cv, n_jobs= -1)
clf.fit(features, labels)
clfResult(features, labels, clf)

# print out best k features (you can also rifit skb, I tried couple of times, had the same result)
skb_indices = clf.best_estimator_.named_steps['skb'].get_support(indices = True)
skb_score =clf.best_estimator_.named_steps['skb'].scores_

features_selected=[features_list[i+1] for i in skb_indices]
features_score = [skb_score[i] for i in skb_indices]
tempS = pd.DataFrame(data={'SKB_score': features_score}, index= features_selected).sort_values('SKB_score', ascending = False)
print tempS

temp = {'RF_importances' :  clf.best_estimator_.named_steps['RandomForest'].feature_importances_}
tempR = pd.DataFrame(data= temp, index= features_selected).sort_values('RF_importances', ascending = False)

print tempR

pd.concat([tempS, tempR], axis=1)

result
--------
best parameters:  {'RandomForest__class_weight': 'balanced', 'RandomForest__criterion': 'entropy', 'skb__k': 8, 'RandomForest__min_samples_leaf': 2}
0.5
Pipeline(steps=[('skb', SelectKBest(k=8, score_func=<function f_classif at 0x000000000A9940B8>)), ('RandomForest', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
 ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
accuracy:  0.965034965035
recall:  0.777777777778
precision:  0.933333333333
f1_score:  0.848484848485
             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98       125
        1.0       0.93      0.78      0.85        18

avg / total       0.96      0.97      0.96       143

--------

                         SKB_score
exercised_stock_options  24.815080
total_stock_value  

Unnamed: 0,SKB_score,RF_importances
BtoN_ratio,10.783585,0.167732
bonus,20.792252,0.161801
deferred_income,11.458477,0.065906
exercised_stock_options,24.81508,0.192909
long_term_incentive,9.922186,0.065431
restricted_stock,9.212811,0.084377
salary,18.289684,0.149826
total_stock_value,24.182899,0.112019


### Classifiers SKB and PCA, and use GridSearchCV and Pipeline to tune its parameters

In [32]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)


parameters = {'skb__k': range(6, 12),
              'pca__n_components': range(2, 5),
              'RandomForest__criterion': ('gini', 'entropy'), 
              'RandomForest__class_weight': ('balanced', None), 
              'RandomForest__min_samples_leaf': range(1,11)}
pca = PCA()
skb = SelectKBest(f_classif)
ran = RandomForestClassifier()
pipe = Pipeline(steps=[('skb', skb), ('pca', pca), ('RandomForest', ran)])
clf = GridSearchCV(pipe, param_grid= parameters, scoring= 'precision', cv = cv, n_jobs= -1)
clf.fit(features, labels)
clfResult(features, labels, clf)

result
--------
best parameters:  {'RandomForest__class_weight': None, 'RandomForest__criterion': 'gini', 'skb__k': 7, 'pca__n_components': 3, 'RandomForest__min_samples_leaf': 1}
0.425
Pipeline(steps=[('skb', SelectKBest(k=7, score_func=<function f_classif at 0x000000000AC81438>)), ('pca', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('RandomForest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])
accuracy:  1.0
recall:  1.0
precision:  1.0
f1_score:  1.0
--------

