# Creating the Dataset for ICO fraud detection

## 1. File with information about ICO

In [6]:
import pandas as pd
import os
import re

In [7]:
# Load the ICO metadata table (tab-separated). The preview below shows the
# columns: ico, adress, fraud, date_start_tr, date_start_market, diff_days, market_time.
df_info = pd.read_csv('../data_and_models/ico_info_table_2020-08-01.tsv', sep='\t')

In [8]:
# Preview the first rows of the ICO metadata table.
df_info.head()

Unnamed: 0,ico,adress,fraud,date_start_tr,date_start_market,diff_days,market_time
0,AMPLEFORTH,0xd46ba6d942050d489dbd938a2c909a5d5039a161,0,14/06/2019,27/06/2019,13,401
1,BAND,0xba11d00c5f74255f56a5e366f4f77f5a186d7f55,0,09/09/2019,19/09/2019,10,317
2,BANKEX,0x45245bc59219eeaaf6cd3f382e078a461ff9de7b,0,28/11/2017,28/12/2017,30,947
3,CARTESI,0x491604c0fdf08347dd1fa4ee062a822a5dd06b5d,0,20/04/2020,22/05/2020,32,71
4,CELER,0x4f9254c83eb525f9fcf346490bbb3ed28a81c667,0,19/03/2019,25/03/2019,6,495


## 2. Importing the `ICOParser` class

In [9]:
from ico_parser import ICOParser

In [10]:
# Print the ICOParser constructor signature, attributes and public methods.
help(ICOParser)

Help on class ICOParser in module ico_parser:

class ICOParser(builtins.object)
 |  ICOParser(path_to_csv, ico_start_date=None, fraud_flag=None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path_to_csv, ico_start_date=None, fraud_flag=None)
 |      Class for parsing data coming from ICO.
 |      
 |      Args:
 |          path_to_csv (str):
 |          ico_start_date (str, default=None):
 |          fraud_flag (int, default=None):
 |      
 |      Attributes:
 |          fraud_flag (int):
 |          df (pd.DataFrame):
 |          df_resample_day (pd.DataFrame):
 |          df_resample_hour (pd.DataFrame):
 |          ico_start_date (datetime.date):
 |          ico_end_date (datetime.date):
 |  
 |  define_ico_start_date(self)
 |  
 |  filter_df_for_training_days(self)
 |  
 |  filter_df_for_training_hours(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance v

## 3. Creating the dataset

In [6]:
# Directory holding one `<ICO>_T_COMPLETO.csv` transaction file per ICO.
# Expressed relative to the notebook, like the metadata table path, so the
# notebook does not depend on one user's home directory.
path_to_files = '../data_and_models/tokens/all_tokens'

In [7]:
# Peek at the first five entries of the token directory.
os.listdir(path_to_files)[:5]

['STOX_T_COMPLETO.csv',
 'BEZANT_T_COMPLETO.csv',
 'POWERLEDGER_T_COMPLETO.csv',
 'CARTESI_T_COMPLETO.csv',
 'SFCAPITAL_T_COMPLETO.csv']

In [8]:
# Extract the ICO name (the part before `_T`) from a sample file name.
# Raw string so `\w` is a regex character class, not an invalid string escape.
title_search = re.search(r'(\w+)_T', 'STOX_T_COMPLETO.csv', re.IGNORECASE)
title_search.group(1)

'STOX'

### 3.1. Iterating through the files

In [9]:
# Preview the metadata table again before iterating over the token files.
df_info.head()

Unnamed: 0,ico,adress,fraud,date_start_tr,date_start_market,diff_days,market_time
0,AMPLEFORTH,0xd46ba6d942050d489dbd938a2c909a5d5039a161,0,14/06/2019,27/06/2019,13,401
1,BAND,0xba11d00c5f74255f56a5e366f4f77f5a186d7f55,0,09/09/2019,19/09/2019,10,317
2,BANKEX,0x45245bc59219eeaaf6cd3f382e078a461ff9de7b,0,28/11/2017,28/12/2017,30,947
3,CARTESI,0x491604c0fdf08347dd1fa4ee062a822a5dd06b5d,0,20/04/2020,22/05/2020,32,71
4,CELER,0x4f9254c83eb525f9fcf346490bbb3ed28a81c667,0,19/03/2019,25/03/2019,6,495


In [11]:
# Reload the ICO metadata table (same file and separator as the load in section 1).
df_info = pd.read_csv('../data_and_models/ico_info_table_2020-08-01.tsv', sep='\t')

In [20]:
import pytz

# Handle to the UTC timezone object.
# NOTE(review): `utc` is not referenced by any other visible cell — confirm it is still needed.
utc=pytz.UTC
utc

<UTC>

In [29]:
from datetime import datetime, timedelta

# Parse a sample day/month/year string into a datetime (midnight of that day).
date = datetime.strptime('20/07/2020', '%d/%m/%Y')
date

datetime.datetime(2020, 7, 20, 0, 0)

In [27]:
list_row = []

# Raw string so `\w` is a regex character class, not an invalid string escape.
ico_name_pattern = re.compile(r'(\w+)_T_', re.IGNORECASE)

# Build one row per token file: [ico_name, <transaction counts...>, fraud_label].
# NOTE(review): the DataFrame built from `list_row` uses 60 day columns followed by
# `is_fraud`; a series shorter than 60 values puts the label in a day column and
# leaves `is_fraud` as NaN (such rows are filtered out later) — confirm this is intended.
for csv in sorted(os.listdir(path_to_files)):
    name_match = ico_name_pattern.search(csv)
    if name_match is None:
        # Entry is not a `<ICO>_T_...` token file; skip it instead of failing
        # with an AttributeError on `.group`.
        print(f'Skipping {csv}: file name does not match <ICO>_T_')
        continue
    ico_name = name_match.group(1)
    print(ico_name, csv)

    # Look up the market start date and the fraud label for this ICO.
    ico_info_row = df_info.loc[df_info.ico == ico_name]
    if ico_info_row.empty:
        # Without a metadata row there is no start date or label to attach.
        print(f'Skipping {csv}: {ico_name} not found in df_info')
        continue
    first_match = ico_info_row.index[0]
    ico_start_date = ico_info_row.at[first_match, 'date_start_market']
    fraud_label = ico_info_row.at[first_match, 'fraud']
    print(ico_start_date, fraud_label)

    # Parse the transaction file and take the `transactions` column of the
    # frame returned by `filter_df_for_training_days()`.
    path_to_csv = os.path.join(path_to_files, csv)
    ico_parser = ICOParser(path_to_csv, ico_start_date, fraud_label)
    ico_parser.define_ico_start_date()
    list_transactions_ts = ico_parser.filter_df_for_training_days().transactions.to_list()

    list_row.append([ico_name, *list_transactions_ts, fraud_label])
    
    

0XCERT 0XCERT_T_COMPLETO.csv
11/07/2018 1
0X 0X_T_COMPLETO.csv
15/08/2017 0
4NEW 4NEW_T_COMPLETO.csv
20/08/2018 1
ABULABA ABULABA_T_COMPLETO.csv
16/12/2018 1
AERGO AERGO_T_COMPLETO.csv
17/12/2018 0
AIRSWAP AIRSWAP_T_COMPLETO.csv
10/10/2017 0
AKROPOLIS AKROPOLIS_T_COMPLETO.csv
17/09/2019 0
ALLME ALLME_T_COMPLETO.csv
30/05/2018 1
AMPLEFORTH AMPLEFORTH_T_COMPLETO.csv
27/06/2019 0
ARBITRAGE ARBITRAGE_T_COMPLETO.csv
09/03/2018 1
ARCBLOCK ARCBLOCK_T_COMPLETO.csv
24/02/2018 0
BAND BAND_T_COMPLETO.csv
19/09/2019 0
BANKERA BANKERA_T_COMPLETO.csv
04/06/2018 1
BANKEX BANKEX_T_COMPLETO.csv
28/12/2017 0
BELANCE BELANCE_T_COMPLETO.csv
09/11/2018 1
BEZANT BEZANT_T_COMPLETO.csv
10/05/2018 0
BILLIONAIRETOKEN BILLIONAIRETOKEN_T_COMPLETO.csv
21/09/2017 1
BITCOINCROWN BITCOINCROWN_T_COMPLETO.csv
10/06/2018 1
BITCOINMAX BITCOINMAX_T_COMPLETO.csv
06/10/2018 1
BITDEAL BITDEAL_T_COMPLETO.csv
11/10/2018 1
BITDEPOSITARY BITDEPOSITARY_T_COMPLETO.csv
18/10/2018 1
BLOCKMALL BLOCKMALL_T_COMPLETO.csv
01/06/2018 1
BL

## 4. Plotting the dataset

In [32]:
import plotly.graph_objects as go

In [30]:
# Column layout: ICO name, one column per day (0..59), then the fraud label.
header = ['ico', *range(60), 'is_fraud']

# One row per ICO, built from the rows collected while iterating over the files.
df_dataset = pd.DataFrame(list_row, columns=header)
df_dataset

Unnamed: 0,ico,0,1,2,3,4,5,6,7,8,...,51,52,53,54,55,56,57,58,59,is_fraud
0,0XCERT,1053.0,646.0,158.0,93.0,57.0,247.0,114.0,125.0,48.0,...,9.0,14.0,4.0,16.0,26.0,13.0,18.0,8.0,11.0,1.0
1,0X,22670.0,21019.0,9884.0,9497.0,14077.0,4066.0,3265.0,2618.0,3187.0,...,585.0,729.0,783.0,1174.0,953.0,1139.0,882.0,934.0,1144.0,0.0
2,4NEW,5417.0,3336.0,377.0,376.0,206.0,126.0,93.0,178.0,217.0,...,161.0,93.0,78.0,74.0,39.0,55.0,87.0,167.0,116.0,1.0
3,ABULABA,342.0,25.0,11.0,29.0,9.0,8.0,9.0,5.0,31.0,...,7.0,5.0,6.0,14.0,31.0,18.0,20.0,28.0,22.0,1.0
4,AERGO,657.0,576.0,2250.0,650.0,267.0,225.0,144.0,89.0,83.0,...,22.0,71.0,38.0,23.0,19.0,15.0,21.0,30.0,69.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,UMA,567.0,254.0,126.0,72.0,16.0,45.0,20.0,19.0,20.0,...,212.0,142.0,188.0,354.0,261.0,332.0,299.0,272.0,231.0,0.0
146,UTRUST,3820.0,4002.0,2034.0,1734.0,1379.0,1324.0,1786.0,943.0,1075.0,...,262.0,208.0,941.0,4155.0,817.0,338.0,296.0,292.0,241.0,0.0
147,WALTONCHAIN,576.0,10.0,15.0,68.0,113.0,91.0,49.0,0.0,27.0,...,917.0,423.0,340.0,303.0,224.0,174.0,202.0,256.0,194.0,1.0
148,ZPAY,3956.0,9.0,505.0,9.0,273.0,84.0,53.0,20.0,15.0,...,11.0,3.0,2.0,0.0,2.0,1.0,3.0,1.0,1.0,1.0


In [36]:
# Drop the label column so only the ICO name and the day columns remain,
# then display the transpose (one column per ICO).
df_plot = df_dataset.drop(columns='is_fraud')
df_plot.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
ico,0XCERT,0X,4NEW,ABULABA,AERGO,AIRSWAP,AKROPOLIS,ALLME,AMPLEFORTH,ARBITRAGE,...,TATATU,TAYLOR,TIERION,TIMENEWBANK,ULTRA,UMA,UTRUST,WALTONCHAIN,ZPAY,ZYNECOIN
0,1053,22670,5417,342,657,7612,1428,550,735,647,...,5125,783,4724,6003,691,567,3820,576,3956,147
1,646,21019,3336,25,576,2596,281,0,322,37,...,2333,218,2142,1264,36,254,4002,10,9,129
2,158,9884,377,11,2250,50,98,1,85,327,...,537,102,2011,685,12,126,2034,15,505,2
3,93,9497,376,29,650,32,183,0,59,126,...,21,86,3072,1052,8,72,1734,68,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,26,953,39,31,19,136,230,0,18,133,...,12,6,334,538,7,261,817,224,2,4
56,13,1139,55,18,15,863,29,0,49,149,...,20,5,356,511,8,332,338,174,1,3
57,18,882,87,20,21,114,8,0,66,32,...,5,6,355,722,4,299,296,202,3,5
58,8,934,167,28,30,122,36,0,33,60,...,8,4,329,695,9,272,292,256,1,11


In [31]:
# Line plot of the `transactions` column of `df_ico_resample_day` against its index.
# NOTE(review): `df_ico_resample_day` is not defined in any visible cell — presumably it
# was something like `ico_parser.df_resample_day` from an earlier session; TODO confirm.
# NOTE(review): this cell needs `go` (plotly.graph_objects) imported first; the saved
# output is a NameError raised because `go` was not defined when it ran.
fig = go.Figure(data=go.Scatter(x=df_ico_resample_day.index, y=df_ico_resample_day['transactions'], mode='lines'))
#fig.add_trace(go.Scatter(x=df_resample_hours_fraud.index, y=df_resample_hours_fraud['gas'], mode='lines'))
#fig.update_layout(yaxis_type="log")
fig.show()

NameError: name 'go' is not defined

## 5. Creating the first DNN for our dataset

In [5]:
# Binary classification of ICO fraud labels: standardized inputs, larger network
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [37]:
# Display the feature table (ICO name plus the 60 day columns, no label).
df_plot

Unnamed: 0,ico,0,1,2,3,4,5,6,7,8,...,50,51,52,53,54,55,56,57,58,59
0,0XCERT,1053.0,646.0,158.0,93.0,57.0,247.0,114.0,125.0,48.0,...,28.0,9.0,14.0,4.0,16.0,26.0,13.0,18.0,8.0,11.0
1,0X,22670.0,21019.0,9884.0,9497.0,14077.0,4066.0,3265.0,2618.0,3187.0,...,645.0,585.0,729.0,783.0,1174.0,953.0,1139.0,882.0,934.0,1144.0
2,4NEW,5417.0,3336.0,377.0,376.0,206.0,126.0,93.0,178.0,217.0,...,164.0,161.0,93.0,78.0,74.0,39.0,55.0,87.0,167.0,116.0
3,ABULABA,342.0,25.0,11.0,29.0,9.0,8.0,9.0,5.0,31.0,...,21.0,7.0,5.0,6.0,14.0,31.0,18.0,20.0,28.0,22.0
4,AERGO,657.0,576.0,2250.0,650.0,267.0,225.0,144.0,89.0,83.0,...,38.0,22.0,71.0,38.0,23.0,19.0,15.0,21.0,30.0,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,UMA,567.0,254.0,126.0,72.0,16.0,45.0,20.0,19.0,20.0,...,98.0,212.0,142.0,188.0,354.0,261.0,332.0,299.0,272.0,231.0
146,UTRUST,3820.0,4002.0,2034.0,1734.0,1379.0,1324.0,1786.0,943.0,1075.0,...,226.0,262.0,208.0,941.0,4155.0,817.0,338.0,296.0,292.0,241.0
147,WALTONCHAIN,576.0,10.0,15.0,68.0,113.0,91.0,49.0,0.0,27.0,...,648.0,917.0,423.0,340.0,303.0,224.0,174.0,202.0,256.0,194.0
148,ZPAY,3956.0,9.0,505.0,9.0,273.0,84.0,53.0,20.0,15.0,...,1.0,11.0,3.0,2.0,0.0,2.0,1.0,3.0,1.0,1.0


In [49]:
# Keep only the rows that have a fraud label (drop rows where `is_fraud` is NaN).
df_dataset_filtered = df_dataset.loc[df_dataset['is_fraud'].notnull()]
df_dataset_filtered.head()

Unnamed: 0,ico,0,1,2,3,4,5,6,7,8,...,51,52,53,54,55,56,57,58,59,is_fraud
0,0XCERT,1053.0,646.0,158.0,93.0,57.0,247.0,114.0,125.0,48.0,...,9.0,14.0,4.0,16.0,26.0,13.0,18.0,8.0,11.0,1.0
1,0X,22670.0,21019.0,9884.0,9497.0,14077.0,4066.0,3265.0,2618.0,3187.0,...,585.0,729.0,783.0,1174.0,953.0,1139.0,882.0,934.0,1144.0,0.0
2,4NEW,5417.0,3336.0,377.0,376.0,206.0,126.0,93.0,178.0,217.0,...,161.0,93.0,78.0,74.0,39.0,55.0,87.0,167.0,116.0,1.0
3,ABULABA,342.0,25.0,11.0,29.0,9.0,8.0,9.0,5.0,31.0,...,7.0,5.0,6.0,14.0,31.0,18.0,20.0,28.0,22.0,1.0
4,AERGO,657.0,576.0,2250.0,650.0,267.0,225.0,144.0,89.0,83.0,...,22.0,71.0,38.0,23.0,19.0,15.0,21.0,30.0,69.0,0.0


In [50]:
# Convert the labelled dataset to a NumPy array. The result has object dtype
# because the first column holds the ICO names.
dataset = df_dataset_filtered.to_numpy()
dataset

array([['0XCERT', 1053.0, 646.0, ..., 8.0, 11.0, 1.0],
       ['0X', 22670.0, 21019.0, ..., 934.0, 1144.0, 0.0],
       ['4NEW', 5417.0, 3336.0, ..., 167.0, 116.0, 1.0],
       ...,
       ['WALTONCHAIN', 576.0, 10.0, ..., 256.0, 194.0, 1.0],
       ['ZPAY', 3956.0, 9.0, ..., 1.0, 1.0, 1.0],
       ['ZYNECOIN', 147.0, 129.0, ..., 11.0, 4.0, 1.0]], dtype=object)

In [59]:
# split into input (X) and output (Y) variables
# X: array columns 1..60 (the day columns), cast to float.
# Y: array column 61 (`is_fraud`), left as an object array.
X = dataset[:,1:61].astype(float)
Y = dataset[:,61]

In [60]:
# Inspect the feature matrix.
X

array([[1.0530e+03, 6.4600e+02, 1.5800e+02, ..., 1.8000e+01, 8.0000e+00,
        1.1000e+01],
       [2.2670e+04, 2.1019e+04, 9.8840e+03, ..., 8.8200e+02, 9.3400e+02,
        1.1440e+03],
       [5.4170e+03, 3.3360e+03, 3.7700e+02, ..., 8.7000e+01, 1.6700e+02,
        1.1600e+02],
       ...,
       [5.7600e+02, 1.0000e+01, 1.5000e+01, ..., 2.0200e+02, 2.5600e+02,
        1.9400e+02],
       [3.9560e+03, 9.0000e+00, 5.0500e+02, ..., 3.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.4700e+02, 1.2900e+02, 2.0000e+00, ..., 5.0000e+00, 1.1000e+01,
        4.0000e+00]])

In [61]:
# Inspect the raw label vector (object dtype, values 0.0 / 1.0).
Y

array([1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0,
       0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0,
       1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0,
       1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0,
       1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,
       1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0,
       0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0,
       1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0,
       0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
       1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0,
       1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
       1.0, 1.0], dtype=object)

In [62]:
# Binary classification of ICO fraud labels: standardized inputs, larger network
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [63]:
# Map the class labels in Y to integer codes; `encoder` stays fitted for later use.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [64]:
# Sanity check: (number of labelled ICOs, number of day features).
X.shape

(145, 60)

In [65]:
# Inspect the integer-encoded labels.
encoded_Y

array([1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1])

In [66]:
# larger model
def create_larger():
    """Build and compile the fully connected binary classifier.

    Architecture: 60 inputs -> Dense(60, relu) -> Dense(30, relu) -> Dense(1, sigmoid).

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    layers = [
        Dense(60, input_dim=60, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [67]:
# Pipeline steps: standardize the features, then fit the Keras network
# wrapped as a scikit-learn classifier.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_larger, epochs=100, batch_size=5, verbose=0)),
]

In [68]:
# Evaluate the pipeline with stratified 10-fold cross-validation.
# The shuffle is seeded so the fold assignment is identical on every run.
# No TensorFlow seed is set in the visible cells, so scores can still vary
# between runs because of the network's random initialisation.
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
# Report mean accuracy and its standard deviation across folds, in percent.
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 68.90% (11.02%)


In [257]:
# SGD with momentum, for models compiled by hand.
# `learning_rate` replaces the deprecated `lr` keyword. The import is local to
# this cell because `tf` is not imported in any other visible cell.
import tensorflow as tf

optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4, momentum=0.9)

In [260]:
# Compile `model` with the custom SGD optimizer and binary cross-entropy loss.
# NOTE(review): in the visible cells `model` is only created further down, so on a
# fresh top-to-bottom run this cell would presumably raise a NameError — confirm ordering.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [70]:
# Stand-alone copy of the 60-60-30-1 network, trained on the whole labelled
# dataset (X as built above, without standardization) for 100 epochs using
# the default batch size.
model = Sequential([
    Dense(60, input_dim=60, activation='relu'),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Compile with the same loss, optimizer and metric as `create_larger`.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, encoded_Y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Inspect the integer-encoded labels (this cell has no recorded execution).
encoded_Y

In [11]:
# Map the class labels in Y to integer codes; `encoder` stays fitted for later use.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [12]:
# larger model
def create_larger():
    """Build and compile the fully connected binary classifier.

    Architecture: 60 inputs -> Dense(60, relu) -> Dense(30, relu) -> Dense(1, sigmoid).

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    layers = [
        Dense(60, input_dim=60, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [13]:
# Pipeline steps: standardize the features, then fit the Keras network
# wrapped as a scikit-learn classifier.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_larger, epochs=100, batch_size=5, verbose=0)),
]

In [14]:
# Evaluate the pipeline with stratified 10-fold cross-validation.
# The shuffle is seeded so the fold assignment is identical on every run.
# No TensorFlow seed is set in the visible cells, so scores can still vary
# between runs because of the network's random initialisation.
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
# Report mean accuracy and its standard deviation across folds, in percent.
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 83.64% (8.05%)
