# Titanic LightGBM

Kaggle score: 

重要：
- 因为model.fit(features.as_matrix(), survived.as_matrix(), batch_size = 2, epochs = 20)需要numpy.array输入，而不是pandas.DataFrame，这里需要DataFrame.as_matrix()转换
- 因为使用了kernel_initializer = 'uniform'，导致报错：InternalError: Blas GEMM launch failed

Reference: 
1. https://www.kaggle.com/c/titanic#tutorials
2. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
3. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook


### Import pkgs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support

## Run name

In [2]:
project_name = 'Titanic'
step_name = 'LightGBM'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

run_name: Titanic_LightGBM_20210515_181002


In [3]:
project_folder_name = project_name
print(project_folder_name)

nrows = 0
random_num = 2021

print(nrows)
print(random_num)

Titanic
0
2021


## Project folders

In [4]:
CWD = os.getcwd()
input_folder = os.path.join(CWD, 'input')
model_folder = os.path.join(CWD, 'model')
output_folder = os.path.join(CWD, 'output')
figure_folder = os.path.join(CWD, 'figure')
test_folder = os.path.join(CWD, 'test')
temp_folder = os.path.join(CWD, 'temp')
raw_data_folder = os.path.join(CWD, 'raw_data')

project_folder = os.path.join(output_folder, project_folder_name)

for folder in [
    input_folder, model_folder, output_folder, figure_folder, test_folder, temp_folder, raw_data_folder, 
    project_folder,
]:
    if not os.path.exists(folder):
        os.mkdir(folder)
        print('Folder created:', folder)
    else:
        print('Folder already exist:', folder)

Folder already exist: D:\业余项目\qiang-power\input
Folder already exist: D:\业余项目\qiang-power\model
Folder already exist: D:\业余项目\qiang-power\output
Folder already exist: D:\业余项目\qiang-power\figure
Folder already exist: D:\业余项目\qiang-power\test
Folder already exist: D:\业余项目\qiang-power\temp
Folder already exist: D:\业余项目\qiang-power\raw_data
Folder already exist: D:\业余项目\qiang-power\output\Titanic


In [5]:
train_csv_file = os.path.join(raw_data_folder, 'data_train.csv')
test_csv_file = os.path.join(raw_data_folder, 'data_test.csv')
print(train_csv_file)
print(test_csv_file)

D:\业余项目\qiang-power\raw_data\data_train.csv
D:\业余项目\qiang-power\raw_data\data_test.csv


### Import original data as DataFrame

In [6]:
data_train = pd.read_csv(train_csv_file)
data_test = pd.read_csv(test_csv_file)

display(data_train.head(20))
display(data_test.head(20))

Unnamed: 0,live alone,0,1,2,3,4,5,6,7,8,...,5082,5083,5084,5085,5086,5087,5088,5089,5090,5091
0,0,58.443,1.656,0.014,42.546,15.897,0.177275,0.165594,0.95,1.656,...,0.066,0.03175,0.2515,0.032738,0.002976,0.0,0.354167,0.026973,1.675633,3.498237
1,1,196.85,3.151,0.205,147.129,49.721,0.613038,0.517927,2.7,3.151,...,0.5915,0.327,0.89925,0.589286,0.202381,0.03869,0.395833,0.260297,1.84987,4.364473
2,1,324.851,5.645,0.012,213.367,111.484,0.889029,1.161292,5.645,5.629,...,1.1655,0.45475,1.84,0.720238,0.559524,0.21131,0.416667,1.137758,1.462098,2.810261
3,1,153.976,3.805,0.115,105.592,48.384,0.439967,0.504,3.805,3.696,...,0.6515,0.3495,1.29075,0.583333,0.35119,0.139881,0.360119,0.751666,1.498387,1.633043
4,1,205.985,2.791,0.144,132.419,73.566,0.551746,0.766312,2.722,2.791,...,0.7585,0.4905,1.4465,0.729167,0.380952,0.130952,0.357143,0.562726,1.357149,1.717601
5,0,74.451,4.133,0.047,55.87,18.581,0.232792,0.193552,4.133,1.405,...,0.098,0.038,0.1675,0.053571,0.008929,0.0,0.28869,0.032203,3.492725,17.096458
6,1,123.588,1.921,0.074,85.444,38.144,0.356017,0.397333,1.921,1.176,...,0.4115,0.192,0.66725,0.407738,0.03869,0.0,0.458333,0.080506,0.990989,1.106434
7,1,28.074,0.979,0.05,20.719,7.355,0.086329,0.076615,0.979,0.1,...,0.027,0.026,0.03825,0.0,0.0,0.0,0.255952,0.000483,3.145575,17.275897
8,0,129.033,3.641,0.046,86.586,42.447,0.360775,0.442156,3.641,3.436,...,0.241,0.153,0.365,0.107143,0.047619,0.008929,0.309524,0.09688,3.790354,19.272617
9,1,304.253,3.139,0.354,227.247,77.006,0.946863,0.802146,3.139,2.977,...,0.6175,0.50875,0.78025,0.800595,0.14881,0.035714,0.270833,0.207552,2.826386,9.229339


Unnamed: 0,live alone,0,1,2,3,4,5,6,7,8,...,5082,5083,5084,5085,5086,5087,5088,5089,5090,5091
0,1,110.599,3.711,0.05,47.533,63.066,0.198054,0.656937,0.236,3.711,...,0.914,0.393,1.853,0.678571,0.47619,0.220238,0.383929,1.106346,1.324739,1.31566
1,1,199.633,4.461,0.054,146.867,52.766,0.611946,0.549646,4.461,2.956,...,1.2485,0.8065,1.88875,0.982143,0.613095,0.229167,0.377976,0.873395,1.515718,2.372774
2,0,32.943,1.344,0.02,17.724,15.219,0.07385,0.158531,1.344,0.656,...,0.11,0.03775,0.30425,0.077381,0.008929,0.005952,0.369048,0.078284,5.387899,47.307501
3,1,174.263,3.694,0.083,122.124,52.139,0.50885,0.543115,3.694,3.113,...,0.504,0.19275,1.23175,0.505952,0.321429,0.136905,0.372024,0.88861,1.670407,2.542706
4,1,99.037,1.801,0.082,69.574,29.463,0.289892,0.306906,1.688,1.801,...,0.7075,0.331,1.1525,0.607143,0.330357,0.080357,0.404762,0.589455,2.223621,6.440414
5,1,89.018,1.732,0.039,65.879,23.139,0.274496,0.241031,1.732,1.394,...,0.1795,0.08175,0.31025,0.080357,0.032738,0.0,0.404762,0.06549,2.816788,10.062656
6,1,246.329,4.86,0.112,166.695,79.634,0.694562,0.829521,4.194,4.86,...,0.909,0.52175,1.5265,0.767857,0.434524,0.181548,0.345238,0.731906,1.728772,4.901681
7,1,62.104,1.813,0.014,26.825,35.279,0.111771,0.36749,1.754,1.813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180393,-0.380838
8,1,126.407,4.204,0.215,75.247,51.16,0.313529,0.532917,0.326,4.204,...,0.0,0.0,0.0,0.377976,0.113095,0.050595,0.0,0.0,2.859848,8.848587
9,1,124.318,8.44,0.022,97.322,26.996,0.405508,0.281208,8.44,5.188,...,0.322,0.203,0.50425,0.252976,0.181548,0.068452,0.220238,0.713589,3.090653,11.232273


### Show columns of dataframe

In [7]:
data_train_original_col = data_train.columns
data_test_original_col = data_test.columns
print(data_train_original_col)
print(data_test_original_col)
# data_train0 = data_train.drop(data_train_original_col, axis = 1)
# data_test0  = data_test.drop(data_test_original_col, axis = 1)
# display(data_train0.head(2))
# display(data_test0.head(2))

Index(['live alone', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '5082', '5083', '5084', '5085', '5086', '5087', '5088', '5089', '5090',
       '5091'],
      dtype='object', length=5093)
Index(['live alone', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '5082', '5083', '5084', '5085', '5086', '5087', '5088', '5089', '5090',
       '5091'],
      dtype='object', length=5093)


### Preprocess features

In [8]:
# column_x_arr = list(data_train.columns)[1:]
column_x_arr = [str(i) for i in range(0, 4490)]
column_y = list(data_train.columns)[0]
print(len(column_x_arr), column_x_arr[:5], column_x_arr[-5:])
print(column_y)

4490 ['0', '1', '2', '3', '4'] ['4485', '4486', '4487', '4488', '4489']
live alone


In [9]:
features = data_train[column_x_arr]
survived = data_train[column_y]

data_test0 = data_test[column_x_arr]

display(features.shape, features.head(2))
display(survived.shape, survived.head(2))
display(data_test0.shape, data_test0.head(2))

(4000, 4490)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4480,4481,4482,4483,4484,4485,4486,4487,4488,4489
0,58.443,1.656,0.014,42.546,15.897,0.177275,0.165594,0.95,1.656,0.014,...,0.041,0.1985,0.047619,0.008929,0.0,0.309524,0.031521,2.968655,11.859982,49.922
1,196.85,3.151,0.205,147.129,49.721,0.613038,0.517927,2.7,3.151,0.235,...,0.24775,0.66475,0.363095,0.151786,0.02381,0.3125,0.299993,2.863169,12.370617,200.562


(4000,)

0    0
1    1
Name: live alone, dtype: int64

(232, 4490)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4480,4481,4482,4483,4484,4485,4486,4487,4488,4489
0,110.599,3.711,0.05,47.533,63.066,0.198054,0.656937,0.236,3.711,0.15,...,0.0905,1.108,0.422619,0.282738,0.10119,0.348214,0.915769,1.794087,2.930483,263.522
1,199.633,4.461,0.054,146.867,52.766,0.611946,0.549646,4.461,2.956,0.061,...,0.50825,1.35425,0.758929,0.39881,0.104167,0.375,0.617829,1.711954,3.551951,352.803


## 2. Build model

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgb

x_data = features
y_data = survived
x_test = data_test0

# n_components = random.choice([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150])
n_components = 100
print('n_components: %s' % n_components)

pca = PCA(n_components=n_components)
pca.fit(x_data)
#pca.fit(x_test)

x_data = pca.transform(x_data)
x_test = pca.transform(x_test)

random_num = np.random.randint(10000)
print('random_num: %s' % random_num)
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=random_num)
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)
print(x_test.shape)


lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)

# LightGBM parameters
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['auc', 'binary_logloss'],
#         'num_class': 3,
    'learning_rate': random.choice([0.03, 0.1]),
    'num_leaves': random.choice([5, 10, 20, 30, 40]),
    'max_depth': random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12]),
    'n_estimators': random.choice([2000, 5000, 10000]),
    'min_data_in_leaf': random.choice([5, 10, 20, 30, 40]),
    'num_iteration': random.choice([30, 40, 60, 80]),
    'verbose': 0
}

print('params: %s' % params)

# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=lgb_eval,
    early_stopping_rounds=10
)

y_val_prob = gbm.predict(x_val, num_iteration=gbm.best_iteration)
print(y_val_prob.shape)
print(y_val_prob[:10])
val_pred_test = (y_val_prob>=0.5).astype(int)
print(val_pred_test[:10])
val_acc = accuracy_score(val_pred_test, y_val)
print('val_acc: %.4f' % val_acc)
print('*' * 60)


n_components: 100
random_num: 4525
(3600, 100)
(400, 100)
(3600,)
(400,)
(232, 100)
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': ['auc', 'binary_logloss'], 'learning_rate': 0.1, 'num_leaves': 30, 'max_depth': 6, 'n_estimators': 2000, 'min_data_in_leaf': 40, 'num_iteration': 80, 'verbose': 0}
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.793221	valid_0's binary_logloss: 0.494302
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.802838	valid_0's binary_logloss: 0.477713
[3]	valid_0's auc: 0.807955	valid_0's binary_logloss: 0.463067
[4]	valid_0's auc: 0.810775	valid_0's binary_logloss: 0.452228
[5]	valid_0's auc: 0.811914	valid_0's binary_logloss: 0.443911
[6]	valid_0's auc: 0.819832	valid_0's binary_logloss: 0.432367
[7]	valid_0's auc: 0.824725	valid_0's binary_logloss: 0.425974
[8]	valid_0's auc: 0.82084	valid_0's binary_logloss: 0.422832
[9]	valid_0's auc: 0.818973	valid_0's binary_l



In [11]:
y_test_proba = gbm.predict(x_test, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_test_proba.shape)
print(y_test_proba[:200])
y_test_pred = (y_test_proba>=0.5).astype(int)
print(y_test_pred[:200])

(232,)
[0.89733894 0.88832239 0.54028517 0.89733894 0.88757753 0.74382113
 0.89733894 0.76954313 0.89733894 0.70083645 0.89733894 0.87423045
 0.5464281  0.7174861  0.51339668 0.88814648 0.89733894 0.64074891
 0.81177564 0.8802702  0.74706774 0.89733894 0.88757753 0.85507174
 0.88142378 0.69358993 0.88201973 0.88814648 0.89733894 0.83776962
 0.89733894 0.89733894 0.89733894 0.88201973 0.89733894 0.59653336
 0.79821795 0.531008   0.87649215 0.88814648 0.86401573 0.61296779
 0.87872205 0.70581424 0.89506247 0.88757753 0.88111396 0.86303608
 0.89290841 0.7983112  0.72461777 0.88814648 0.82317471 0.89733894
 0.59781655 0.89049459 0.89733894 0.86820769 0.64305166 0.89733894
 0.89733894 0.76597695 0.89733894 0.81615581 0.88832239 0.68313032
 0.63135546 0.88814648 0.89733894 0.89733894 0.89819905 0.88901335
 0.85604861 0.83596433 0.89733894 0.75115715 0.82232106 0.86973217
 0.89733894 0.79440019 0.73330205 0.5868998  0.89733894 0.86535503
 0.88391967 0.53907328 0.57046158 0.81470443 0.83338821

In [12]:
y_data_proba = gbm.predict(x_data, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_data_proba.shape)
print(y_data_proba[:200])
y_data_pred = (y_data_proba>=0.5).astype(int)
print(y_data_pred[:200])

y_data_proba = gbm.predict(x_data, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_data_proba.shape)
print(y_data_proba[:200])
y_data_pred = (y_data_proba>=0.5).astype(int)
print(y_data_pred[:200])

(4000,)
[0.50280786 0.89733894 0.89733894 0.88876129 0.89733894 0.54819188
 0.88814648 0.81957383 0.79174889 0.89733894 0.87166074 0.83063431
 0.73213248 0.57096177 0.49015114 0.55376819 0.89733894 0.87803934
 0.89733894 0.88901335 0.6810048  0.89733894 0.51127788 0.89733894
 0.8723442  0.89733894 0.88907408 0.88814648 0.8802702  0.85789033
 0.7692165  0.88111396 0.88546296 0.749841   0.69869624 0.87355203
 0.89733894 0.77881455 0.89733894 0.74715284 0.80691485 0.84823595
 0.89733894 0.88814648 0.86746462 0.62271273 0.84947468 0.89733894
 0.81521023 0.86789214 0.82473758 0.59415924 0.89733894 0.86122403
 0.86808848 0.82751952 0.5673867  0.8108387  0.89819905 0.51417067
 0.46757477 0.89733894 0.89733894 0.89160744 0.67655585 0.7043278
 0.88814648 0.71891942 0.85024907 0.88142378 0.89733894 0.89122432
 0.75388621 0.71638713 0.64701637 0.51221159 0.53260326 0.89215513
 0.83660093 0.50989824 0.87849935 0.60073938 0.89733894 0.88201973
 0.71092059 0.89733894 0.83473683 0.71876097 0.89733894

In [13]:
y_train_proba = gbm.predict(x_train, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_train_proba.shape)
print(y_train_proba[:20])
y_train_pred = (y_train_proba>=0.5).astype(int)
print(y_train_pred[:20])

precision, recall, fscore, support = precision_recall_fscore_support(y_train_pred, list(y_train), average='weighted')
print(precision, recall, fscore, support)
train_acc = accuracy_score(y_train_pred, list(y_train))
print(train_acc)

(3600,)
[0.5554848  0.87565721 0.51387641 0.89733894 0.88814648 0.63698028
 0.46610593 0.89733894 0.56388523 0.82317471 0.89733894 0.89733894
 0.75482045 0.81898139 0.79712579 0.78881368 0.87565721 0.89733894
 0.88814648 0.89733894]
[1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1]
0.987136878385032 0.8258333333333333 0.8925029854551191 None
0.8258333333333333


In [14]:
y_val_proba = gbm.predict(x_val, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_val_proba.shape)
print(y_val_proba[:20])
y_val_pred = (y_val_proba>=0.5).astype(int)
print(y_val_pred[:20])

precision, recall, fscore, support = precision_recall_fscore_support(y_val_pred, list(y_val), average='weighted')
print(precision, recall, fscore, support)
val_acc = accuracy_score(y_val_pred, list(y_val))
print(val_acc)

(400,)
[0.49907293 0.77735905 0.82416189 0.78933534 0.89733894 0.87890442
 0.80472622 0.88814648 0.6937683  0.86932602 0.87609701 0.84121109
 0.89733894 0.7954855  0.71303761 0.7118864  0.76723968 0.47985876
 0.63101884 0.81208729]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1]
0.9706414565826331 0.785 0.8642980840088431 None
0.785


In [15]:
print('random_num: %s' % random_num)
print('val_acc: %.3f' % val_acc)

random_num: 4525
val_acc: 0.785


## 4. Predict and Export titanic_pred.csv file

In [16]:
random_num = str(int(random_num)).zfill(4)
print(random_num)

run_name_acc = run_name + '_' + str(int(val_acc*10000)).zfill(4)
print(run_name_acc)

4525
Titanic_LightGBM_20210515_181002_7850


In [17]:
def save_proba(y_data_proba, y_data, y_test_proba, file_name):
    if os.path.exists(file_name):
        os.remove(file_name)
        print('Remove file: %s' % file_name)
    with h5py.File(file_name) as h:
        h.create_dataset('y_data_proba', data=y_data_proba)
        h.create_dataset('y_data', data=y_data)
        h.create_dataset('y_test_proba', data=y_test_proba)
    print('Save file: %s' % file_name)

def load_proba(file_name):
    with h5py.File(file_name, 'r') as h:
        y_data_proba = np.array(h['y_data_proba'])
        y_data = np.array(h['y_data'])
        y_test_proba = np.array(h['y_test_proba'])
    print('Load file: %s' % file_name)
    return y_data_proba, y_data, y_test_proba

In [18]:
# y_proba_file = os.path.join(model_temp_folder, 'titanic_proba_%s_%s.p' % (run_name_acc, random_num))
# save_proba(y_data_proba, y_data, y_test_proba, y_proba_file)
# y_data_proba, y_data, y_test_proba = load_proba(y_proba_file)

# print(y_data_proba.shape)
# print(y_data.shape)
# print(y_test_proba.shape)

In [19]:
# passenger_id = data_test['PassengerId']
# output = pd.DataFrame( { 'PassengerId': passenger_id , 'Survived': y_test_pred })

# output_csv_file = os.path.join(output_temp_folder, '%s_%s.csv' % (run_name_acc, random_num))
# output.to_csv(output_csv_file, index = False)
# print(output_csv_file)
# print('\n%s_%s' % (run_name_acc, random_num))

## Preview

In [20]:
## 加载其他数据
fake_csv_file = os.path.join(raw_data_folder, 'data_fake.csv')
print(fake_csv_file)

D:\业余项目\qiang-power\raw_data\data_fake.csv


In [21]:
fake_data = pd.read_csv(fake_csv_file)
fake_data.fillna(0, inplace = True)
display(fake_data.shape, fake_data.head(2))

(1858, 5093)

Unnamed: 0,live alone,0,1,2,3,4,5,6,7,8,...,5082,5083,5084,5085,5086,5087,5088,5089,5090,5091
0,1,76.6,-1.75,6.46,45.5,11.2,-0.631,-0.631,-0.631,-0.631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,73.0,-1.7,6.13,43.3,10.6,-0.631,-0.631,-0.631,-0.631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
fake_x = fake_data[column_x_arr]
fake_y = fake_data[column_y]
display(fake_x.shape, fake_x.head(2))
display(fake_y.shape, fake_y.head(2))

(1858, 4490)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4480,4481,4482,4483,4484,4485,4486,4487,4488,4489
0,76.6,-1.75,6.46,45.5,11.2,-0.631,-0.631,-0.631,-0.631,-0.631,...,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,0.0
1,73.0,-1.7,6.13,43.3,10.6,-0.631,-0.631,-0.631,-0.631,-0.631,...,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,-0.631,0.0


(1858,)

0    1
1    1
Name: live alone, dtype: int64

In [23]:
# pca.fit(fake_x)
fake_x_pca = pca.transform(fake_x)
type(fake_x_pca)

numpy.ndarray

In [24]:
y_data_proba = gbm.predict(fake_x_pca, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_data_proba.shape)
print(y_data_proba[:100])
y_data_pred = (y_data_proba>=0.5).astype(int)
print(y_data_pred)

(1858,)
[0.89049459 0.88939918 0.75446537 0.74510548 0.89733894 0.78279678
 0.89733894 0.89733894 0.89819905 0.66280497 0.88757753 0.89049459
 0.89049459 0.89733894 0.89049459 0.88757753 0.78281449 0.88757753
 0.89380125 0.70028229 0.89733894 0.85425662 0.89733894 0.88757753
 0.88757753 0.88111396 0.80382836 0.89733894 0.75446537 0.88757753
 0.80382836 0.75446537 0.73536632 0.6848969  0.63373087 0.75446537
 0.88757753 0.89733894 0.716086   0.79594687 0.89049459 0.8914051
 0.89733894 0.61009388 0.79791408 0.75446537 0.89733894 0.88757753
 0.88757753 0.70123465 0.69525155 0.7662342  0.73536632 0.76862072
 0.8914051  0.88757753 0.88757753 0.78279678 0.88757753 0.79594687
 0.89819905 0.86901164 0.88850926 0.89380125 0.79594687 0.88757753
 0.79594687 0.89733894 0.76862072 0.89049459 0.75446537 0.78281449
 0.81702578 0.89733894 0.78279678 0.89733894 0.89819905 0.88757753
 0.88757753 0.79594687 0.73536632 0.79594687 0.89049459 0.61009388
 0.76550197 0.88850926 0.89049459 0.89733894 0.88808528

In [25]:
for idx, row in enumerate(y_data_pred):
    print(row, end=' ')

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [26]:
accuracy_score(y_data_pred, fake_y)

0.5215285252960172

In [27]:
precision, recall, fscore, support = precision_recall_fscore_support(y_data_pred, fake_y, average='weighted')
print(precision, recall, fscore, support)

0.9688062760971115 0.5215285252960172 0.652796574198942 None


In [28]:
precision_recall_fscore_support(y_data_pred, fake_y)

(array([0.06519453, 1.        ]),
 array([1.        , 0.50501114]),
 array([0.12240869, 0.67110618]),
 array([  62, 1796], dtype=int64))

In [29]:
print('Done!')

Done!
