In [None]:
import numpy as np
import pandas as pd
import sklearn.naive_bayes as NB
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from google.colab import drive

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Flatten
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from tensorflow import keras
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics import precision_score, recall_score, confusion_matrix
import itertools


# Mount Google Drive; the CSV files read below live under /content/gdrive.
drive.mount('/content/gdrive')

# Default figure size for every plot, plus the active colour cycle that the
# plotting helpers index into.
mpl.rcParams['figure.figsize'] = (12, 10)
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

In [None]:
# Load the train / test feature tables from Drive.
train_ds = pd.read_csv('/content/gdrive/MyDrive/data/std_ANOVA_x_train.csv')
test_ds = pd.read_csv('/content/gdrive/MyDrive/data/std_ANOVA_x_test.csv')

# Column selection is driven by the training table so that both frames end up
# with the same columns in the same order.
columns = train_ds.columns.tolist()
# Optional column drops (column names are kept exactly as they appear in the data):
# columns.remove('最高學歷')
# columns.remove('畢業學校類別')
train_ds, test_ds = train_ds.loc[:, columns], test_ds.loc[:, columns]
# test_ds.drop(columns='PerStatus', inplace=True)
# print(len(train_ds))
# print(len(test_ds))

In [None]:
# nb_train_samples = 5217
# nb_validation_samples = 17
# Training hyper-parameters: number of epochs, mini-batch size, and the number
# of input features given to the first Dense layer.
EPOCHS, BATCH_SIZE, INPUT_DIM = 50, 16, 10

In [None]:
# train_ds.dropna(inplace=True)
# Split the test rows into those with no missing value (used for prediction)
# and those with at least one missing value.
rows_with_missing = test_ds.isna().any(axis=1)
test_ds_valid = test_ds[~rows_with_missing].copy()
test_ds_invalid = test_ds[rows_with_missing].copy()


In [None]:
def F_beta_score(y_true, y_pred, beta=1.5):
    """Return the F-beta score of a set of binary predictions.

    The score is ``(1 + beta**2) * P * R / (beta**2 * P + R)`` where ``P`` and
    ``R`` are the values returned by ``precision_score`` and ``recall_score``.

    Args:
        y_true: ground-truth labels, forwarded unchanged to the sklearn metrics.
        y_pred: predicted labels, forwarded unchanged to the sklearn metrics.
        beta: weight of recall relative to precision (default 1.5).

    Returns:
        The F-beta score, or ``0.0`` when the denominator is zero (precision
        and recall both zero), where the formula is otherwise undefined.
    """
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    denominator = beta**2 * prec + rec
    if denominator == 0:
        # No true positives at all: avoid 0/0 (ZeroDivisionError or NaN).
        return 0.0
    return (1 + beta**2) * (prec * rec) / denominator

In [None]:
# Data preparation
# X = train_ds.drop(columns='PerStatus', inplace=False)
Y = pd.read_csv('/content/gdrive/MyDrive/data/y_train.csv')
Y = Y.dropna()
# Keep only the feature rows whose label survived dropna(); without this, a
# single missing label leaves X and Y with different lengths and the split fails.
# NOTE(review): relies on train_ds and y_train.csv sharing the same row order/index — confirm.
X = train_ds.loc[Y.index]
print(Y.PerStatus.value_counts())  # label distribution

train_x, valid_x, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=1)

In [None]:
# Class-imbalance summary: count the rows of each PerStatus value.
label_counts = Y.PerStatus.value_counts()
number_pos = label_counts[1]
number_neg = label_counts[0]
total = number_pos + number_neg
# The figures printed are the positive-class count and share, so the label
# says "Positive" (it previously read "Negative").
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, number_pos, 100 * number_pos / total))

In [None]:
# Initial output bias: natural log of the positive-to-negative count ratio,
# kept as a one-element array.
pos_to_neg_ratio = number_pos / number_neg
initial_bias = np.log([pos_to_neg_ratio])
initial_bias

In [None]:
# Bar chart of the label distribution.
# NOTE(review): assumes PerStatus 0 means "Stay" and 1 means "Resign" — confirm
# against the data dictionary.
label = ['Stay', 'Resign']
# Counts are listed in class order (0, 1) so each bar sits under its own tick
# label; they were previously [number_pos, number_neg], which swapped the bars.
math_scores = [number_neg, number_pos]
x = np.arange(len(label))
plt.bar(x, math_scores, color=['green', 'blue'])
plt.xticks(x, label)
plt.xlabel('PerStatus')
plt.ylabel('People')
plt.title('Data Distribution')
plt.show()

In [None]:
# Class weights for the imbalanced labels: each class gets
# total / (2 * class_count), so the rarer class is weighted more heavily.
weight_for_0 = (1 / number_neg) * (total) / 2.0
weight_for_1 = (1 / number_pos) * (total) / 2.0
class_weight = {
    0: weight_for_0,
    1: weight_for_1,
}

print(f'Weight for class 0: {weight_for_0:.2f}')
print(f'Weight for class 1: {weight_for_1:.2f}')

In [None]:
# Metrics reported by Keras during training and evaluation.
# The confusion-count metrics are kept for reference but disabled.
METRICS = [
    # keras.metrics.TruePositives(name='tp'),
    # keras.metrics.FalsePositives(name='fp'),
    # keras.metrics.TrueNegatives(name='tn'),
    # keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    # Area under the precision-recall curve.
    keras.metrics.AUC(name='prc', curve='PR'),
]

In [None]:
def plot_metrics(history, metrics=('loss', 'prc', 'precision', 'recall')):
  """Plot the training (and, when logged, validation) curve of each metric.

  Curves are laid out on a two-column grid of subplots, one subplot per metric.

  Args:
    history: object exposing ``epoch`` (x values) and ``history`` (mapping of
      metric name to per-epoch values) — presumably a Keras ``History``.
    metrics: names of the metrics to plot. Defaults to the four metrics this
      notebook has always shown, which yields the original 2x2 grid.
  """
  n_rows = max(1, (len(metrics) + 1) // 2)
  for n, metric in enumerate(metrics):
    name = metric.replace("_", " ").capitalize()
    plt.subplot(n_rows, 2, n + 1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    # The validation curve only exists when validation data was supplied.
    val_key = 'val_' + metric
    if val_key in history.history:
      plt.plot(history.epoch, history.history[val_key],
               color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      # Keep the automatic upper bound but anchor the axis at zero.
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      plt.ylim([0.8, 1])
    else:
      plt.ylim([0, 1])

    plt.legend()

In [None]:
# nb_train_samples = 5217
# nb_validation_samples = 17
# epochs = 1
# batch_size = 16

In [None]:
# Early-stopping callback.
# Monitor the validation precision ('val_' + the 'precision' metric name from
# METRICS) rather than the training precision: fit() is given validation_data,
# and stopping / restoring the best weights on a training metric does not
# guard against over-fitting.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_precision',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [None]:
# #建model
# def model(metrics = METRICS, output_bias=None):

#     model = Sequential()
#     model.add(Dense(128, input_dim=INPUT_DIM, activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(BatchNormalization())
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(BatchNormalization())
#     model.add(Dense(16, activation='relu'))
#     model.add(Dropout(0.1))
#     model.add(Flatten())
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(optimizer=keras.optimizers.Adam(lr=1e-3), 
#                   loss='binary_crossentropy', 
#                   metrics=metrics)
#     model.summary()
    
#     return model
#     #loss='binary_crossentropy'
#     #loss=custom_loss(recall_weight=0.9, spec_weight=0.1)

In [None]:
def make_model(metrics=METRICS, output_bias=initial_bias):
  """Build and compile the binary classifier.

  The network is one 256-unit ReLU layer, dropout at 0.5, and a single sigmoid
  output unit. It is compiled with Adam (learning rate 1e-3) and binary
  cross-entropy.

  Args:
    metrics: metrics passed to ``compile``.
    output_bias: initial value for the output layer's bias; ``None`` passes no
      constant initializer to the output layer.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  bias_init = None
  if output_bias is not None:
    bias_init = tf.keras.initializers.Constant(output_bias)

  layers = [
      keras.layers.Dense(256, activation='relu', input_dim=INPUT_DIM),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(1, activation='sigmoid', bias_initializer=bias_init),
  ]
  model = keras.Sequential(layers)

  optimizer = keras.optimizers.Adam(learning_rate=1e-3)
  loss_fn = keras.losses.BinaryCrossentropy()
  model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

  return model

In [None]:

# weighted_model.load_weights(initial_weights)
# Wrap the model builder in a scikit-learn style classifier. The keyword is
# `epochs`, matching the argument fit() is called with below; `nb_epoch` is the
# legacy spelling.
KerasClassifier_model = KerasClassifier(build_fn=make_model, epochs=EPOCHS, batch_size=BATCH_SIZE)
KerasClassifier_model._estimator_type = "classifier"

# zero_model = KerasClassifier_model
weighted_model = KerasClassifier_model
# weighted_model.load_weights(initial_weights)
# zero_model = KerasClassifier_model
# zero_model = make_model()
# Train with per-class weights and early stopping, validating on the hold-out split.
weighted_history = weighted_model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks = [early_stopping],
    validation_data=(valid_x, valid_y),
    class_weight=class_weight)

# Unweighted baseline, kept for reference:
# # bias_model = KerasClassifier_model
# no_weighted_model = make_model()
# # weighted_model.load_weights(initial_weights)
# # bias_model.layers[-1].bias.assign([0.0])
# no_weighted_history = no_weighted_model.fit(
#     train_x,
#     train_y,
#     batch_size=BATCH_SIZE,
#     epochs=EPOCHS,
#     # callbacks = [early_stopping],
#     validation_data=(valid_x, valid_y))
#     # class_weight=class_weight)

# Score the fitted classifier on both splits.
scores_train = weighted_model.score(train_x, train_y, verbose=0)
scores_valid = weighted_model.score(valid_x, valid_y, verbose=0)

print('scores_train: {:.2f}'.format(scores_train))
print('scores_valid: {:.2f}'.format(scores_valid))

In [None]:
# Re-compute and print the classifier's score on the training and validation splits.
scores_train = weighted_model.score(train_x, train_y, verbose=0)
scores_valid = weighted_model.score(valid_x, valid_y, verbose=0)

print(f'scores_train: {scores_train:.2f}')
print(f'scores_valid: {scores_valid:.2f}')

In [None]:
def plot_loss(history, label, n):
  """Draw training and validation loss on a log-scaled y axis.

  Args:
    history: object exposing ``epoch`` and a ``history`` mapping that holds
      ``'loss'`` and ``'val_loss'``.
    label: suffix appended to the 'Train ' / 'Val ' legend entries.
    n: index into the global ``colors`` list; both curves share that colour.
  """
  line_color = colors[n]
  # Log scale keeps a wide range of loss values readable.
  plt.semilogy(history.epoch, history.history['loss'],
               label='Train ' + label, color=line_color)
  plt.semilogy(history.epoch, history.history['val_loss'],
               label='Val ' + label, color=line_color, linestyle="--")
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()

In [None]:
# NOTE(review): plot_metrics is also defined in an earlier cell; this later
# definition is the one in effect once the notebook has run top to bottom.
# Consider keeping only one of the two.
def plot_metrics(history, metrics=('loss', 'prc', 'precision', 'recall')):
  """Plot the training (and, when logged, validation) curve of each metric.

  Curves are laid out on a two-column grid of subplots, one subplot per metric.

  Args:
    history: object exposing ``epoch`` (x values) and ``history`` (mapping of
      metric name to per-epoch values) — presumably a Keras ``History``.
    metrics: names of the metrics to plot. Defaults to the four metrics this
      notebook has always shown, which yields the original 2x2 grid.
  """
  n_rows = max(1, (len(metrics) + 1) // 2)
  for n, metric in enumerate(metrics):
    name = metric.replace("_", " ").capitalize()
    plt.subplot(n_rows, 2, n + 1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    # The validation curve only exists when validation data was supplied.
    val_key = 'val_' + metric
    if val_key in history.history:
      plt.plot(history.epoch, history.history[val_key],
               color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      # Keep the automatic upper bound but anchor the axis at zero.
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      plt.ylim([0.8, 1])
    else:
      plt.ylim([0, 1])

    plt.legend()

In [None]:
# plot_loss(no_weighted_history, "No Weighted", 1)
# plot_loss(weighted_history, "Weighted", 0)


In [None]:
# `zero_history` is never assigned in this notebook (only commented-out code
# mentions a zero model); the history produced by the training cell is
# `weighted_history`.
plot_metrics(weighted_history)

In [None]:
# Predict on the training split, the validation split and the complete test
# rows; the train and test predictions are also wrapped in DataFrames.
pred_train_x = pd.DataFrame(weighted_model.predict(train_x))
pred_valid_x = weighted_model.predict(valid_x)
pred_test = weighted_model.predict(test_ds_valid)
pred_test_pd = pd.DataFrame(pred_test)

In [None]:
# Wrap the training predictions in a DataFrame, then display how often each
# predicted value occurs among the test predictions.
pred_train_x = pd.DataFrame(data=pred_train_x)
# train_y.value_counts()
# pred_test_pd.value_counts()
pred_test_pd.value_counts()

In [None]:
# submission = pd.read_csv('/content/gdrive/MyDrive/data/submission.csv')

In [None]:
# F-beta score (default beta) on the validation split.
score = F_beta_score(y_true=valid_y, y_pred=pred_valid_x)
print(score)

In [None]:
# F-beta score (default beta) on the training split.
score = F_beta_score(y_true=train_y, y_pred=pred_train_x)
print(score)

In [None]:
# pred_test_pd.to_csv('/content/gdrive/MyDrive/data/submission_3.csv')  

In [None]:
# #2-class Precision-Recall curve:train
# disp = plot_precision_recall_curve(weighted_model, train_x, train_y)
# disp.ax_.set_title('2-class Precision-Recall curve: ')

In [None]:
# make_model().summary()

In [None]:
def plot_confusion_matrix(classes, y_true, y_pred, title, save_path, save=False):
    """Draw an annotated confusion matrix with the F-beta score as a caption.

    Args:
        classes: tick labels, one per class, used on both axes.
        y_true: ground-truth labels, passed to ``confusion_matrix``.
        y_pred: predicted labels, passed to ``confusion_matrix``.
        title: figure title; also the stem of the saved file name.
        save_path: directory the PNG is written to when ``save`` is true.
        save: when true, write ``<save_path>/<title>.png`` before showing.
    """
    cmap = plt.cm.Blues
    cm = confusion_matrix(y_true, y_pred)
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Cells above the mid-point of the value range get white text for contrast.
    thresh = (cm.max() + cm.min()) / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=30)

    text = f'F beta score: {F_beta_score(y_true, y_pred):.4f}'
    # Leave room under the axes for the score caption.
    plt.gcf().subplots_adjust(bottom=0.2)
    plt.figtext(0.5, 0.1, text, ha='center', fontsize=30)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    if save:
        # Join with the platform separator so save_path works with or without
        # a trailing slash (plain concatenation produced e.g. 'outTitle.png').
        plt.savefig(os.path.join(save_path, f'{title}.png'), transparent=True)
    plt.show()


In [None]:
# save_path = './'
# title = 'Neural Network'
# plot_confusion_matrix([0,1], valid_y, pred_valid_x, title=title, save_path=save_path, save=True)