In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns
# Global plotting theme and NumPy's global RNG seed for the notebook.
SEED = 203
sns.set_theme(style="whitegrid")
np.random.seed(SEED)

# Dataset Preparation

## load data

In [None]:
# Load the full dataset and drop the two leftover index columns from earlier CSV exports.
raw_df = pd.read_csv('e:/kma/data/TRAIN_nottree_with_파생변수0802.csv', encoding='utf8')
raw_df = raw_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# Parse the date column: cast to string first, then to datetime.
raw_df['yyyymmdd'] = pd.to_datetime(raw_df['yyyymmdd'].astype('str'))

# Distinct region values, and every column except identifiers and the raw target.
add_list = set(raw_df['add'])
indep_cols = raw_df.columns.difference(['yyyymmdd', 'add', 'sex', 'frequency'])

# Restrict the analysis to a single region / sex combination.
area = '세종'
sex = 1
condition = (raw_df['add'] == area) & (raw_df['sex'] == sex)

# Independent, re-indexed (0..n-1) copy of the selected rows.
tmp = raw_df.loc[condition].copy().reset_index(drop=True)

## EDA

In [None]:
import seaborn as sns

# Switch to the dark grid theme and plot how often each frequency value occurs.
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=tmp, x="frequency")

# Share of rows per frequency value; the last expression is the cell output.
frq_size = tmp.groupby(['frequency']).size()
frq_total = frq_size.sum()
frq_ratio = frq_size.div(frq_total)
frq_ratio

## 0, 1로 class 변경

In [None]:
# Binary target: 0 = normal (frequency equals 0), 1 = anomaly (any other value).
# Vectorised equivalent of the former row-wise `x in range(0, 1)` check.
tmp['Class'] = (tmp['frequency'] != 0).astype('int64')

# Keep only the feature columns plus the new target.
tmp = tmp.drop(columns=['yyyymmdd', 'sex', 'add', 'frequency'])

# Class balance. This must be the last expression of the cell to be displayed;
# it used to sit mid-cell, where its result was silently discarded.
tmp['Class'].value_counts()

## train, test 분리

In [None]:
# Positional hold-out: the last VAL_ROWS rows form the validation set and all
# earlier rows the training set.
# NOTE(review): presumably rows are in date order, one per day - confirm.
VAL_ROWS = 365
train_df, val_df = tmp.iloc[:-VAL_ROWS], tmp.iloc[-VAL_ROWS:]

In [None]:
# Split by class, then draw a shuffled random subset for the t-SNE plot.
non_fraud = tmp[tmp['Class'] == 0]
fraud = tmp[tmp['Class'] == 1]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
labelled = pd.concat([non_fraud, fraud])

# Cap the sample size so the cell does not raise when fewer rows are available.
TSNE_SAMPLE_SIZE = 180
df = (
    labelled
    .sample(n=min(TSNE_SAMPLE_SIZE, len(labelled)))
    .reset_index(drop=True)
)

# Feature matrix and label vector of the sampled rows.
X = df.drop(columns=['Class']).values
Y = df['Class'].values

## T-SNE

In [None]:
# Make matplotlib use a font that can render the Korean text used in the plots.
from matplotlib import font_manager, rc

font_path = "C:/Windows/Fonts/NGULIM.TTF"
font_props = font_manager.FontProperties(fname=font_path)
font = font_props.get_name()  # family name read from the font file
rc('font', **{'family': font})

def tsne_plot(x1, y1, name="graph.png"):
    """Embed samples in 2-D with t-SNE and draw a scatter plot coloured by class.

    Parameters
    ----------
    x1 : array-like of shape (n_samples, n_features)
        Feature matrix passed to ``TSNE.fit_transform``.
    y1 : array-like of shape (n_samples,)
        Class labels aligned row-by-row with ``x1``. Rows labelled 0 are drawn
        in green, rows labelled 1 in red; other values are not plotted.
    name : str, default "graph.png"
        Path the figure is written to before it is shown.
    """
    # Use the labels that were passed in. The previous version read the global
    # ``Y`` here, so every call with other data was coloured by unrelated labels.
    labels = np.asarray(y1)

    tsne = TSNE(n_components=2, random_state=0, learning_rate='auto')
    X_t = tsne.fit_transform(x1)

    plt.figure(figsize=(12, 8), facecolor='white')
    for class_value, colour, legend_text in ((0, 'g', '정상'), (1, 'r', '이상치')):
        mask = labels == class_value
        plt.scatter(X_t[mask, 0], X_t[mask, 1], marker='o', color=colour,
                    linewidth=1, alpha=0.8, label=legend_text)

    plt.legend(loc='best')
    # Save before show(): showing may release the figure, leaving nothing to save.
    plt.savefig(name)
    plt.show()
    
# t-SNE view of the sampled raw features, saved as ./tsne.png.
tsne_plot(x1=X, y1=Y, name="./tsne.png")

# AutoEncoders to the rescue

In [25]:
# Training features / labels.
train_x = train_df.drop(["Class"], axis=1)
train_y = train_df["Class"].values

# Fit the scaler on the training features ONLY and reuse it for validation.
# Fitting a second scaler on the validation set (as before) standardises the two
# sets with different means/variances, so validation inputs no longer live in
# the space the model was trained on.
scaler = preprocessing.StandardScaler().fit(train_x.values)
x_scale = scaler.transform(train_x.values)
x_norm, x_fraud = x_scale[train_y == 0], x_scale[train_y == 1]

# Validation features / labels, transformed with the training statistics.
val_x = val_df.drop(["Class"], axis=1)
val_y = val_df["Class"].values

val_x_scale = scaler.transform(val_x.values)
val_x_norm, val_x_fraud = val_x_scale[val_y == 0], val_x_scale[val_y == 1]

In [26]:
## input layer
# Take the width from the scaled training matrix the model is actually fitted on,
# not from the unrelated t-SNE sample `X`.
n_features = x_norm.shape[1]
input_layer = Input(shape=(n_features,))

## encoding part
# L1 activity regularisation on the first layer; note 10e-5 == 1e-4.
encoded = Dense(256, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(128, activation='tanh')(encoded)
encoded = Dense(100, activation='tanh')(encoded)
encoded = Dense(64, activation='tanh')(encoded)

## decoding part
decoded = Dense(64, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)
decoded = Dense(128, activation='tanh')(decoded)
decoded = Dense(256, activation='tanh')(decoded)

## output layer
# Linear output: the reconstruction targets are StandardScaler outputs, which
# contain negative values that a ReLU output could never reproduce.
output_layer = Dense(n_features, activation='linear')(decoded)

In [27]:
# Wire input to output as one model and train it to minimise the mean squared
# reconstruction error with Adam.
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(loss="mse", optimizer="adam")

In [28]:
# Import the callback from the public Keras namespace. `tensorflow.python.*` is a
# private module path, and its callback classes are not guaranteed to match the
# public `tensorflow.keras` Model used here.
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 100 epochs and roll back to the best
# epoch. Without restore_best_weights the model would keep the weights from up
# to 100 epochs after the optimum, while the notebook reports the minimum
# val_loss.
early_stop = EarlyStopping(monitor='val_loss', patience=100,
                           restore_best_weights=True)

# Train on normal (Class == 0) samples only: input and target are the same matrix.
history = autoencoder.fit(
    x_norm, x_norm,
    batch_size=256,
    epochs=1000,
    shuffle=True,
    validation_data=(val_x_norm, val_x_norm),
    verbose=0,
    callbacks=[early_stop],
)

In [None]:
# Training vs. validation reconstruction loss per epoch.
loss_fig, loss_ax = plt.subplots()
for curve_key in ('loss', 'val_loss'):
    loss_ax.plot(history.history[curve_key])
loss_ax.set_title('Model mse')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('mse')
loss_ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Lowest validation loss (MSE) recorded over all training epochs.
min(history.history['val_loss'])

## Visualize the latent representations

In [None]:
# Encoder-only model: reuse the first five layers of the trained autoencoder
# (indices 0-4) so that predict() returns the output of the last encoding layer.
hidden_representation = Sequential()
for encoder_layer in autoencoder.layers[:5]:
    hidden_representation.add(encoder_layer)

In [None]:
# Encode the scaled training rows of each class separately.
train_norm_rep = hidden_representation.predict(x_norm)
train_fraud_rep = hidden_representation.predict(x_fraud)

# Stack normal rows first, then anomalies, and build the matching 0/1 labels.
train_rep_x = np.concatenate([train_norm_rep, train_fraud_rep], axis=0)
train_rep_y = np.concatenate([
    np.zeros(train_norm_rep.shape[0]),
    np.ones(train_fraud_rep.shape[0]),
])

# Save under its own name: the validation plot is written to
# ./latent_representation.png later and used to overwrite this figure.
tsne_plot(train_rep_x, train_rep_y, "./latent_representation_train.png")

In [None]:
# Encode the scaled validation rows, normal class first, then anomalies.
norm_hid_rep, fraud_hid_rep = [
    hidden_representation.predict(val_part)
    for val_part in (val_x_norm, val_x_fraud)
]

In [None]:
# Stack the validation encodings (normal rows first) with matching 0/1 labels,
# then plot them with t-SNE.
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(fraud_hid_rep))
rep_x = np.concatenate((norm_hid_rep, fraud_hid_rep), axis=0)
rep_y = np.concatenate((y_n, y_f))
tsne_plot(x1=rep_x, y1=rep_y, name="./latent_representation.png")

In [None]:
from IPython.display import display, Image, HTML

# Side-by-side view of the two saved t-SNE images (raw features vs. latent code).
comparison_html = """<table align="center">
<tr ><td><b>Actual Representation (Before) </b></td><td><b>Latent Representation (Actual)</b></td></tr>
<tr><td><img src='./tsne.png'></td><td>
<img src='./latent_representation.png'></td></tr></table>"""
display(HTML(comparison_html))

# Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training latent codes and score it
# on the validation latent codes.
clf = LogisticRegression(solver="lbfgs")
clf.fit(train_rep_x, train_rep_y)
pred_y = clf.predict(rep_x)

report_text = classification_report(rep_y, pred_y)
accuracy = accuracy_score(rep_y, pred_y)

print("")
print("Classification Report: ")
print(report_text)

print("")
print("Accuracy Score: ", accuracy)