In [43]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pickle
import pprint
# 사용할 분류기
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix as cm, classification_report as cr

In [26]:
# Load the already-preprocessed feature arrays, one .npy file per split.
x_train, x_val, x_test = (
    np.load(npy_path)
    for npy_path in ("train_benford.npy", "val_benford.npy", "test_benford.npy")
)

바로 테스트에 적용할 계획이기 때문에, 각 행을 합이 1이 되도록 정규화한 뒤 train과 val 데이터를 합쳐준다.

In [27]:
# Scale every row so that it sums to 1; keepdims keeps the row sums as a
# column so the division broadcasts across each row.
x_train, x_val, x_test = (
    split / split.sum(axis=1, keepdims=True)
    for split in (x_train, x_val, x_test)
)

# Stack the validation rows underneath the training rows, then drop the
# separate validation array since it is not used again.
x_train = np.concatenate((x_train, x_val), axis=0)
del x_val

In [28]:
# Read the `label` column of each split's CSV as a NumPy array.
y_train, y_val, y_test = (
    pd.read_csv(csv_path).label.values
    for csv_path in (
        "../imageCSV/train_truncated.csv",
        "../imageCSV/val_truncated.csv",
        "../imageCSV/test_truncated.csv",
    )
)

In [29]:
# Append the validation labels after the training labels, in the same order
# in which the feature rows were concatenated into x_train.
# NOTE: y_train is rebuilt from itself, so running this cell a second time
# appends y_val again; re-run the label-loading cell first.
y_train = np.concatenate([y_train, y_val])


# 그래프 관련

In [30]:
# Display the style-sheet names that can be passed to plt.style.use.
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 '_mpl-gallery',
 '_mpl-gallery-nogrid',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-v0_8',
 'seaborn-v0_8-bright',
 'seaborn-v0_8-colorblind',
 'seaborn-v0_8-dark',
 'seaborn-v0_8-dark-palette',
 'seaborn-v0_8-darkgrid',
 'seaborn-v0_8-deep',
 'seaborn-v0_8-muted',
 'seaborn-v0_8-notebook',
 'seaborn-v0_8-paper',
 'seaborn-v0_8-pastel',
 'seaborn-v0_8-poster',
 'seaborn-v0_8-talk',
 'seaborn-v0_8-ticks',
 'seaborn-v0_8-white',
 'seaborn-v0_8-whitegrid',
 'tableau-colorblind10']

In [None]:
# Switch matplotlib to the "bmh" style sheet.
plt.style.use("bmh")

In [None]:
# First digits 1..9 and the probability Benford's law assigns to each.
x = np.arange(1, 10)
benford = np.log10(1 + 1 / x)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), sharex=True, sharey=True)
ax1.set_ylabel("prob")

# Label 0 goes on the "real" panel, label 1 on the "fake" panel.
for ax, title, label in ((ax1, 'real', 0), (ax2, 'fake', 1)):
    ax.set_title(title)
    ax.set_xlabel("First Digit")
    ax.set_xticks(x)
    ax.bar(x, benford, color='blue')
    # Walk each class on its own. Zipping the two class subsets together stops
    # at the smaller class and silently leaves the surplus samples of the
    # larger class unplotted.
    for row in x_train[y_train == label]:
        ax.plot(x, row)

In [None]:
# Column-wise mean of the training rows for label 0 (real) and label 1 (fake).
real_mean, fake_mean = (
    x_train[y_train == label].mean(axis=0) for label in (0, 1)
)

In [None]:
# Benford's-law reference distribution over the first digits 1..9.
x = np.arange(1, 10)
benford = np.log10(1 + 1 / x)

# Work on the current axes: Benford bars with the two class means on top.
ax = plt.gca()
ax.set_ylabel("prob")
ax.set_xlabel("First Digit")
ax.set_xticks(x)
ax.bar(x, benford, width=0.5)
ax.plot(x, real_mean, color='green', alpha=1, label="real")
ax.plot(x, fake_mean, color='red', alpha=0.7, label="fake")
ax.legend()


In [None]:
# Per-digit gap between each class mean and the Benford reference, drawn on
# the current axes. `x` and `benford` come from the preceding plotting cell.
ax = plt.gca()
ax.plot(x, real_mean - benford, label="real")
ax.plot(x, fake_mean - benford, label="fake")
ax.set_xticks(x)
ax.legend()
ax.set_xlabel("First Digit")
ax.set_ylabel("prob diff")

# 코드 관련

In [31]:
# Random forest that logs progress (verbose=1) and uses every CPU core
# (n_jobs=-1). random_state is pinned, as it already is for the SVC, so that
# re-running the notebook builds the same forest instead of a fresh random one.
rf = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)

In [32]:
# Train the random forest on the merged train+val features and labels.
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.5s finished


In [33]:
# Predict a label for every row of the test features with the random forest.
y_pred = rf.predict(x_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [34]:
# Random forest on the test split: confusion matrix first, then the
# classification report, each on its own line.
print(
    cm(y_true=y_test, y_pred=y_pred),
    cr(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

[[10388  2936]
 [ 2384 10692]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.80     13324
           1       0.78      0.82      0.80     13076

    accuracy                           0.80     26400
   macro avg       0.80      0.80      0.80     26400
weighted avg       0.80      0.80      0.80     26400



In [35]:
# Support-vector classifier with a fixed seed; verbose=1 prints the solver
# log while fitting.
svc = SVC(verbose=1, random_state=0)

In [36]:
# Train the SVC on the merged train+val features and labels.
svc.fit(x_train, y_train)

[LibSVM].............................................
*.
*
optimization finished, #iter = 45665
obj = -76643.977805, rho = -16.037296
nSV = 78869, nBSV = 78849
Total nSV = 78869


In [37]:
# Predict a label for every row of the test features with the SVC.
svc_pred = svc.predict(x_test)


In [38]:
# SVC on the test split: confusion matrix first, then the classification
# report, each on its own line.
print(
    cm(y_true=y_test, y_pred=svc_pred),
    cr(y_true=y_test, y_pred=svc_pred),
    sep="\n",
)

[[ 7368  5956]
 [ 2638 10438]]
              precision    recall  f1-score   support

           0       0.74      0.55      0.63     13324
           1       0.64      0.80      0.71     13076

    accuracy                           0.67     26400
   macro avg       0.69      0.68      0.67     26400
weighted avg       0.69      0.67      0.67     26400



In [39]:
# Multi-layer perceptron with three hidden layers (100, 50 and 20 units).
# early_stopping=True holds out part of the training data and reports a
# validation score each iteration. random_state is pinned, as it already is
# for the SVC, so that weight initialisation and the held-out split are the
# same on every run.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50, 20),
    verbose=1,
    early_stopping=True,
    random_state=0,
)

In [40]:
# Train the MLP on the merged train+val features and labels.
mlp.fit(x_train, y_train)

Iteration 1, loss = 0.67413262
Validation score: 0.626231
Iteration 2, loss = 0.61110739
Validation score: 0.670076
Iteration 3, loss = 0.60362397
Validation score: 0.677367
Iteration 4, loss = 0.59936300
Validation score: 0.672538
Iteration 5, loss = 0.59171279
Validation score: 0.678788
Iteration 6, loss = 0.58031650
Validation score: 0.703504
Iteration 7, loss = 0.56180173
Validation score: 0.729830
Iteration 8, loss = 0.54348488
Validation score: 0.720549
Iteration 9, loss = 0.52680480
Validation score: 0.757955
Iteration 10, loss = 0.50856120
Validation score: 0.774242
Iteration 11, loss = 0.49418183
Validation score: 0.782765
Iteration 12, loss = 0.48015607
Validation score: 0.777936
Iteration 13, loss = 0.47266499
Validation score: 0.761553
Iteration 14, loss = 0.46592804
Validation score: 0.787689
Iteration 15, loss = 0.46029722
Validation score: 0.794697
Iteration 16, loss = 0.45761071
Validation score: 0.792140
Iteration 17, loss = 0.45562318
Validation score: 0.779735
Iterat

In [41]:
# Predict a label for every row of the test features with the MLP.
mlp_pred = mlp.predict(x_test)

In [42]:
# MLP on the test split: confusion matrix first, then the classification
# report, each on its own line.
print(
    cm(y_true=y_test, y_pred=mlp_pred),
    cr(y_true=y_test, y_pred=mlp_pred),
    sep="\n",
)

[[10160  3164]
 [ 2097 10979]]
              precision    recall  f1-score   support

           0       0.83      0.76      0.79     13324
           1       0.78      0.84      0.81     13076

    accuracy                           0.80     26400
   macro avg       0.80      0.80      0.80     26400
weighted avg       0.80      0.80      0.80     26400



In [44]:
# Pickle each fitted classifier to its own file. The `with` blocks close every
# file handle as soon as its dump finishes (or fails), so nothing is left open
# or unflushed in the long-lived kernel process.
with open("./models/rf.sav", "wb") as model_file:
    pickle.dump(rf, model_file)
with open("./models/svc.sav", "wb") as model_file:
    pickle.dump(svc, model_file)
with open("./models/mlp.sav", "wb") as model_file:
    pickle.dump(mlp, model_file)

In [45]:
# Remove the three estimators from the namespace; they were pickled to
# ./models/ in the previous cell.
del rf, svc, mlp

In [47]:
# To read the models back in, load each one from its own file. Every name must
# point at its matching .sav file; loading rf.sav three times would make `svc`
# and `mlp` silently hold the random forest.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook itself wrote.
with open("./models/rf.sav", "rb") as model_file:
    rf = pickle.load(model_file)
with open("./models/svc.sav", "rb") as model_file:
    svc = pickle.load(model_file)
with open("./models/mlp.sav", "rb") as model_file:
    mlp = pickle.load(model_file)

In [None]:
# TODO: obtain the StyleGAN-XL test data, evaluate on it, and re-check the confusion matrix.