In [1]:
import time
import numpy
import torch
import pandas
import torch.nn
import statistics
import torch.utils
import sklearn.metrics
import matplotlib.pyplot
import sklearn.model_selection

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
%matplotlib inline

In [5]:
# set plot style
matplotlib.pyplot.style.use('ggplot')

In [6]:
PATH = '/home/andreas/Documents/data_mining/project/onion/input/'

In [7]:
cols = pandas.read_csv(PATH + 'cols.csv', encoding='utf-8')

In [8]:
X = pandas.read_csv(PATH + 'preprocessed_onion_data.csv',
                    dtype=numpy.float16, usecols=cols.tokens, nrows=4000)

In [9]:
del cols

In [10]:
Y = pandas.read_csv(PATH + 'preprocessed_onion_data.csv', dtype=numpy.float16,
                    usecols=['label'], nrows=4000)

In [11]:
Y = pandas.concat([Y, abs(Y - 1)], axis=1).astype(numpy.int8)

In [12]:
Y.columns = ['valid', 'fake']

In [13]:
x_fit, x_test, y_fit, y_test = \
    sklearn.model_selection.train_test_split(X, Y,
                                             test_size=0.25, random_state=42)

In [14]:
x_train, x_val, y_train, y_val = \
    sklearn.model_selection.train_test_split(x_fit,
                                             y_fit,
                                             test_size=0.10,
                                             random_state=42)

In [15]:
del x_fit

In [16]:
del y_fit

In [17]:
# define an identity matrix X with dimentions 943x943
x_train = torch.from_numpy(x_train.to_numpy()).float()
# convert target values to pytorch tensor
y_train = torch.from_numpy(y_train.to_numpy()).float()

In [18]:
# build a pytorch dataset instance from X and Y variables
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)

In [19]:
# define an identity matrix X with dimentions 943x943
x_val = torch.from_numpy(x_val.to_numpy()).float()
# convert target values to pytorch tensor
y_val = torch.from_numpy(y_val.to_numpy()).float()

In [20]:
# build a pytorch dataset instance from X and Y variables
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)

In [21]:
# define an identity matrix X with dimentions 943x943
x_test = torch.from_numpy(x_test.to_numpy()).float()
# convert target values to pytorch tensor
y_test = torch.from_numpy(y_test.to_numpy()).float()

In [22]:
# build a pytorch dataset instance from X and Y variables
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

In [23]:
train_loader = torch.utils.data.DataLoader(train_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

In [24]:
# define an identity matrix X with dimentions 943x943
X = torch.from_numpy(X.to_numpy()).float()
# convert target values to pytorch tensor
Y = torch.from_numpy(Y.to_numpy()).float()

In [25]:
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 2),
    torch.nn.Softmax(dim=1)
)

In [26]:
criterion = torch.nn.BCELoss()

In [27]:
learning_rate = 1e-5

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [29]:
def get_device():
    device = None
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    return device

In [30]:
device = get_device()

In [31]:
model.to(device)

Sequential(
  (0): Linear(in_features=22125, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=2, bias=True)
  (5): Softmax(dim=1)
)

In [32]:
early_stopping = False
# initialize early stopping prevention limit
prevent = 5
# initialize early stopping prevention limit
consecutive = False
# initialize early stopping message
message = ' '
# initialize epoch counter
epoch = 0
# number of epochs to train the model
epochs = 50
# initialize variables
prev_mean_valid_loss = numpy.Inf
start = 0
# initialize error lists
train_loss = []
valid_loss = []
history = []

In [33]:
print('Time: ', start, ' (in seconds)')
while not early_stopping and epoch < epochs:
    if epoch == 0:
        start = time.time()

    # prep model for training
    model.train()
    for x_train, y_train in train_loader:
        # forward pass
        y_hat = model(x_train.to(device))
        # calculate the loss
        loss = criterion(y_hat, y_train.to(device))
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # backward pass
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss.append(loss.item())
    # shut down autograd to begin evaluation
    with torch.no_grad():
        # prep model for evaluation
        model.eval()
        for x_val, y_val in val_loader:
            # forward pass
            y_hat = model(x_val.to(device))
            # calculate the loss
            loss = criterion(y_hat, y_val.to(device))
            # update running validation loss
            valid_loss.append(loss.item())
    # early stopping conditional
    if prev_mean_valid_loss <= statistics.mean(valid_loss):
        if consecutive is True:
            prevent -= 1
        consecutive = True
        if prevent < 0:
            early_stopping = True
            message = '\tPrevious average Validation error was lower than\
                current Validation error'
    else:
        consecutive = False

    # print results after 2 epochs
    if epoch % 2 == 1:
        end = time.time()
        print('Epoch: ', epoch+1, '\t Time: +', end-start, '\t Training\
        loss: ', statistics.mean(train_loss), '\t Validation loss: ',
              statistics.mean(valid_loss))
        start = time.time()

    # update epoch's validation loss variable
    prev_mean_valid_loss = statistics.mean(valid_loss)

    # early stopping message
    if early_stopping is True:
        print('\t\tStopping at epoch: ', epoch + 1, message)
        epoch = epochs - 1
    epoch += 1

Time:  0  (in seconds)
Epoch:  2 	 Time: + 56.74877643585205 	 Training        loss:  0.11211478857422696 	 Validation loss:  0.002653617662920927
Epoch:  4 	 Time: + 56.484904050827026 	 Training        loss:  0.05815931294974015 	 Validation loss:  0.001480394856765391
Epoch:  6 	 Time: + 56.62317180633545 	 Training        loss:  0.039993281720218044 	 Validation loss:  0.0010430907078385744
Epoch:  8 	 Time: + 56.46927881240845 	 Training        loss:  0.030749054554896595 	 Validation loss:  0.0008176516341250739
Epoch:  10 	 Time: + 56.40941905975342 	 Training        loss:  0.025079124377326147 	 Validation loss:  0.000683869180622717
Epoch:  12 	 Time: + 56.47921109199524 	 Training        loss:  0.02121051968785101 	 Validation loss:  0.0005970387799230911
Epoch:  14 	 Time: + 56.472445487976074 	 Training        loss:  0.018386184100846012 	 Validation loss:  0.0005346794787915464
Epoch:  16 	 Time: + 56.31591796875 	 Training        loss:  0.016228892837416593 	 Validation l

In [34]:
# define test error list
test_loss = []
# initialize timer
start = time.time()
# test model
model.eval()
with torch.no_grad():
    for x_test, y_test in test_loader:
        yhat = model(x_test.to(device))
        loss = criterion(yhat, y_test.to(device))
        test_loss.append(loss.item())
# end time checkpoint
end = time.time()
# print test results
print('\tTime: {:.10} \tTest Loss: {:.15f}'.format(end-start,
                                                   statistics.mean(test_loss)))

	Time: 0.5658450127 	Test Loss: 0.000000861466274


In [35]:
# compute mae and rmse metrics
mse_loss = torch.nn.MSELoss(reduction='mean')
mae_loss = torch.nn.L1Loss(reduction='mean')
target = model(X.to(device))
output_rmse = torch.sqrt(mse_loss(Y.to(device), target))
output_mae = mae_loss(Y.to(device), target)
print("RMSE Loss\n", output_rmse.item())
print("MAE Loss\n", output_mae.item())

RMSE Loss
 0.013694597408175468
MAE Loss
 0.00037543996586464345


In [36]:
# check model's prediction on the whole dataset
prediction = None
with torch.no_grad():
    prediction = model(X.to(device)).cpu().detach()

In [37]:
final_evaluation = []
column_list = []
y_hat = []
y_real = []

In [38]:
for i in range(len(Y)):
    final_evaluation.append(Y[i].numpy().max() - prediction[i].numpy().max())
    column_list.append(numpy.argmax(Y[i].numpy()) ==
                       numpy.argmax(prediction[i].numpy()))
    y_real.append(Y[i].numpy().max())
    y_hat.append(prediction[i].numpy().max())

In [39]:
evaluation_df = pandas.DataFrame(list(zip(final_evaluation, column_list,
                                          y_real, y_hat)),
                                 columns=['loss_dif', 'column',
                                          'y_real', 'y_hat'])

In [40]:
# evaluation_df.to_csv('evaluation.csv')

In [41]:
evaluation_df.column.value_counts()

True     3998
False       2
Name: column, dtype: int64

In [42]:
evaluation_df.loss_dif.mean()

-0.0003794786036014557

In [43]:
Y[0]

tensor([0., 1.])

In [44]:
prediction[0]

tensor([8.1007e-12, 1.0000e+00])

In [45]:
errors = evaluation_df.loc[evaluation_df['column'] == False]

In [46]:
errors

Unnamed: 0,loss_dif,column,y_real,y_hat
1737,-0.505721,False,0.0,0.505721
2704,-0.511287,False,0.0,0.511287


In [60]:
print(sklearn.metrics.classification_report(evaluation_df.y_real.to_numpy().astype(numpy.float16), evaluation_df.y_hat.to_numpy().astype(numpy.int8)))

              precision    recall  f1-score   support

         0.0       0.02      1.00      0.04         3
         1.0       1.00      0.96      0.98      3997

    accuracy                           0.96      4000
   macro avg       0.51      0.98      0.51      4000
weighted avg       1.00      0.96      0.98      4000

