In [1]:
import sys
import time
import nltk
import numpy
import pandas
import torch.nn
import statistics
import torch.utils
import sklearn.metrics
import sklearn.model_selection
import sklearn.feature_extraction.text

In [2]:
# input_dataframe = pandas.read_csv('onion-or-not.csv', encoding='utf-8', nrows=500)
input_dataframe = pandas.read_csv('onion-or-not.csv', encoding='utf-8')

In [3]:
tokenized_vector = dict()

In [4]:
for i in input_dataframe.index:
    tokenized_vector[i] = nltk.word_tokenize(input_dataframe.loc[i][0])

In [5]:
stemmer = nltk.PorterStemmer()

In [6]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [7]:
for tokens in tokenized_vector:
    for counter, token in enumerate(tokenized_vector[tokens]):
        if stemmer.stem(tokenized_vector[tokens][counter]) not in stopwords:
            tokenized_vector[tokens][counter] = stemmer.stem(tokenized_vector[tokens][counter])
        else:
            tokenized_vector[tokens].remove(tokenized_vector[tokens][counter])

In [8]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()

In [9]:
preprocessed_tokenized_vector = list()

In [10]:
for i in tokenized_vector:
    preprocessed_tokenized_vector.append(' '.join(tokenized_vector[i]))

In [11]:
X = vectorizer.fit_transform(preprocessed_tokenized_vector)

In [12]:
tf_idf_df = pandas.DataFrame(X.todense(), columns=vectorizer.get_feature_names(), dtype=numpy.float16)

In [13]:
preprocessed_data = pandas.concat([tf_idf_df, input_dataframe.iloc[:, 1:]], axis=1, sort=False)

In [14]:
print('Total size of dataframe: ', round(sys.getsizeof(preprocessed_data) / 2**20, 2), 'MB')

Total size of dataframe:  1012.99 MB


In [15]:
del X

In [16]:
del tf_idf_df

In [17]:
del preprocessed_tokenized_vector

In [18]:
del tokenized_vector

In [19]:
del input_dataframe

In [20]:
PATH = '/home/andreas/Documents/data_mining/project/onion/input/'

In [21]:
cols = pandas.DataFrame(preprocessed_data.columns[:-1].tolist(), columns=['tokens'])

In [22]:
X = preprocessed_data.iloc[:, :-1]

In [23]:
Y = pandas.concat([preprocessed_data.iloc[:, -1], abs(preprocessed_data.iloc[:, -1] - 1)], axis=1).astype(numpy.int8)

In [24]:
Y.columns = ['valid', 'fake']

In [25]:
del preprocessed_data

In [26]:
x_fit, x_test, y_fit, y_test = \
    sklearn.model_selection.train_test_split(X, Y,
                                             test_size=0.25, random_state=42)

In [27]:
x_train, x_val, y_train, y_val = \
    sklearn.model_selection.train_test_split(x_fit,
                                             y_fit,
                                             test_size=0.10,
                                             random_state=42)

In [28]:
del x_fit

In [29]:
del y_fit

In [30]:
# define an identity matrix X with dimentions 943x943
x_train = torch.from_numpy(x_train.to_numpy()).float()
# convert target values to pytorch tensor
y_train = torch.from_numpy(y_train.to_numpy()).float()

In [31]:
# build a pytorch dataset instance from X and Y variables
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)

In [32]:
del x_train

In [33]:
del y_train

In [34]:
# define an identity matrix X with dimentions 943x943
x_val = torch.from_numpy(x_val.to_numpy()).float()
# convert target values to pytorch tensor
y_val = torch.from_numpy(y_val.to_numpy()).float()

In [35]:
# build a pytorch dataset instance from X and Y variables
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)

In [36]:
del x_val

In [37]:
del y_val

In [38]:
# define an identity matrix X with dimentions 943x943
x_test = torch.from_numpy(x_test.to_numpy()).float()
# convert target values to pytorch tensor
y_test = torch.from_numpy(y_test.to_numpy()).float()

In [39]:
# build a pytorch dataset instance from X and Y variables
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

In [40]:
train_loader = torch.utils.data.DataLoader(train_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

In [41]:
del train_dataset

In [42]:
del val_dataset

In [43]:
del test_dataset

In [44]:
# define an identity matrix X with dimentions 943x943
X = torch.from_numpy(X.to_numpy()).float()
# convert target values to pytorch tensor
Y = torch.from_numpy(Y.to_numpy()).float()

In [45]:
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 2),
    torch.nn.Softmax(dim=1)
)

In [46]:
criterion = torch.nn.BCELoss()

In [47]:
learning_rate = 1e-5

In [48]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [49]:
def get_device():
    device = None
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    return device

In [50]:
device = get_device()

In [51]:
model.to(device)

Sequential(
  (0): Linear(in_features=22125, out_features=512, bias=True)
  (1): ReLU()
  (2): Linear(in_features=512, out_features=128, bias=True)
  (3): ReLU()
  (4): Linear(in_features=128, out_features=2, bias=True)
  (5): Softmax(dim=1)
)

In [52]:
early_stopping = False
# initialize early stopping prevention limit
prevent = 5
# initialize early stopping prevention limit
consecutive = False
# initialize early stopping message
message = ' '
# initialize epoch counter
epoch = 0
# number of epochs to train the model
epochs = 50
# initialize variables
prev_mean_valid_loss = numpy.Inf
start = 0
# initialize error lists
train_loss = []
valid_loss = []
history = []

In [53]:
print('Time: ', start, ' (in seconds)')
while not early_stopping and epoch < epochs:
    if epoch == 0:
        start = time.time()

    # prep model for training
    model.train()
    for x_train, y_train in train_loader:
        # forward pass
        y_hat = model(x_train.to(device))
        # calculate the loss
        loss = criterion(y_hat.flatten(), y_train.to(device))
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # backward pass
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss.append(loss.item())
    # shut down autograd to begin evaluation
    with torch.no_grad():
        # prep model for evaluation
        model.eval()
        for x_val, y_val in val_loader:
            # forward pass
            y_hat = model(x_val.to(device))
            # calculate the loss
            loss = criterion(y_hat.flatten(), y_val.to(device))
            # update running validation loss
            valid_loss.append(loss.item())
    # early stopping conditional
    if prev_mean_valid_loss <= statistics.mean(valid_loss):
        if consecutive is True:
            prevent -= 1
        consecutive = True
        if prevent < 0:
            early_stopping = True
            message = '\tPrevious average Validation error was lower than\
                current Validation error'
    else:
        consecutive = False

    # print results after 2 epochs
    if epoch % 2 == 1:
        end = time.time()
        print('Epoch: ', epoch+1, '\t Time: +', end-start, '\t Training\
        loss: ', statistics.mean(train_loss), '\t Validation loss: ',
              statistics.mean(valid_loss))
        start = time.time()

    # update epoch's validation loss variable
    prev_mean_valid_loss = statistics.mean(valid_loss)

    # early stopping message
    if early_stopping is True:
        print('\t\tStopping at epoch: ', epoch + 1, message)
        epoch = epochs - 1
    epoch += 1

Time:  0  (in seconds)


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch:  2 	 Time: + 181.33974385261536 	 Training        loss:  0.4350694837858706 	 Validation loss:  0.3534855001938478
Epoch:  4 	 Time: + 178.99992537498474 	 Training        loss:  0.3176764562921923 	 Validation loss:  0.32024289953204715
Epoch:  6 	 Time: + 179.9589807987213 	 Training        loss:  0.2623137922678959 	 Validation loss:  0.3144400713901235
Epoch:  8 	 Time: + 180.22914099693298 	 Training        loss:  0.22770063550444275 	 Validation loss:  0.31819723130208344
Epoch:  10 	 Time: + 180.3503177165985 	 Training        loss:  0.20316295432909334 	 Validation loss:  0.32648086663907155
Epoch:  12 	 Time: + 180.62848043441772 	 Training        loss:  0.18447468221565055 	 Validation loss:  0.33713888383546603
		Stopping at epoch:  13 	Previous average Validation error was lower than                current Validation error


In [54]:
del x_train

In [55]:
del y_train

In [56]:
del x_val

In [57]:
del y_val

In [58]:
del train_loader

In [59]:
del val_loader

In [60]:
# define test error list
test_loss = []
# initialize timer
start = time.time()
# test model
model.eval()
with torch.no_grad():
    for x, y in test_loader:
        yhat = model(x.to(device))
        loss = criterion(yhat.flatten(), y.to(device))
        test_loss.append(loss.item())
# end time checkpoint
end = time.time()
# print test results
print('\tTime: {:.10} \tTest Loss: {:.15f}'.format(end-start,
                                                   statistics.mean(test_loss)))

	Time: 2.376538515 	Test Loss: 0.387354930504106


In [61]:
del test_loader

In [62]:
# check model's prediction on the whole dataset
prediction = None
with torch.no_grad():
    prediction = model(x_test.to(device)).cpu().detach()

In [63]:
final_evaluation = []
y_hat = []
y_real = []
column_list = []
evaluation = []

In [64]:
for i in range(len(y_test)):
    final_evaluation.append(y_test[i].numpy().max() - prediction[i].numpy().max())
    column_list.append(numpy.argmax(y_test[i].numpy()) ==
                       numpy.argmax(prediction[i].numpy()))
    y_real.append(y_test[i].numpy().max())
    y_hat.append(prediction[i].numpy().max())

In [65]:
test_df = pandas.DataFrame(list(zip(final_evaluation, column_list,
                                    y_real, y_hat)),
                           columns=['loss_dif', 'column',
                                    'y_real', 'y_hat'])

In [66]:
test_df.column.value_counts()

True     5233
False     767
Name: column, dtype: int64

In [67]:
final_evaluation = []
y_hat = []
y_real = []
column_list = []
evaluation = []

In [68]:
# check model's prediction on the whole dataset
overall = None
with torch.no_grad():
    overall = model(X.to(device)).cpu().detach()

In [69]:
for i in range(len(Y)):
    final_evaluation.append(Y[i].numpy().max() - overall[i].numpy().max())
    column_list.append(numpy.argmax(Y[i].numpy()) ==
                       numpy.argmax(overall[i].numpy()))
    y_real.append(Y[i].numpy().max())
    y_hat.append(overall[i].numpy().max())

In [70]:
over_df = pandas.DataFrame(list(zip(final_evaluation, column_list,
                                    y_real, y_hat)),
                           columns=['loss_dif', 'column',
                                    'y_real', 'y_hat'])

In [71]:
over_df.column.value_counts()

True     22617
False     1383
Name: column, dtype: int64

In [72]:
del train_loss

In [73]:
del valid_loss

In [74]:
del test_loss