In [1]:
import sys
import time
import nltk
import numpy
import pandas
import torch.nn
import statistics
import torch.utils
import sklearn.metrics
import sklearn.model_selection
import sklearn.feature_extraction.text

In [2]:
input_dataframe = pandas.read_csv('onion-or-not.csv', encoding='utf-8')

In [3]:
tokenized_vector = dict()

In [4]:
for i in input_dataframe.index:
    tokenized_vector[i] = nltk.word_tokenize(input_dataframe.loc[i][0])

In [5]:
stemmer = nltk.PorterStemmer()

In [6]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [7]:
for tokens in tokenized_vector:
    for counter, token in enumerate(tokenized_vector[tokens]):
        if stemmer.stem(tokenized_vector[tokens][counter]) not in stopwords:
            tokenized_vector[tokens][counter] = stemmer.stem(
                tokenized_vector[tokens][counter])
        else:
            tokenized_vector[tokens].remove(tokenized_vector[tokens][counter])

In [8]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()

In [9]:
preprocessed_tokenized_vector = list()

In [10]:
for i in tokenized_vector:
    preprocessed_tokenized_vector.append(' '.join(tokenized_vector[i]))

In [11]:
X = vectorizer.fit_transform(preprocessed_tokenized_vector)

In [12]:
tf_idf_df = pandas.DataFrame(X.todense(),
                             columns=vectorizer.get_feature_names(),
                             dtype=numpy.float16)

In [13]:
preprocessed_data = pandas.concat([tf_idf_df, input_dataframe.iloc[:, 1:]],
                                  axis=1, sort=False)

In [14]:
print('Total size of dataframe: ',
      round(sys.getsizeof(preprocessed_data) / 2**20, 2), 'MB')

Total size of dataframe:  1012.99 MB


In [15]:
del X

In [16]:
del tf_idf_df

In [17]:
del preprocessed_tokenized_vector

In [18]:
del tokenized_vector

In [19]:
del input_dataframe

In [20]:
PATH = '/home/andreas/Documents/data_mining/project/onion/input/'

In [21]:
cols = pandas.DataFrame(preprocessed_data.columns[:-1].tolist(),
                        columns=['tokens'])

In [22]:
X = preprocessed_data.iloc[:, :-1]

In [24]:
Y = preprocessed_data.iloc[:, -1]

In [25]:
del preprocessed_data

In [27]:
x_fit, x_test, y_fit, y_test = \
    sklearn.model_selection.train_test_split(X, Y,
                                             test_size=0.25, random_state=42)

In [28]:
x_train, x_val, y_train, y_val = \
    sklearn.model_selection.train_test_split(x_fit,
                                             y_fit,
                                             test_size=0.10,
                                             random_state=42)

In [29]:
del x_fit

In [30]:
del y_fit

In [31]:
x_train = torch.from_numpy(x_train.to_numpy()).float()
y_train = torch.from_numpy(y_train.to_numpy()).float()

In [32]:
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)

In [33]:
del x_train

In [34]:
del y_train

In [35]:
x_val = torch.from_numpy(x_val.to_numpy()).float()
y_val = torch.from_numpy(y_val.to_numpy()).float()

In [36]:
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)

In [37]:
del x_val

In [38]:
del y_val

In [39]:
x_test = torch.from_numpy(x_test.to_numpy()).float()
y_test = torch.from_numpy(y_test.to_numpy()).float()

In [40]:
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

In [41]:
train_loader = torch.utils.data.DataLoader(train_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

In [42]:
del train_dataset

In [43]:
del val_dataset

In [44]:
del test_dataset

In [45]:
X = torch.from_numpy(X.to_numpy()).float()
Y = torch.from_numpy(Y.to_numpy()).float()

In [46]:
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 1),
    torch.nn.Sigmoid()
)

In [47]:
criterion = torch.nn.BCELoss()

In [48]:
learning_rate = 1e-6

In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [50]:
def get_device():
    device = None
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    return device

In [51]:
device = get_device()

In [52]:
model.to(device)

Sequential(
  (0): Linear(in_features=22125, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=1, bias=True)
  (5): Sigmoid()
)

In [53]:
early_stopping = False
# initialize early stopping prevention limit
prevent = 5
# initialize early stopping prevention limit
consecutive = False
# initialize early stopping message
message = ' '
# initialize epoch counter
epoch = 0
# number of epochs to train the model
epochs = 50
# initialize variables
prev_mean_valid_loss = numpy.Inf
start = 0
# initialize error lists
train_loss = []
valid_loss = []
history = []

In [54]:
print('Time: ', start, ' (in seconds)')
while not early_stopping and epoch < epochs:
    if epoch == 0:
        start = time.time()

    # prep model for training
    model.train()
    for x_train, y_train in train_loader:
        # forward pass
        y_hat = model(x_train.to(device))
        # calculate the loss
        loss = criterion(y_hat.flatten(), y_train.to(device))
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # backward pass
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss.append(loss.item())
    # shut down autograd to begin evaluation
    with torch.no_grad():
        # prep model for evaluation
        model.eval()
        for x_val, y_val in val_loader:
            # forward pass
            y_hat = model(x_val.to(device))
            # calculate the loss
            loss = criterion(y_hat.flatten(), y_val.to(device))
            # update running validation loss
            valid_loss.append(loss.item())
    # early stopping conditional
    if prev_mean_valid_loss <= statistics.mean(valid_loss):
        if consecutive is True:
            prevent -= 1
        consecutive = True
        if prevent < 0:
            early_stopping = True
            message = '\tPrevious average Validation error was lower than\
                current Validation error'
    else:
        consecutive = False

    # print results after 2 epochs
    if epoch % 2 == 1:
        end = time.time()
        print('Epoch: ', epoch+1, '\t Time: +', end-start, '\t Training\
        loss: ', statistics.mean(train_loss), '\t Validation loss: ',
              statistics.mean(valid_loss))
        start = time.time()

    # update epoch's validation loss variable
    prev_mean_valid_loss = statistics.mean(valid_loss)

    # early stopping message
    if early_stopping is True:
        print('\t\tStopping at epoch: ', epoch + 1, message)
        epoch = epochs - 1
    epoch += 1

Time:  0  (in seconds)
Epoch:  2 	 Time: + 348.6037919521332 	 Training        loss:  0.6785692429744903 	 Validation loss:  0.6703175398210685
Epoch:  4 	 Time: + 342.7747790813446 	 Training        loss:  0.6535778464004397 	 Validation loss:  0.6444097912104594
Epoch:  6 	 Time: + 343.29390048980713 	 Training        loss:  0.6170665140304362 	 Validation loss:  0.6069072960917321
Epoch:  8 	 Time: + 343.9396913051605 	 Training        loss:  0.5692311673230825 	 Validation loss:  0.5615064587067657
Epoch:  10 	 Time: + 344.7268159389496 	 Training        loss:  0.5209850433888532 	 Validation loss:  0.5195425890374463
Epoch:  12 	 Time: + 344.871999502182 	 Training        loss:  0.47924580438231723 	 Validation loss:  0.4855711580188115
Epoch:  14 	 Time: + 345.1459381580353 	 Training        loss:  0.4447826157545896 	 Validation loss:  0.45896355810341316
Epoch:  16 	 Time: + 345.4600520133972 	 Training        loss:  0.4162915700648105 	 Validation loss:  0.4380081609729344
Epo

In [55]:
del x_train

In [56]:
del y_train

In [57]:
del x_val

In [58]:
del y_val

In [59]:
del train_loader

In [60]:
del val_loader

In [61]:
# define test error list
test_loss = []
# initialize timer
start = time.time()
# test model
model.eval()
with torch.no_grad():
    for x, y in test_loader:
        yhat = model(x.to(device))
        loss = criterion(yhat.flatten(), y.to(device))
        test_loss.append(loss.item())
# end time checkpoint
end = time.time()
# print test results
print('\tTime: {:.10} \tTest Loss: {:.15f}'.format(end-start,
                                                   statistics.mean(test_loss)))

	Time: 3.398823977 	Test Loss: 0.310725052263438


In [62]:
del test_loader

In [63]:
# check model's prediction on the whole dataset
prediction = None
with torch.no_grad():
    prediction = model(x_test.to(device)).cpu().detach()

In [64]:
final_evaluation = []
y_hat = []
y_real = []
evaluation = []

In [65]:
for i in range(len(y_test)):
    final_evaluation.append(y_test[i].numpy() - prediction[i].item())
    y_real.append(y_test[i].numpy())
    y_hat.append(prediction[i].item())
    if abs(y_test[i].numpy() - prediction[i].item()) > 0.5:
        evaluation.append(False)
    else:
        evaluation.append(True)

In [66]:
test_df = pandas.DataFrame(list(zip(final_evaluation, evaluation, y_real,
                                    y_hat)), columns=['loss_dif',
                                                      'evaluation',
                                                      'y_real',
                                                      'y_hat'])

In [67]:
test_df

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
0,-0.000462,True,0.0,0.000462
1,-0.000060,True,0.0,0.000060
2,-0.008119,True,0.0,0.008119
3,-0.000350,True,0.0,0.000350
4,-0.000608,True,0.0,0.000608
...,...,...,...,...
5995,-0.001724,True,0.0,0.001724
5996,-0.001295,True,0.0,0.001295
5997,-0.000156,True,0.0,0.000156
5998,-0.183887,True,0.0,0.183887


In [68]:
debug = test_df.loc[test_df['evaluation'] == False]

In [69]:
debug

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
12,-0.608145,False,0.0,0.608145
28,-0.792266,False,0.0,0.792266
49,0.619998,False,1.0,0.380002
58,0.627977,False,1.0,0.372023
60,0.947423,False,1.0,0.052577
...,...,...,...,...
5963,-0.913302,False,0.0,0.913302
5971,0.800790,False,1.0,0.199210
5981,0.957684,False,1.0,0.042316
5984,0.994995,False,1.0,0.005005


In [70]:
print(
    sklearn.metrics.classification_report(
        test_df.y_real.astype(numpy.float16).to_numpy(),
        numpy.where(test_df.y_hat.astype(numpy.float16).to_numpy() > 0.5, 1, 0)
    ))

              precision    recall  f1-score   support

         0.0       0.91      0.91      0.91      3798
         1.0       0.85      0.84      0.84      2202

    accuracy                           0.89      6000
   macro avg       0.88      0.87      0.88      6000
weighted avg       0.88      0.89      0.89      6000



In [71]:
# check model's prediction on the whole dataset
overall = None
with torch.no_grad():
    overall = model(X.to(device)).cpu().detach()

In [72]:
final_evaluation = []
y_hat = []
y_real = []
evaluation = []

In [73]:
for i in range(len(Y)):
    final_evaluation.append(Y[i].numpy() - overall[i].item())
    y_real.append(Y[i].numpy())
    y_hat.append(overall[i].item())
    if abs(Y[i].numpy() - overall[i].item()) > 0.5:
        evaluation.append(False)
    else:
        evaluation.append(True)

In [74]:
overall_df = pandas.DataFrame(list(zip(final_evaluation, evaluation, y_real,
                                       y_hat)), columns=['loss_dif',
                                                         'evaluation',
                                                         'y_real',
                                                         'y_hat'])

In [75]:
del final_evaluation

In [76]:
del y_hat

In [77]:
del y_real

In [78]:
overall_df.evaluation.value_counts()

True     22556
False     1444
Name: evaluation, dtype: int64

In [79]:
overall_df.loss_dif.mean()

0.001386176293951219

In [80]:
errors = overall_df.loc[overall_df['evaluation'] == False]

In [81]:
errors

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
31,0.963477,False,1.0,0.036523
39,0.673781,False,1.0,0.326219
69,-0.715734,False,0.0,0.715734
90,0.985894,False,1.0,0.014106
102,0.733846,False,1.0,0.266154
...,...,...,...,...
23971,-0.975370,False,0.0,0.975370
23974,0.991210,False,1.0,0.008790
23976,0.660783,False,1.0,0.339217
23978,-0.821544,False,0.0,0.821544


In [82]:
print(sklearn.metrics.classification_report(
    overall_df.y_real.astype(numpy.float16).to_numpy(),
    numpy.where(overall_df.y_hat.astype(numpy.float16).to_numpy() > 0.5, 1, 0)
))

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     15000
         1.0       0.92      0.92      0.92      9000

    accuracy                           0.94     24000
   macro avg       0.94      0.94      0.94     24000
weighted avg       0.94      0.94      0.94     24000



In [83]:
del errors

In [84]:
del overall_df

In [85]:
del train_loss

In [86]:
del valid_loss

In [87]:
del test_loss