In [1]:
import sys
import time
import nltk
import numpy
import pandas
import torch.nn
import statistics
import torch.utils
import sklearn.metrics
import sklearn.model_selection
import sklearn.feature_extraction.text

In [2]:
input_dataframe = pandas.read_csv('onion-or-not.csv', encoding='utf-8')

In [3]:
tokenized_vector = dict()

In [4]:
for i in input_dataframe.index:
    tokenized_vector[i] = nltk.word_tokenize(input_dataframe.loc[i][0])

In [5]:
stemmer = nltk.PorterStemmer()

In [6]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [7]:
for tokens in tokenized_vector:
    for counter, token in enumerate(tokenized_vector[tokens]):
        if stemmer.stem(tokenized_vector[tokens][counter]) not in stopwords:
            tokenized_vector[tokens][counter] = stemmer.stem(tokenized_vector
                                                             [tokens][counter])
        else:
            tokenized_vector[tokens].remove(tokenized_vector[tokens][counter])

In [8]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()

In [9]:
preprocessed_tokenized_vector = list()

In [10]:
for i in tokenized_vector:
    preprocessed_tokenized_vector.append(' '.join(tokenized_vector[i]))

In [11]:
X = vectorizer.fit_transform(preprocessed_tokenized_vector)

In [12]:
tf_idf_df = pandas.DataFrame(X.todense(),
                             columns=vectorizer.get_feature_names(),
                             dtype=numpy.float16)

In [13]:
preprocessed_data = pandas.concat([tf_idf_df, input_dataframe.iloc[:, 1:]],
                                  axis=1, sort=False)

In [14]:
print('Total size of dataframe: ',
      round(sys.getsizeof(preprocessed_data) / 2**20, 2), 'MB')

Total size of dataframe:  1012.99 MB


In [15]:
del X

In [16]:
del tf_idf_df

In [17]:
del preprocessed_tokenized_vector

In [18]:
del tokenized_vector

In [19]:
del input_dataframe

In [20]:
PATH = '/home/andreas/Documents/data_mining/project/onion/input/'

In [21]:
cols = pandas.DataFrame(preprocessed_data.columns[:-1].tolist(),
                        columns=['tokens'])

In [22]:
X = preprocessed_data.iloc[:, :-1]

In [23]:
Y = preprocessed_data.iloc[:, -1]

In [24]:
del preprocessed_data

In [25]:
x_fit, x_test, y_fit, y_test = \
    sklearn.model_selection.train_test_split(X, Y,
                                             test_size=0.25, random_state=42)

In [26]:
x_train, x_val, y_train, y_val = \
    sklearn.model_selection.train_test_split(x_fit,
                                             y_fit,
                                             test_size=0.10,
                                             random_state=42)

In [27]:
del x_fit

In [28]:
del y_fit

In [29]:
x_train = torch.from_numpy(x_train.to_numpy()).float()
y_train = torch.from_numpy(y_train.to_numpy()).float()

In [30]:
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)

In [31]:
del x_train

In [32]:
del y_train

In [33]:
x_val = torch.from_numpy(x_val.to_numpy()).float()
y_val = torch.from_numpy(y_val.to_numpy()).float()

In [34]:
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)

In [35]:
del x_val

In [36]:
del y_val

In [37]:
x_test = torch.from_numpy(x_test.to_numpy()).float()
y_test = torch.from_numpy(y_test.to_numpy()).float()

In [38]:
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

In [39]:
train_loader = torch.utils.data.DataLoader(train_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

In [40]:
del train_dataset

In [41]:
del val_dataset

In [42]:
del test_dataset

In [43]:
X = torch.from_numpy(X.to_numpy()).float()
Y = torch.from_numpy(Y.to_numpy()).float()

In [44]:
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 16),
    torch.nn.Dropout(0.1),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 1),
    torch.nn.Sigmoid()
)

In [45]:
criterion = torch.nn.BCELoss()

In [46]:
learning_rate = 1e-4

In [47]:
optimizer = torch.optim.Adam(model.parameters(),
                             lr=learning_rate,
                             weight_decay=0.2)

In [48]:
def get_device():
    device = None
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    return device

In [49]:
device = get_device()

In [50]:
model.to(device)

Sequential(
  (0): Linear(in_features=22125, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=16, bias=True)
  (3): Dropout(p=0.1, inplace=False)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=1, bias=True)
  (6): Sigmoid()
)

In [51]:
early_stopping = False
# initialize early stopping prevention limit
prevent = 5
# initialize early stopping prevention limit
consecutive = False
# initialize early stopping message
message = ' '
# initialize epoch counter
epoch = 0
# number of epochs to train the model
epochs = 500
# initialize variables
prev_mean_valid_loss = numpy.Inf
start = 0
# initialize error lists
train_loss = []
valid_loss = []
history = []

In [52]:
print('Time: ', start, ' (in seconds)')
while not early_stopping and epoch < epochs:
    if epoch == 0:
        start = time.time()

    # prep model for training
    model.train()
    for x_train, y_train in train_loader:
        # forward pass
        y_hat = model(x_train.to(device))
        # calculate the loss
        loss = criterion(y_hat.flatten(), y_train.to(device))
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # backward pass
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss.append(loss.item())
    # shut down autograd to begin evaluation
    with torch.no_grad():
        # prep model for evaluation
        model.eval()
        for x_val, y_val in val_loader:
            # forward pass
            y_hat = model(x_val.to(device))
            # calculate the loss
            loss = criterion(y_hat.flatten(), y_val.to(device))
            # update running validation loss
            valid_loss.append(loss.item())
    # early stopping conditional
    if prev_mean_valid_loss <= statistics.mean(valid_loss):
        if consecutive is True:
            prevent -= 1
        consecutive = True
        if prevent < 0:
            early_stopping = True
            message = '\tPrevious average Validation error was lower than\
                current Validation error'
    else:
        consecutive = False

    # print results after 2 epochs
    if epoch % 2 == 1:
        end = time.time()
        print('Epoch: ', epoch+1, '\t Time: +', end-start, '\t Training\
        loss: ', statistics.mean(train_loss), '\t Validation loss: ',
              statistics.mean(valid_loss))
        start = time.time()

    # update epoch's validation loss variable
    prev_mean_valid_loss = statistics.mean(valid_loss)

    # early stopping message
    if early_stopping is True:
        print('\t\tStopping at epoch: ', epoch + 1, message)
        epoch = epochs - 1
    epoch += 1

Time:  0  (in seconds)
Epoch:  2 	 Time: + 52.68909525871277 	 Training        loss:  0.6697393896347946 	 Validation loss:  0.6709876214464505
Epoch:  4 	 Time: + 54.70188903808594 	 Training        loss:  0.6693080399396979 	 Validation loss:  0.6708298934996129
Epoch:  6 	 Time: + 52.84192514419556 	 Training        loss:  0.6691505167734476 	 Validation loss:  0.6707720929273853
Epoch:  8 	 Time: + 53.11253118515015 	 Training        loss:  0.6690712356466202 	 Validation loss:  0.67074298289501
Epoch:  10 	 Time: + 53.518542528152466 	 Training        loss:  0.6690236478032153 	 Validation loss:  0.6707255162199338
Epoch:  12 	 Time: + 55.48626351356506 	 Training        loss:  0.6689919226604357 	 Validation loss:  0.6707138774737164
Epoch:  14 	 Time: + 54.901580572128296 	 Training        loss:  0.6689692605545702 	 Validation loss:  0.6707055640835611
Epoch:  16 	 Time: + 53.245259046554565 	 Training        loss:  0.6689522648041631 	 Validation loss:  0.6706993290409445
Epoc

In [53]:
del x_train

In [54]:
del y_train

In [55]:
del x_val

In [56]:
del y_val

In [57]:
del train_loader

In [58]:
del val_loader

In [59]:
# define test error list
test_loss = []
# initialize timer
start = time.time()
# test model
model.eval()
with torch.no_grad():
    for x, y in test_loader:
        yhat = model(x.to(device))
        loss = criterion(yhat.flatten(), y.to(device))
        test_loss.append(loss.item())
# end time checkpoint
end = time.time()
# print test results
print('\tTime: {:.10} \tTest Loss: {:.15f}'.format(end-start,
                                                   statistics.mean(test_loss)))

	Time: 1.954857588 	Test Loss: 0.666418733954430


In [60]:
del test_loader

In [61]:
# check model's prediction on the whole dataset
prediction = None
with torch.no_grad():
    prediction = model(x_test.to(device)).cpu().detach()

In [62]:
final_evaluation = []
y_hat = []
y_real = []
evaluation = []

In [63]:
for i in range(len(y_test)):
    final_evaluation.append(y_test[i].numpy() - prediction[i].item())
    y_real.append(y_test[i].numpy())
    y_hat.append(prediction[i].item())
    if abs(y_test[i].numpy() - prediction[i].item()) > 0.5:
        evaluation.append(False)
    else:
        evaluation.append(True)

In [64]:
test_df = pandas.DataFrame(list(zip(final_evaluation, evaluation, y_real,
                                    y_hat)), columns=['loss_dif',
                                                      'evaluation',
                                                      'y_real',
                                                      'y_hat'])

In [65]:
test_df

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
0,-0.433266,True,0.0,0.433266
1,-0.433266,True,0.0,0.433266
2,-0.433266,True,0.0,0.433266
3,-0.433266,True,0.0,0.433266
4,-0.433266,True,0.0,0.433266
...,...,...,...,...
5995,-0.433266,True,0.0,0.433266
5996,-0.433266,True,0.0,0.433266
5997,-0.433266,True,0.0,0.433266
5998,-0.433266,True,0.0,0.433266


In [66]:
debug = test_df.loc[test_df['evaluation'] == False]

In [67]:
debug

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
7,0.566734,False,1.0,0.433266
9,0.566734,False,1.0,0.433266
18,0.566734,False,1.0,0.433266
19,0.566734,False,1.0,0.433266
29,0.566734,False,1.0,0.433266
...,...,...,...,...
5987,0.566734,False,1.0,0.433266
5989,0.566734,False,1.0,0.433266
5991,0.566734,False,1.0,0.433266
5992,0.566734,False,1.0,0.433266


In [68]:
print(sklearn.metrics.classification_report(
    test_df.y_real.astype(numpy.float16).to_numpy(),
    numpy.where(test_df.y_hat.astype(numpy.float16).to_numpy() > 0.5, 1, 0)))

              precision    recall  f1-score   support

         0.0       0.63      1.00      0.78      3798
         1.0       0.00      0.00      0.00      2202

    accuracy                           0.63      6000
   macro avg       0.32      0.50      0.39      6000
weighted avg       0.40      0.63      0.49      6000



  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
# check model's prediction on the whole dataset
overall = None
with torch.no_grad():
    overall = model(X.to(device)).cpu().detach()

In [70]:
final_evaluation = []
y_hat = []
y_real = []
evaluation = []

In [71]:
for i in range(len(Y)):
    final_evaluation.append(Y[i].numpy() - overall[i].item())
    y_real.append(Y[i].numpy())
    y_hat.append(overall[i].item())
    if abs(Y[i].numpy() - overall[i].item()) > 0.5:
        evaluation.append(False)
    else:
        evaluation.append(True)

In [72]:
overall_df = pandas.DataFrame(list(zip(final_evaluation, evaluation, y_real,
                                       y_hat)), columns=['loss_dif',
                                                         'evaluation',
                                                         'y_real',
                                                         'y_hat'])

In [73]:
del final_evaluation

In [74]:
del y_hat

In [75]:
del y_real

In [76]:
overall_df.evaluation.value_counts()

True     15000
False     9000
Name: evaluation, dtype: int64

In [77]:
overall_df.loss_dif.mean()

-0.05826577544212341

In [78]:
errors = overall_df.loc[overall_df['evaluation'] == False]

In [79]:
errors

Unnamed: 0,loss_dif,evaluation,y_real,y_hat
0,0.566734,False,1.0,0.433266
2,0.566734,False,1.0,0.433266
3,0.566734,False,1.0,0.433266
4,0.566734,False,1.0,0.433266
5,0.566734,False,1.0,0.433266
...,...,...,...,...
23992,0.566734,False,1.0,0.433266
23994,0.566734,False,1.0,0.433266
23995,0.566734,False,1.0,0.433266
23997,0.566734,False,1.0,0.433266


In [80]:
print(sklearn.metrics.classification_report(
    overall_df.y_real.astype(numpy.float16).to_numpy(),
    numpy.where(
        overall_df.y_hat.astype(numpy.float16).to_numpy() > 0.5, 1, 0)
))

              precision    recall  f1-score   support

         0.0       0.62      1.00      0.77     15000
         1.0       0.00      0.00      0.00      9000

    accuracy                           0.62     24000
   macro avg       0.31      0.50      0.38     24000
weighted avg       0.39      0.62      0.48     24000



  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
del errors

In [82]:
del overall_df

In [83]:
del train_loss

In [84]:
del valid_loss

In [85]:
del test_loss