In [None]:
# Paths of the input files.
# Naming: "no_head" = e-mails without headers, "head" = e-mails with full headers.
# Suffix _0 points at the phishing folder, suffix _1 at the legitimate folder.
no_head_train_path_0 = '../data/IWSPA-AP-traindata/phish/'
no_head_train_path_1 = '../data/IWSPA-AP-traindata/legit/'
head_train_path_0 = '../data/Dataset_Full_Header_Training/Dataset_Submit_Phish/'
head_train_path_1 = '../data/Dataset_Full_Header_Training/Dataset_Submit_Legit/'
# Test-set folders; no labels are derived for these anywhere in this notebook.
no_head_test_path = '../data/IWSPA-APTestData/testdata_noheaders/'
head_test_path = '../data/IWSPA-APTestData/testdata_fullheaders/'

In [None]:
import os, re, string
import numpy as np
import fasttext

In [None]:
def clean_text(text):
    """Normalize raw e-mail text into a lowercase, punctuation-free string.

    Parameters
    ----------
    text : bytes or str
        Raw document. Bytes are decoded as UTF-8 (strictly, so invalid byte
        sequences raise ``UnicodeDecodeError``); already-decoded text is used
        as-is, so the function works whether the caller read the file in
        binary or in text mode.

    Returns
    -------
    str
        The whitespace-separated tokens of ``text`` with every ASCII
        punctuation character (``string.punctuation``) removed, lowercased
        and joined by single spaces. Tokens that consist only of punctuation
        are dropped. Returns an empty string when nothing is left.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    # split() with no argument already splits on any whitespace run (newlines
    # and repeated spaces included), so no prior whitespace squeezing is needed.
    stripped_tokens = (punctuation.sub(u'', token) for token in text.split())
    return ' '.join(token.lower() for token in stripped_tokens if token)

In [None]:
def get_data(path):
    """Read and clean every file found directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory whose entries are read one by one.

    Returns
    -------
    (list, list)
        ``text_list`` holds the ``clean_text`` output for each entry and
        ``files`` holds the entry names as returned by ``os.listdir``; both
        lists are in the same order, so ``text_list[i]`` belongs to
        ``files[i]``.
    """
    files = os.listdir(path)
    text_list = []
    for text_file in files:
        file_path = os.path.join(path, text_file)
        # Read-only binary mode: no write permission is required, the handle
        # is closed even if reading fails, and clean_text receives bytes that
        # it can decode on both Python 2 and Python 3.
        with open(file_path, 'rb') as read_file:
            read_text = read_file.read()
        text_list.append(clean_text(read_text))
    return text_list, files

In [None]:
# Load and clean every corpus. For the training folders the returned file-name
# list is not needed, so it is parked in the throwaway name `temp`.
no_head_train_0, temp = get_data(no_head_train_path_0)
no_head_train_1, temp = get_data(no_head_train_path_1)
head_train_0, temp = get_data(head_train_path_0)
head_train_1, temp = get_data(head_train_path_1)
# For the test folders the file names are kept: they are written next to the
# predicted label in the submission files further down.
no_head_test, no_head_files = get_data(no_head_test_path)
head_test, head_files = get_data(head_test_path)

In [None]:
# Concatenate both classes and build the matching label list:
# 0 = documents loaded from the phish folder, 1 = documents from the legit folder.
no_head_train = no_head_train_0 + no_head_train_1
no_head_labels_train = ([0] * len(no_head_train_0)) + ([1] * len(no_head_train_1))

# Same construction for the full-header corpus.
head_train = head_train_0 + head_train_1
head_labels_train = ([0] * len(head_train_0)) + ([1] * len(head_train_1))


## get data statistics

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fit a bag-of-words count model on the no-header training corpus; X holds the
# per-document term counts used for the statistics in the next cell.
tf_vectorizer = CountVectorizer()
X = tf_vectorizer.fit_transform(no_head_train)

In [None]:
# Total number of counted tokens = sum of every entry of the count matrix.
# Summing the sparse matrix directly avoids materialising a dense
# documents-by-vocabulary array just to add it up.
print ('#total words', X.sum())
# The fitted vocabulary maps each distinct term to a column, so its size is
# the number of unique words (and it does not depend on get_feature_names(),
# which newer scikit-learn releases no longer provide).
print ('#unique words', len(tf_vectorizer.vocabulary_))

In [None]:
# Fit a fresh bag-of-words count model on the full-header training corpus.
# The corpus belongs in fit_transform() only: the first constructor parameter
# of CountVectorizer is `input` (an option string), not the documents.
tf_vectorizer = CountVectorizer()
X = tf_vectorizer.fit_transform(head_train)

In [None]:
# Total number of counted tokens in the full-header corpus. Summing the sparse
# matrix directly avoids building a dense documents-by-vocabulary array.
print ('#total words', X.sum())
# One vocabulary entry per distinct term; avoids get_feature_names(), which
# newer scikit-learn releases no longer provide.
print ('#unique words', len(tf_vectorizer.vocabulary_))

## without header

In [None]:
# Draw one random permutation and apply it to texts and labels alike, so every
# document keeps its own label.
# NOTE(review): the permutation is unseeded, so the split (and the reported
# scores) differ from run to run.
shuffled_indices = np.random.permutation(len(no_head_labels_train))
# Index the plain Python lists instead of converting to np.array first: an
# array of strings is fixed-width, i.e. every document would be padded to the
# length of the longest e-mail.
train_data = [no_head_train[i] for i in shuffled_indices]
train_label = [no_head_labels_train[i] for i in shuffled_indices]

In [None]:
# 80/20 hold-out split of the shuffled no-header data: the leading 80% is used
# for fitting, the trailing 20% for evaluation.
data_cut = int(0.8 * len(train_data))
label_cut = int(0.8 * len(train_label))
temp_train_data, temp_test_data = train_data[:data_cut], train_data[data_cut:]
temp_train_label, temp_test_labels = train_label[:label_cut], train_label[label_cut:]

In [None]:
fast_train_file = '../data/fast_train.txt'
fast_test_file = '../data/fast_test.txt'

# Write both splits in fastText's supervised format, one document per line:
#     __label__<label> <text>
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
for out_path, texts, labels in ((fast_train_file, temp_train_data, temp_train_label),
                                (fast_test_file, temp_test_data, temp_test_labels)):
    with open(out_path, 'wb') as writeFile:
        for text, label in zip(texts, labels):
            writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Train a supervised fastText classifier on the 80% split of the no-header data.
# NOTE(review): the second argument is presumably the output path of the saved
# model - confirm against the installed fasttext version.
classifier = fasttext.supervised(fast_train_file, '../models/Amrita-NLP_TOP_fastText_noheaders')

In [None]:
# Evaluate the classifier on the held-out 20% written to fast_test_file.
result = classifier.test(fast_test_file)

In [None]:
# Display precision, recall and the number of evaluated examples for the hold-out set.
result.precision, result.recall, result.nexamples

In [None]:
# Print the settings recorded on the trained classifier (no hyper-parameters
# were passed to fasttext.supervised above, so these are whatever it defaulted to).
print (classifier.min_count, classifier.dim, classifier.epoch, classifier.word_ngrams, classifier.encoding, classifier.loss_name, classifier.maxn, classifier.t)

In [None]:
final_fast_train_file = '../data/final_fast_train.txt'
# Write the complete shuffled no-header training set (no hold-out) in fastText
# format: "__label__<label> <text>", one document per line.
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
with open(final_fast_train_file, 'wb') as writeFile:
    for text, label in zip(train_data, train_label):
        writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Retrain on the complete no-header training set for the final predictions.
# NOTE(review): same output path as the hold-out model above, so that model is
# presumably overwritten - confirm.
classifier = fasttext.supervised(final_fast_train_file, '../models/Amrita-NLP_TOP_fastText_noheaders')

In [None]:
# Number of no-header test documents to be labelled.
len(no_head_test)

In [None]:
# Replace, in place, every test document that is empty after cleaning with a
# two-space placeholder.
# NOTE(review): presumably needed because classifier.predict cannot cope with
# empty strings - confirm.
for idx, document in enumerate(no_head_test):
    if not document:
        no_head_test[idx] = '  '

In [None]:
# Predict labels for the no-header test documents; the submission cell below
# reads element [0] of each entry of the result.
final_labels = classifier.predict(no_head_test)

In [None]:
# Submission file for the no-header model: one "<file name> <predicted label>"
# line per test e-mail. value[0] is the first entry of each prediction.
# The with-block guarantees the file is flushed and closed even if a write fails.
with open('../submission/Amrita-NLP_submission_TOP_noheaders_1.txt', 'w') as writeFile:
    for value, test_file in zip(final_labels, no_head_files):
        writeFile.write(test_file + ' ' + value[0] + '\n')

## with header

In [None]:
# Draw one random permutation and apply it to texts and labels alike, so every
# full-header document keeps its own label.
# NOTE(review): the permutation is unseeded, so the split (and the reported
# scores) differ from run to run.
shuffled_indices = np.random.permutation(len(head_labels_train))
# Index the plain Python lists instead of converting to np.array first: an
# array of strings is fixed-width, i.e. every document would be padded to the
# length of the longest e-mail.
train_data = [head_train[i] for i in shuffled_indices]
train_label = [head_labels_train[i] for i in shuffled_indices]

In [None]:
# 80/20 hold-out split of the shuffled full-header data: the leading 80% is
# used for fitting, the trailing 20% for evaluation.
data_cut = int(0.8 * len(train_data))
label_cut = int(0.8 * len(train_label))
temp_train_data, temp_test_data = train_data[:data_cut], train_data[data_cut:]
temp_train_label, temp_test_labels = train_label[:label_cut], train_label[label_cut:]

In [None]:
fast_train_file = '../data/fast_train.txt'
fast_test_file = '../data/fast_test.txt'

# Write both full-header splits in fastText's supervised format, one document
# per line:
#     __label__<label> <text>
# These are the same two paths the no-header section wrote to, so those files
# are replaced here.
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
for out_path, texts, labels in ((fast_train_file, temp_train_data, temp_train_label),
                                (fast_test_file, temp_test_data, temp_test_labels)):
    with open(out_path, 'wb') as writeFile:
        for text, label in zip(texts, labels):
            writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Train a supervised fastText classifier on the 80% split of the full-header data.
# NOTE(review): the second argument is presumably the output path of the saved
# model - confirm against the installed fasttext version.
classifier = fasttext.supervised(fast_train_file, '../models/Amrita-NLP_TOP_fastText_headers')

In [None]:
# Evaluate the full-header classifier on the held-out 20% written to fast_test_file.
result = classifier.test(fast_test_file)

In [None]:
# Display precision, recall and the number of evaluated examples for the hold-out set.
result.precision, result.recall, result.nexamples

In [None]:
final_fast_train_file = '../data/final_fast_train.txt'
# Write the complete shuffled full-header training set (no hold-out) in
# fastText format: "__label__<label> <text>", one document per line.
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
with open(final_fast_train_file, 'wb') as writeFile:
    for text, label in zip(train_data, train_label):
        writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Retrain on the complete full-header training set for the final predictions.
# NOTE(review): same output path as the hold-out model above, so that model is
# presumably overwritten - confirm.
classifier = fasttext.supervised(final_fast_train_file, '../models/Amrita-NLP_TOP_fastText_headers')

In [None]:
# Predict labels for the full-header test documents.
# NOTE(review): unlike no_head_test, head_test is never padded for documents
# that are empty after cleaning - confirm that none are empty.
final_labels = classifier.predict(head_test)

In [None]:
# Submission file for the full-header model: one "<file name> <predicted label>"
# line per test e-mail. value[0] is the first entry of each prediction.
# The with-block guarantees the file is flushed and closed even if a write fails.
with open('../submission/Amrita-NLP_submission_TOP_headers_1.txt', 'w') as writeFile:
    for value, test_file in zip(final_labels, head_files):
        writeFile.write(test_file + ' ' + value[0] + '\n')

## combined model

In [None]:
# Rebuild the per-corpus training lists and labels (0 = phish, 1 = legit).
# These four assignments recompute the same values that were built near the
# top of the notebook.
no_head_train = no_head_train_0 + no_head_train_1
no_head_labels_train = ([0] * len(no_head_train_0)) + ([1] * len(no_head_train_1))

head_train = head_train_0 + head_train_1
head_labels_train = ([0] * len(head_train_0)) + ([1] * len(head_train_1))

# Combined corpus: no-header documents followed by full-header documents, with
# the labels concatenated in the same order.
temp_train = no_head_train + head_train
temp_labels = no_head_labels_train + head_labels_train

In [None]:
# Draw one random permutation and apply it to texts and labels alike, so every
# document of the combined corpus keeps its own label.
# NOTE(review): the permutation is unseeded, so the split (and the reported
# scores) differ from run to run.
shuffled_indices = np.random.permutation(len(temp_labels))
# Index the plain Python lists instead of converting to np.array first: an
# array of strings is fixed-width, i.e. every document would be padded to the
# length of the longest e-mail.
train_data = [temp_train[i] for i in shuffled_indices]
train_label = [temp_labels[i] for i in shuffled_indices]

In [None]:
# 80/20 hold-out split of the shuffled combined data: the leading 80% is used
# for fitting, the trailing 20% for evaluation.
data_cut = int(0.8 * len(train_data))
label_cut = int(0.8 * len(train_label))
temp_train_data, temp_test_data = train_data[:data_cut], train_data[data_cut:]
temp_train_label, temp_test_labels = train_label[:label_cut], train_label[label_cut:]

In [None]:
fast_train_file = '../data/fast_train.txt'
fast_test_file = '../data/fast_test.txt'

# Write both combined-corpus splits in fastText's supervised format, one
# document per line:
#     __label__<label> <text>
# These are the same two paths the earlier sections wrote to, so those files
# are replaced here.
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
for out_path, texts, labels in ((fast_train_file, temp_train_data, temp_train_label),
                                (fast_test_file, temp_test_data, temp_test_labels)):
    with open(out_path, 'wb') as writeFile:
        for text, label in zip(texts, labels):
            writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Train a supervised fastText classifier on the 80% split of the combined corpus.
# NOTE(review): the second argument is presumably the output path of the saved
# model - confirm against the installed fasttext version.
classifier = fasttext.supervised(fast_train_file, '../models/model_combined')

In [None]:
# Evaluate the combined classifier on the held-out 20% written to fast_test_file.
result = classifier.test(fast_test_file)

In [None]:
# Display precision, recall and the number of evaluated examples for the hold-out set.
result.precision, result.recall, result.nexamples

In [None]:
final_fast_train_file = '../data/final_fast_train.txt'
# Write the complete shuffled combined training set (no hold-out) in fastText
# format: "__label__<label> <text>", one document per line.
# The line is encoded explicitly and written in binary mode. Wrapping the
# encoded bytes in str() would put a literal "b'...'" into the file on
# Python 3. The with-block closes the file even if a write fails.
with open(final_fast_train_file, 'wb') as writeFile:
    for text, label in zip(train_data, train_label):
        writeFile.write(u'__label__{} {}\n'.format(label, text).encode('utf-8'))

In [None]:
# Retrain on the complete combined training set for the final predictions.
# NOTE(review): same output path as the hold-out model above, so that model is
# presumably overwritten - confirm.
classifier = fasttext.supervised(final_fast_train_file, '../models/model_combined')

In [None]:
# Replace, in place, every no-header test document that is empty after
# cleaning with a two-space placeholder. The same replacement already ran in
# the "without header" section, so this only matters if that section was skipped.
for idx, document in enumerate(no_head_test):
    if not document:
        no_head_test[idx] = '  '

In [None]:
# Label the full-header test documents with the combined model.
# NOTE(review): head_test is never padded for documents that are empty after
# cleaning - confirm that none are empty.
final_head_labels = classifier.predict(head_test)

In [None]:
# Label the no-header test documents with the same combined model.
final_no_head_labels = classifier.predict(no_head_test)

In [None]:
# Combined-model submission for the full-header test set: one
# "<file name> <predicted label>" line per e-mail. value[0] is the first entry
# of each prediction.
# The with-block guarantees the file is flushed and closed even if a write fails.
with open('../submission/Amrita-NLP_submission_headers_2.txt', 'w') as writeFile:
    for value, test_file in zip(final_head_labels, head_files):
        writeFile.write(test_file + ' ' + value[0] + '\n')

In [None]:
# Combined-model submission for the no-header test set: one
# "<file name> <predicted label>" line per e-mail. value[0] is the first entry
# of each prediction.
# The with-block guarantees the file is flushed and closed even if a write fails.
with open('../submission/Amrita-NLP_submission_noheaders_2.txt', 'w') as writeFile:
    for value, test_file in zip(final_no_head_labels, no_head_files):
        writeFile.write(test_file + ' ' + value[0] + '\n')