This example was adapted from https://www.cs.jhu.edu/~mdredze/datasets/image_spam/

In [1]:
# Standard library
import glob
import os
import urllib.parse
from multiprocessing import Pool

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import xgboost as xgb
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# train_test_split has lived in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE: importing `hash` from pyLZJD shadows Python's built-in hash() for the
# rest of the notebook.
from pyLZJD import hash, sim, vectorize



In [2]:
spam_url = "https://www.cs.jhu.edu/~mdredze/datasets/image_spam/personal_image_spam.tar.gz"
ham_url = "https://www.cs.jhu.edu/~mdredze/datasets/image_spam/personal_image_ham.tar.gz"

In [3]:
# Fetch and unpack the dataset unless both extracted directories are already
# present. Both are checked because the later cells read from both of them.
if not (os.path.exists("personal_image_spam") and os.path.exists("personal_image_ham")):
    print("Downloading dataset")
    import urllib.request
    import tarfile

    urllib.request.urlretrieve(ham_url, 'personal_image_ham.tar.gz')
    urllib.request.urlretrieve(spam_url, 'personal_image_spam.tar.gz')

    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only run this against archives from a source you trust.
    for archive_name in ("personal_image_ham.tar.gz", "personal_image_spam.tar.gz"):
        # The context manager closes the archive handle even if extraction fails.
        with tarfile.open(archive_name) as tf:
            tf.extractall()
    

In [4]:
# Collect every extracted file. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them: the seeded train/test splits
# further down are only repeatable when the input order is stable.
spam_paths = sorted(glob.glob("personal_image_spam/*"))
ham_paths = sorted(glob.glob("personal_image_ham/*"))


# Spam first, then ham; the label list below is built in the same order.
all_paths = spam_paths + ham_paths
# Labels: 1 = spam, 0 = ham.
yBad = [1] * len(spam_paths)
yGood = [0] * len(ham_paths)
y = yBad + yGood


In [5]:
# Turn every file path into a feature vector with pyLZJD's vectorize();
# X is the feature collection that is split and fed to the classifier below.
X = vectorize(all_paths)



In [6]:
# Hold out 20% of the samples for evaluation. random_state is fixed so the
# same split is drawn on every run for a given ordering of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #splitting data


In [7]:
# Baseline classifier: logistic regression trained on the vectorized files.
# NOTE(review): class_weight='balanced' is presumably here because the spam
# and ham counts differ - confirm against the dataset.
lgs = LogisticRegression(class_weight='balanced')
# fit() is the last expression in the cell, so the notebook echoes the fitted
# estimator's repr as the cell output.
lgs.fit(X_train, y_train) #training our model


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
# ------------------------------------------------------------------
# Evaluation of the baseline model on the held-out test split
# ------------------------------------------------------------------

# Hard class predictions feed precision, recall and F1.
predicted = lgs.predict(X_test)

# The ROC curve is built from the probability assigned to the class
# labelled 1 (second column of predict_proba).
spam_scores = lgs.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, spam_scores)
auc = metrics.auc(fpr, tpr)

# (format string, value) pairs, printed in the listed order.
report = [
    ("Accuracy: %f", lgs.score(X_test, y_test)),
    ("Precision: %f", metrics.precision_score(y_test, predicted)),
    ("Recall: %f", metrics.recall_score(y_test, predicted)),
    ("F1-Score: %f", metrics.f1_score(y_test, predicted)),
    ("AUC: %f", auc),
]
for template, value in report:
    print(template % value)

Accuracy: 0.946429
Precision: 0.950437
Recall: 0.965926
F1-Score: 0.958119
AUC: 0.987108


Using LZJD's 'false seen probability', we can sometimes improve the accuracy and robustness of our models by over-sampling the training data. For most ML techniques, over-sampling like this has no benefit. With LZJD it works because, on each sampling, LZJD will produce a "different but valid" representation of the same file. 

In [9]:
# Split the raw file paths (not the vectors) with the same test fraction and
# seed as before, so the training files can be vectorized several times.
paths_train, paths_test, y_train, y_test = train_test_split(
    all_paths, y, test_size=0.2, random_state=42
)

# Each training path appears ten times in the list handed to vectorize();
# with false_seen_prob set, every pass over a file is meant to yield a
# different but valid representation of it.
repeated_train_paths = paths_train * 10
X_train = vectorize(repeated_train_paths, false_seen_prob=0.05)

# Test files are vectorized once, with vectorize()'s default settings.
X_test = vectorize(paths_test)


In [10]:
# Same model configuration as the baseline; rebinding `lgs` replaces the
# previously fitted model.
lgs = LogisticRegression(class_weight='balanced')
# y_train*10 repeats the label list ten times so it lines up with the ten
# copies of paths_train that X_train was built from.
# NOTE(review): this relies on y_train being a plain Python list (repetition);
# an array would have its values multiplied by 10 instead - confirm.
lgs.fit(X_train, y_train*10) #training our model

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Score the model currently bound to `lgs` on the test vectors.
# Hard class predictions feed precision, recall and F1.
predicted = lgs.predict(X_test)

# The ROC curve is built from the probability assigned to the class
# labelled 1 (second column of predict_proba).
positive_proba = lgs.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, positive_proba)
auc = metrics.auc(fpr, tpr)

# (format string, value) pairs, printed in the listed order.
summary_rows = [
    ("Accuracy: %f", lgs.score(X_test, y_test)),
    ("Precision: %f", metrics.precision_score(y_test, predicted)),
    ("Recall: %f", metrics.recall_score(y_test, predicted)),
    ("F1-Score: %f", metrics.f1_score(y_test, predicted)),
    ("AUC: %f", auc),
]
for fmt, metric_value in summary_rows:
    print(fmt % metric_value)

Accuracy: 0.956767
Precision: 0.953824
Recall: 0.979259
F1-Score: 0.966374
AUC: 0.991602
