# Import essential libraries

In [1]:
import numpy as np

# Load the data and split it into features and labels

In [2]:
def load_data(file):
	"""Read a comma-separated numeric file and split it into features and labels.

	Every non-blank line must hold at least 58 comma-separated numbers:
	columns 0-56 are returned as the features and column 57 as the label.
	Blank lines (for example a trailing empty line at the end of the file)
	are ignored.

	Parameters
	----------
	file : str or path-like
		Path of the text file to read.

	Returns
	-------
	dataset : numpy.ndarray
		float32 array holding columns 0-56 of every row.
	label : numpy.ndarray
		int8 array holding column 57 of every row.
	"""
	rows = []
	with open(file, 'r') as f:
		# Iterate the file lazily instead of materialising it with readlines().
		for line in f:
			line = line.strip()
			# A blank line would turn into [''] and break the float conversion below.
			if not line:
				continue
			rows.append(line.split(','))
	# The fields are still strings here; astype() parses them as numbers.
	rows = np.array(rows).astype(np.float32)
	dataset = rows[..., 0:57]
	label = rows[..., 57].astype(np.int8)
	return dataset, label

# Preprocessing the data
### The features have very different value ranges, so we rescale every feature to [0, 1] and then L2-normalize the data.

In [3]:
def preprocessing(data):
	"""Min-max scale `data` to [0, 1] and then L2-normalize it.

	Parameters
	----------
	data : array-like
		Feature matrix; it is passed straight to scikit-learn's
		MinMaxScaler and normalize.

	Returns
	-------
	The transformed data as returned by sklearn.preprocessing.normalize.

	NOTE(review): the scaler is fitted on whatever `data` is passed in. If
	that is the full dataset and cross-validation happens afterwards, the
	held-out folds have influenced the scaling -- confirm this is acceptable.
	"""
	# Aliased so the imported module does not shadow this function's own name.
	from sklearn import preprocessing as sk_pre

	### rescale the data to [0, 1]
	rescaled = sk_pre.MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
	### PCA (n_components = 57) was tried at this point and showed no
	### improvement, so it is not applied.
	### normalization
	return sk_pre.normalize(rescaled, norm='l2')

# Support Vector Machine Classifier
### The linear kernel gives the best result.

In [4]:
def svm_model():
	"""Create a new, unfitted support-vector classifier.

	Returns
	-------
	sklearn.svm.SVC
		Built with C=1.5, kernel='linear' and gamma='auto'.
	"""
	from sklearn.svm import SVC

	# gamma is kept as originally configured; scikit-learn documents it as the
	# coefficient of the 'rbf', 'poly' and 'sigmoid' kernels.
	settings = {'C': 1.5, 'kernel': 'linear', 'gamma': 'auto'}
	return SVC(**settings)

# Multi-layer Perceptron Classifier
### Performs better than the SVM classifier.

In [5]:
def NN_model(random_state=None):
	"""Create a new, unfitted multi-layer perceptron classifier.

	Parameters
	----------
	random_state : int, numpy RandomState or None, default=None
		Forwarded to MLPClassifier. Pass an integer to make training
		reproducible between runs; the default None keeps the previous
		unseeded behaviour.

	Returns
	-------
	sklearn.neural_network.MLPClassifier
		Built with ReLU activation, the Adam solver, alpha=0.0001,
		max_iter=200 and learning_rate_init=0.001.
	"""
	from sklearn.neural_network import MLPClassifier
	model = MLPClassifier(activation='relu', solver='adam', alpha=0.0001, max_iter=200,
						learning_rate_init=0.001, random_state=random_state)
	return model

# Print the table of Accuracy, Precision, Recall, FP Rate, FN Rate, Overall Error Rate and Average Error Rate

In [6]:
def table(cv_results, k, d):
	"""Print a table of per-fold classification metrics.

	Parameters
	----------
	cv_results : mapping
		Must provide the keys 'test_Accuracy', 'test_Precision',
		'test_Recall', 'test_tp', 'test_fp', 'test_tn' and 'test_fn',
		each holding one value per fold (numpy arrays are expected, since
		the values are combined arithmetically).
	k : int
		Number of times the average error is repeated in the "Average ER"
		column; it should equal the number of folds.
	d : int
		Number of decimals every value is rounded to.
	"""
	true_pos = cv_results['test_tp']
	false_pos = cv_results['test_fp']
	true_neg = cv_results['test_tn']
	false_neg = cv_results['test_fn']
	from prettytable import PrettyTable

	### the overall error rate is the fraction of overall examples that are misclassified.
	error_rate = 1 - cv_results['test_Accuracy']
	per_fold_columns = [
		("Accuracy", cv_results['test_Accuracy']),
		("Precision", cv_results['test_Precision']),
		("Recall", cv_results['test_Recall']),
		### fraction of non-spam testing examples that are misclassified as spam
		("FP Rate", false_pos / (false_pos + true_neg)),
		### fraction of spam testing examples that are misclassified as
		### non-spam; equals 1 - recall
		("FN Rate", false_neg / (false_neg + true_pos)),
		("Overall Error Rate", error_rate),
	]

	report = PrettyTable()
	for heading, values in per_fold_columns:
		report.add_column(heading, np.around(values, decimals=d))
	# The mean error over all folds is a single number, repeated on every row.
	mean_error = np.round(np.average(error_rate), decimals=d)
	report.add_column("Average ER", [mean_error] * k)
	print(report)

# Save Model

In [7]:
def save_model(model):
	"""Serialize `model` with joblib to the file 'model.pickle'.

	The path is relative, so the file is written to the current working
	directory.

	Parameters
	----------
	model : object
		The object to persist, typically a fitted estimator.
	"""
	try:
		# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
		# removed in 0.23; the standalone package is the supported import.
		import joblib
	except ImportError:
		# Fallback for old scikit-learn installs that still vendor joblib.
		from sklearn.externals import joblib
	joblib.dump(model, 'model.pickle')

# Main Program

In [9]:
def main():
	"""Cross-validate the chosen classifier on "spambase.data" and print the metrics.

	Loads and preprocesses the data, runs 10-fold cross-validation with
	accuracy, precision, recall and the four confusion-matrix counts as
	scorers, and prints the per-fold results through table().
	"""
	X, y = load_data("spambase.data")
	# NOTE(review): preprocessing() fits its scaler on the full dataset before
	# the folds are split, so held-out samples influence the scaling. Moving
	# the scaling into a Pipeline would avoid this -- confirm whether needed.
	X = preprocessing(X)
	### k-fold
	k = 10
	### choose from svm and NN
	model = svm_model()
	# model = NN_model()
	from sklearn.model_selection import cross_validate
	from sklearn.metrics import make_scorer
	from sklearn.metrics import confusion_matrix

	# Pin the matrix layout to the classes of the whole label vector. Without
	# `labels`, a fold that happens to contain a single class yields a 1x1
	# matrix and indexing [1, 1] raises IndexError.
	class_labels = np.unique(y)

	def count_scorer(row, col):
		"""Return a scorer that reports one cell of the confusion matrix."""
		def count(y_true, y_pred):
			return confusion_matrix(y_true, y_pred, labels=class_labels)[row, col]
		return make_scorer(count)

	### compute tn, fp, fn, tp (rows are true classes, columns are predictions)
	scoring = {'Accuracy': 'accuracy', 'Precision': 'precision', 'Recall': 'recall',
				'tp': count_scorer(1, 1), 'tn': count_scorer(0, 0),
				'fp': count_scorer(0, 1), 'fn': count_scorer(1, 0)}
	### With an integer `cv` and a classifier, cross_validate uses stratified
	### folds: the folds preserve the percentage of samples of each class.
	cv_results = cross_validate(model, X, y, scoring = scoring, cv = k, n_jobs = -1,
						return_train_score=False)
	table(cv_results, k, 5)

	### save model
	### cross_validate fits clones of `model`, so `model` itself is still
	### unfitted here; fit it on the full data before saving it.
	# model.fit(X, y)
	# save_model(model)
	
# Run the experiment only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == '__main__':
	main()

+----------+-----------+---------+---------+---------+--------------------+------------+
| Accuracy | Precision |  Recall | FP Rate | FN Rate | Overall Error Rate | Average ER |
+----------+-----------+---------+---------+---------+--------------------+------------+
| 0.92842  |  0.93064  | 0.88462 | 0.04301 | 0.11538 |      0.07158       |   0.0748   |
| 0.94143  |   0.9235  | 0.92857 | 0.05018 | 0.07143 |      0.05857       |   0.0748   |
| 0.93709  |   0.9322  | 0.90659 | 0.04301 | 0.09341 |      0.06291       |   0.0748   |
| 0.93913  |  0.93714  | 0.90608 | 0.03943 | 0.09392 |      0.06087       |   0.0748   |
| 0.94348  |   0.9235  |  0.9337 | 0.05018 |  0.0663 |      0.05652       |   0.0748   |
|  0.9413  |   0.885   |  0.9779 | 0.08244 |  0.0221 |       0.0587       |   0.0748   |
| 0.95217  |  0.97041  | 0.90608 | 0.01792 | 0.09392 |      0.04783       |   0.0748   |
| 0.93696  |  0.91304  | 0.92818 | 0.05735 | 0.07182 |      0.06304       |   0.0748   |
| 0.87582  |    0.81 