import tensorflow as tf
import numpy as np
import scipy
import h5py
import os
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.python.keras.preprocessing  import  image 
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense,Conv2D, MaxPooling2D,LeakyReLU
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import TensorBoard,ModelCheckpoint
from tensorflow.python.keras.layers import BatchNormalization
from IPython.display import display
from tensorflow.python.keras.layers import ELU
from numpy.random import seed
from shutil import copyfile
from time import time
tf.set_random_seed(1)
seed(1)
from sklearn.naive_bayes import GaussianNB

# Dimensions every image is resized to before it is fed to the network.
image_width, image_height = 32, 32

# Resolve the working directory to an absolute path that ends with a path
# separator *before* changing into it.  The original relative value
# ("./trec07p", no trailing separator) made later concatenations such as
# WORK_DIRECTORY + "UserData/" evaluate to "./trec07pUserData/", and that
# relative path was then resolved from inside the new working directory.
WORK_DIRECTORY = os.path.join(os.path.abspath("./trec07p"), "")
os.chdir(WORK_DIRECTORY)

# Dataset locations, relative to WORK_DIRECTORY (the current directory now).
train_data_dir      = 'KaggleCatDog/train'
validation_data_dir = 'KaggleCatDog/validation'

# Output locations for the trained weights and the serialised architecture.
weightsPath= "weights/"
modelPath= "model/"
weightsFilePath= "weights/weightscatsdogs.h5"
modelFilePath= "model/modelcatsdogs.json"

# Directory whose images are classified after training.
path = WORK_DIRECTORY + "UserData/"

# Sample counts are only used to derive the number of steps per epoch.
nb_train_samples = 24158
nb_validation_samples = 1052
epochs =2
batch_size = 25
num_classes =2
# Input tensor shape passed to the first convolution: two spatial sizes + RGB.
input_shape = (image_width, image_height, 3)

def HouseKeeping(pathName):
    """Recreate ``pathName`` as an empty directory.

    Any existing directory tree at ``pathName`` is deleted first, then the
    directory is created again (including missing parents).

    ``pathName`` is resolved against the current working directory.  The
    script has already changed into ``WORK_DIRECTORY`` by the time this is
    called, and the weight/model files are written with paths relative to
    that directory, so the directories must be created there as well.  The
    previous ``WORK_DIRECTORY + pathName`` concatenation had no separator and
    produced e.g. ``./trec07pweights/``, leaving ``weights/`` missing.

    Args:
        pathName: Directory to reset, relative to the working directory
            (or absolute).
    """
    if os.path.exists(pathName):
        shutil.rmtree(pathName)
    os.makedirs(pathName)

# Random augmentations applied to training images only; validation images
# are merely rescaled so that the reported metrics stay comparable.
_augmentation = dict(horizontal_flip=True,
                     vertical_flip=True,
                     rotation_range=90.,
                     zoom_range=0.3,
                     shear_range=0.3)
train_datagen = ImageDataGenerator(rescale=1. / 255, **_augmentation)
test_datagen = ImageDataGenerator(rescale=1. / 255)

# Both generators read one sub-directory per class and yield binary labels.
# NOTE(review): Keras documents target_size as (height, width); passing
# (width, height) is harmless here only because both are 32.
_flow_options = dict(target_size=(image_width, image_height),
                     batch_size=batch_size,
                     class_mode='binary')

train_generator = train_datagen.flow_from_directory(train_data_dir,
                                                    **_flow_options)

validation_generator = test_datagen.flow_from_directory(validation_data_dir,
                                                        **_flow_options)

# Three Conv -> BatchNorm -> Activation -> MaxPool stages followed by a small
# dense head with a single sigmoid output (binary classification).
model = Sequential()

# conv2d_1: 32 filters of shape 3 x 3
# parameters = ((filter width * filter height * channels) + 1) * filters
#            = ((3 * 3 * 3) + 1) * 32 = 896
model.add(Conv2D(32, (3, 3), input_shape=input_shape))

# Batch normalisation keeps 4 values per filter: 32 * 4 = 128
model.add(BatchNormalization())
model.add(Activation('relu'))
# max_pooling2d_1
model.add(MaxPooling2D(pool_size=(2, 2)))

# conv2d_2: 64 filters of shape 3 x 3
# (3 * 3 * 32 + 1) * 64 = 18496
model.add(Conv2D(64, (3, 3)))

# 64 * 4 = 256
model.add(BatchNormalization())
model.add(Activation('relu'))
# max_pooling2d_2
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.5))
# conv2d_3: 128 filters of shape 3 x 3
# (3 * 3 * 64 + 1) * 128 = 73856
model.add(Conv2D(128, (3, 3)))

# 128 * 4 = 512
model.add(BatchNormalization())
model.add(Activation('elu'))
# max_pooling2d_3
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.5))
model.add(Flatten())

# With 32 x 32 inputs the flattened feature vector has 2 * 2 * 128 = 512
# values, so the dense layer has (512 + 1) * 100 = 51300 parameters.
model.add(Dense(100, activation='elu'))

# (100 + 1) * 1 = 101
model.add(Dense(1,   activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# NOTE(review): this callback is created but is not in the callbacks list
# passed to fit_generator further down, so no TensorBoard logs are written.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()


# Start from empty output directories for the weights and the architecture.
for _output_dir in (weightsPath, modelPath):
    HouseKeeping(_output_dir)

# Persist the architecture as JSON; the weights are saved by the checkpoint.
model_json = model.to_json()
with open(modelFilePath, "w") as json_file:
    json_file.write(model_json)

# Keep only the weights of the epoch with the highest validation accuracy.
# A per-epoch file name could be built by appending
# "{epoch:03d}-{val_acc:3f}.hdf5" to the path.
checkpointer = ModelCheckpoint(filepath=weightsFilePath,
                               monitor='val_acc',
                               mode='max',
                               save_best_only=True,
                               verbose=1)

# One epoch walks (approximately) once over each dataset.
_train_steps = nb_train_samples // batch_size
_validation_steps = nb_validation_samples // batch_size

with tf.device('/gpu:0'):
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=_train_steps,
        epochs=epochs,
        validation_data=validation_generator,
        validation_steps=_validation_steps,
        callbacks=[checkpointer],
        verbose=1)

# Training vs. validation loss per epoch.
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.plot(loss)
plt.plot(val_loss)
plt.legend(['loss','val_loss'])
plt.title("Loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.show()

# Training vs. validation accuracy per epoch.  The figure used to be titled
# and labelled "Loss" because the loss labels were pasted after the
# "Accuracy" title and overwrote it.
acc = history.history['acc']
val_acc = history.history['val_acc']
plt.plot(acc)
plt.plot(val_acc)
plt.legend(['acc','val_acc'])
plt.title("Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.show()

# Classify every image found in the user data directory and display it.
files = os.listdir(path)

for file in files:
    file_path = os.path.join(path, file)
    print(file_path)

    # Full-resolution copy, used only for display.
    t_image = image.load_img(file_path)

    # Network input: resized and scaled to [0, 1] exactly like the training
    # and validation generators (rescale=1/255).  Feeding raw 0-255 pixels
    # puts the input far outside the range the network was trained on.
    test_image = image.load_img(file_path, target_size=(image_height, image_width))
    test_image = image.img_to_array(test_image) / 255.
    test_image = np.expand_dims(test_image, axis=0)

    pred = model.predict_on_batch(test_image)
    print(pred)
    # The sigmoid output lies in (0, 1); 0.5 is the decision boundary.  The
    # previous ">= 1.0" test could only fire on a fully saturated output.
    if float(pred[0][0]) >= 0.5:
        print("Dog")
    else:
        print("Cat")

    plt.imshow(t_image)
    plt.show()

import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf 
import os

# Function to encode string features
def encode_features_and_labels(training, testing):
    """One-hot encode the string features and integer-encode ``attack_cat``.

    The ``proto``, ``service`` and ``state`` columns are expanded into dummy
    columns.  Dummy columns present in only one of the two frames are added
    to the other one (filled with 0) so both frames end up with the same
    columns in the same order, with ``attack_cat`` and ``label`` last.

    Args:
        training: Training dataframe.
        testing: Testing dataframe.

    Returns:
        Tuple ``(training, testing, le)`` with the transformed frames and the
        fitted ``LabelEncoder`` (usable for ``inverse_transform``).
    """
    categorical_columns = ['proto', 'service', 'state']

    # Creates new dummy columns from each unique string in a feature.
    training = pd.get_dummies(data=training, columns=categorical_columns)
    testing = pd.get_dummies(data=testing, columns=categorical_columns)

    # The two datasets do not contain the same set of protocols/states, so
    # the generated dummy columns differ; make them identical.
    traincols = list(training.columns.values)
    testcols = list(testing.columns.values)

    # Columns in training but not in testing.
    for col in traincols:
        if col not in testcols:
            testing[col] = 0
            testcols.append(col)
    # Columns in testing but not in training.
    for col in testcols:
        if col not in traincols:
            training[col] = 0
            traincols.append(col)

    # Move category and label to the end; same feature order in both frames.
    traincols.pop(traincols.index('attack_cat'))
    traincols.pop(traincols.index('label'))
    ordered = traincols + ['attack_cat', 'label']
    # .copy() so the assignments below write to independent frames.
    training = training[ordered].copy()
    testing = testing[ordered].copy()

    # Fit ONE encoder on the categories of both frames.  Re-fitting on the
    # test set (as before) gives a different name -> integer mapping whenever
    # the two sets do not contain exactly the same categories, and leaves the
    # returned encoder describing only the test set.
    le = LabelEncoder()
    le.fit(pd.concat([training['attack_cat'], testing['attack_cat']]))
    training['attack_cat'] = le.transform(training['attack_cat'])
    testing['attack_cat'] = le.transform(testing['attack_cat'])

    # Modified dataframes and the label vocabulary for inverse transform.
    return (training, testing, le)

# Training hyper-parameters
training_epochs = 20
batch_size = 9
start_rate = 0.0002  # initial learning rate

# Network dimensions
n_hidden_1 = 100  # neurons in the first hidden layer
n_hidden_2 = 50   # neurons in the second hidden layer
# NOTE(review): the original comment said "194 different features" while the
# value is 196 -- confirm against the width of the encoded dataframe.
n_features = 196
n_classes = 10    # 9 types of malicious packets + Normal

########### Defining tensorflow computational graph ###########

# Graph inputs
# Feature rows, one per packet.
X = tf.placeholder(dtype=tf.float32, shape=[None, n_features])
# Integer category of each row.
Y = tf.placeholder(dtype=tf.int32, shape=[None])
# Number of steps over which the learning rate decays.
decay_step = tf.placeholder(dtype=tf.int32)


# Create model
def deep_neural_network(x):
    """Build the classifier: two ReLU hidden layers and a linear output.

    Args:
        x: Input tensor holding the feature rows.

    Returns:
        Tensor of un-normalised class scores (logits), ``n_classes`` per row.
    """
    hidden = x
    # Hidden fully connected layers (n_hidden_1, then n_hidden_2 neurons).
    for units in (n_hidden_1, n_hidden_2):
        hidden = tf.layers.dense(hidden, units, activation=tf.nn.relu)
    # Output layer: one neuron per class, no activation.
    return tf.layers.dense(hidden, n_classes)

# Construct model
logits = deep_neural_network(X)

# Define loss and optimizer
# Converting categories into one hot labels
labels = tf.one_hot(indices=tf.cast(Y, tf.int32), depth=n_classes)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=labels))
# Counts the optimisation steps taken so far; drives the learning-rate decay.
global_step = tf.Variable(0, trainable=False)

# Learning rate with polynomial decay from start_rate to end_learning_rate.
starter_learning_rate = start_rate
end_learning_rate = 0.00005  # learning rate reached at the end of the decay
decay_steps = decay_step
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                          decay_steps, end_learning_rate,
                                          power=0.5)
# Using adam optimizer to reduce loss.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# global_step must be handed to minimize() so that it is incremented on every
# update; without it the counter stays at 0 and the learning rate never
# decays.
train_op = optimizer.minimize(loss_op, global_step=global_step)

# Initializing the variables
init = tf.global_variables_initializer()

# Model for testing
pred = tf.nn.softmax(logits)  # Apply softmax to logits

# Model for prediction: returns just the predicted category index.
prediction=tf.argmax(pred,1)

########## END of model ############

########## Reading and processing input datasets #########

# Default dataset files.
# NOTE(review): 'm-set.csv' does not follow the naming of the testing file --
# confirm this is the intended training set.
train_set = 'm-set.csv'
test_set = 'UNSW_NB15_testing-set.csv'

# Uncomment to ask for the dataset paths instead of using the defaults.
# train_set = input("Enter training dataset: ")
# test_set = input("Enter testing dataset: ")
# if not os.path.exists(train_set) or not os.path.exists(test_set):
# 	print("Files not found")
# 	exit()
# Read data using pandas
training = pd.read_csv(train_set, index_col='id')
testing = pd.read_csv(test_set, index_col='id')

# Encoding string columns
training, testing, le = encode_features_and_labels(training, testing)

# Min-max normalise the numerical features (the first 39 columns).
cols_to_normalise = list(training.columns.values)[:39]
# The scaling parameters come from the TRAINING set only and are applied to
# both sets, so a given raw value maps to the same input in train and test.
# (Scaling the test set with its own min/max shifts its features relative to
# what the network was trained on.)
col_min = training[cols_to_normalise].min()
# A constant column has max == min; use a range of 1 there so the column
# becomes 0 instead of NaN from a division by zero.
col_range = (training[cols_to_normalise].max() - col_min).replace(0, 1)
training[cols_to_normalise] = (training[cols_to_normalise] - col_min) / col_range
testing[cols_to_normalise] = (testing[cols_to_normalise] - col_min) / col_range

######## End of preprocessing #######

######## Training and testing #########

def get_accuracy(df):
    """Print binary-label and category accuracy of the model on ``df``.

    Must be called while a default TensorFlow session is active, because the
    predictions are obtained with ``prediction.eval``.

    Args:
        df: Dataframe whose last two columns are the encoded category and the
            binary label, and whose remaining columns are the features.
    """
    # One forward pass gives the predicted category of every row.
    categories = prediction.eval(feed_dict={X: df.iloc[:, 0:-2]})

    # Label accuracy: anything that is not "Normal" counts as malicious (1).
    # inverse_transform is applied to the whole array at once.
    predicted_names = np.asarray(le.inverse_transform(categories))
    binary_prediction = (predicted_names != "Normal").astype(categories.dtype)
    binary_labels = df.iloc[:, -1].values

    # The comparisons are done in NumPy: building tf.equal/tf.reduce_mean ops
    # here added new nodes to the graph on every call (once per epoch).
    result = np.mean(binary_prediction == binary_labels)
    print("Label accuracy: {:.2f}%".format(result*100))

    # Category accuracy: predicted category index vs. encoded true category.
    true_categories = df.iloc[:, -2].values
    result = np.mean(categories == true_categories)
    print("Category accuracy: {:.2f}%".format(result*100))

def train_and_test_model(training, testing):
    """Train the network on ``training`` and report accuracy on both sets.

    The training data is reshuffled every epoch and fed in mini-batches of
    ``batch_size`` rows.  After each epoch the average batch loss and the
    accuracy on ``testing`` are printed; after the last epoch the accuracy on
    both sets is printed.

    Args:
        training: Dataframe with the features followed by the category and
            label columns.
        testing: Dataframe with the same layout, used for evaluation.
    """
    total_data = training.index.shape[0]
    # Ceiling division: the real number of batches per epoch.  The previous
    # "total // batch_size + 1" counted one batch too many whenever the data
    # divided evenly, skewing both the average cost and the decay length.
    num_batches = -(-total_data // batch_size)
    total_steps = num_batches * training_epochs

    with tf.Session() as sess:
        sess.run(init)

        # Training cycle
        for epoch in range(training_epochs):
            # Shuffling dataset before training
            df = training.sample(frac=1)
            avg_cost = 0.
            # Loop over all batches
            for start in range(0, total_data, batch_size):
                batch = df.iloc[start:start + batch_size]
                batch_x = batch.iloc[:, 0:-2].values
                batch_y = batch.iloc[:, -2].values  # Last two columns are categories and labels
                # Run optimization op and cost op (to get loss value)
                _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x,
                                                                Y: batch_y,
                                                                decay_step: total_steps})
                # Compute average loss
                avg_cost += c / num_batches
            # Display logs per epoch step
            print("Epoch: {:04} | Cost={:.9f}".format(epoch+1, avg_cost))
            get_accuracy(testing)
            print()
        print("Training complete")

        print("Training results: ")
        get_accuracy(training)
        print("Testing results: ")
        get_accuracy(testing)


# Train the model; the data is shuffled at the start of every epoch.
train_and_test_model(training, testing)

# In [None]:
"""Spam filter using Naive Bayes classifier"""


import email.parser 
import os, sys, stat
from tqdm import tqdm
import re, cgi
import math, pickle
from decimal import Decimal
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import pandas as pd
import numpy as np

def extract_content(filename):
    """Return the subject and body of the mail stored in ``filename``.

    The result is ``"<subject>\\n<payload>"``.  For a multipart message only
    the first part is used, serialised to bytes, so its textual form is the
    ``b'...'`` representation of those bytes.
    """
    with open(filename, 'rb') as fp:
        raw_message = fp.read()
    msg = email.message_from_bytes(raw_message)
    sub = msg.get('subject')

    if not msg.is_multipart():
        payload = msg.get_payload()
    else:
        # get_payload() returns the list of parts for multipart messages.
        first_part = msg.get_payload()[0]
        payload = first_part.as_bytes()
    return "{}\n{}".format(sub, payload)

def get_text_from_email(mail):
    """Strip HTML tags, punctuation and digits from ``mail`` and lower-case it.

    Args:
        mail: Mail text as produced by ``extract_content``.

    Returns:
        The cleaned, lower-cased text.
    """
    # cgi.escape() was removed in Python 3.8; html.escape(..., quote=False)
    # performs the same replacement of "&", "<" and ">".
    from html import escape

    tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')

    # Remove well-formed tags, fixing mistakes by legitimate users
    mail = tag_re.sub('', mail)

    # Clean up anything else by escaping
    mail = escape(mail, quote=False)

    # Literal backslash escapes (e.g. the two characters "\" "n") left over
    # from the bytes representation of a payload.  Note that "|" inside the
    # character class is matched literally as well.
    mail = re.sub(r'([\\][n|t|x])', ' ', mail)
    mail = re.sub(r'[=*/&;.,/\" ?:<>\[\]\(\)\{\}\|%#`~\\]', ' ', mail)   # Removes punctuation
    mail = re.sub(r'[- _=+]{2,}|(?=\s)[-_]|[-_](?=\s)', ' ', mail)       # Removes unnecessary hyphens and underscores
    mail = re.sub(r'[\d]', ' ', mail)                                    # Removes all digits
    mail = re.sub(r'[\'!=+]', '', mail)                                  # Deletes these characters outright
    return mail.lower()


def preprocess(mail):
    """Return ``mail`` after all preprocessing steps.

    Text clean-up via ``get_text_from_email`` is the only step at present.
    """
    return get_text_from_email(mail)


def add_words_to_dict(word_set, word_dict, ham):
    """Count one occurrence of every word in ``word_set``.

    Words not yet in ``word_dict`` get an entry with both counters at zero;
    then the ham counter (``ham`` truthy) or the spam counter is increased by
    one.  ``word_dict`` is modified in place.
    """
    counter = 'ham_count' if ham else 'spam_count'
    for word in word_set:
        entry = word_dict.setdefault(word, {'spam_count': 0, 'ham_count': 0})
        entry[counter] += 1

def calculate_spaminess(word, word_dict, total_ham, total_spam):
    """Store the probability that a mail containing ``word`` is spam.

    The result is written to ``word_dict[word]['spaminess']``.  The raw
    estimate is pulled towards 0.3 for rare words (strength factor 2), so the
    stored value is always strictly between 0 and 1.

    Args:
        word: Key of the entry in ``word_dict`` to update.
        word_dict: Mapping word -> {'spam_count', 'ham_count', ...}.
        total_ham: Number of ham mails seen during training.
        total_spam: Number of spam mails seen during training.
    """
    pr_s, pr_h = 0.5, 0.5  # Assuming equal prior probability for ham and spam
    threshold = 2          # Strength factor to handle rare words

    spam_count = word_dict[word]['spam_count']
    ham_count = word_dict[word]['ham_count']
    total_occurance = spam_count + ham_count  # Occurrences in ham and spam together

    # A class without any training mail contributes a frequency of 0 instead
    # of raising ZeroDivisionError.
    freq_s = spam_count / total_spam if total_spam else 0.0
    freq_h = ham_count / total_ham if total_ham else 0.0

    # Probability that a mail is spam given that this word is present.
    evidence = freq_s * pr_s + freq_h * pr_h
    spamminess = (freq_s * pr_s) / evidence if evidence else 0.0

    # Correct for rare words using the strength factor.
    corrected_spaminess = (0.3 * threshold + total_occurance * spamminess) / (threshold + total_occurance)
    word_dict[word]['spaminess'] = corrected_spaminess

def generate_dictionary(files, labels):
    """Build the word dictionary from the training mails and pickle it.

    Every distinct word of every readable mail is counted once per mail in
    the ham or spam counter, then a spaminess value is computed per word.
    The dictionary is also written to ``word_dict.pickle``.

    Args:
        files: Paths of the training mails.
        labels: One label per file, in the same order.  ``"ham"`` (or ``"1"``,
            the code ``predict`` emits for ham) marks a ham mail; any other
            label is treated as spam.

    Returns:
        Mapping word -> {'spam_count', 'ham_count', 'spaminess'}.
    """
    ham_labels = ("ham", "1")
    word_dict = {}
    total_spam = 0
    total_ham = 0

    for index, file in enumerate(tqdm(files)):
        # Read and extract mail contents.  An unreadable mail is skipped
        # entirely: previously execution continued with the text of the
        # previous mail (or an undefined name for the very first file).
        try:
            mail = extract_content(file)
        except Exception:
            print("Corrupted File {}" . format(file))
            continue

        # Prepare data
        word_set = set(preprocess(mail).split())

        # Incrementing HAM/SPAM count.  The flag used to be set when the
        # label was "spam", which swapped the two classes.
        ham = labels[index].split()[0] in ham_labels
        if ham:
            total_ham += 1
        else:
            total_spam += 1

        add_words_to_dict(word_set, word_dict, ham)

    for word in word_dict:
        calculate_spaminess(word, word_dict, total_ham, total_spam)
    with open('word_dict.pickle', 'wb') as f:
        pickle.dump(word_dict, f)
    return word_dict

def get_scores(expected, predicted):
    """Return evaluation metrics for ``predicted`` against ``expected``.

    The result maps 'False Positives', 'False Negatives', 'Precision',
    'Recall' and 'F_score' to their values; precision, recall and F-score are
    macro averages.
    """
    # The first and last confusion-matrix cells are not reported.
    _, false_positives, false_negatives, _ = confusion_matrix(expected, predicted).ravel()
    # The fourth value (support) is not reported either.
    precision, recall, f_score, _ = precision_recall_fscore_support(
        expected, predicted, average='macro')
    return {
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision': precision,
        'Recall': recall,
        'F_score': f_score,
    }

def training(files, labels):
    """Return the word dictionary, loading the cached one when available.

    If ``word_dict.pickle`` exists in the current directory and can be loaded
    it is used as is and ``files``/``labels`` are ignored; otherwise the
    dictionary is generated from them.

    Args:
        files: Paths of the training mails.
        labels: One label per training mail.

    Returns:
        The word dictionary.
    """
    try:
        with open('word_dict.pickle', 'rb') as f:
            # NOTE: unpickling can execute arbitrary code -- only keep a
            # word_dict.pickle in this directory if it was produced locally.
            word_dict = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # No cache, or an unreadable/corrupt one: generate the dictionary.
        # Only these failures are handled; a bare "except" also swallowed
        # KeyboardInterrupt and programming errors.
        word_dict = generate_dictionary(files, labels)
    else:
        # Reported only once the cache has actually been loaded.
        print("Found pickle file. Skipping training")

    return word_dict

def predict(files, word_dict):
    """Predict a label for every mail in ``files``.

    Args:
        files: Paths of the mails to classify.
        word_dict: Mapping word -> entry with a 'spaminess' value.

    Returns:
        List with one prediction per file, in order: ``'0'`` when the
        combined spam probability exceeds 0.8, ``'1'`` otherwise.
    """
    # Decimal is used for the exponentiation so that large exponents do not
    # overflow a float.
    euler = Decimal(math.e)
    predictions = []
    for file in tqdm(files):
        # Read and extract mail contents.  An unreadable mail is classified
        # as an empty one so the result stays aligned with ``files``;
        # previously the text of the previous mail was reused (or an
        # undefined name was hit for the very first file).
        try:
            mail = extract_content(file)
        except Exception:
            print("Corrupted File {}" . format(file))
            mail = ""

        # Prepare data
        word_set = set(preprocess(mail).split())

        # Adding up the log-odds of all informative words.
        n = 0
        for word in word_set:
            entry = word_dict.get(word)
            if entry is None:
                continue        # Ignore words not seen during training
            spaminess = entry['spaminess']
            if 0.4 < spaminess < 0.6:
                continue        # Ignore the word if its spaminess is neutral
            n += (math.log(1-spaminess) - math.log(spaminess))
        probability = 1 / (1 + euler ** Decimal(n))

        # Predicting
        if probability > 0.8:
            prediction = '0'
        else:
            prediction = '1'
        predictions.append(prediction)
    return predictions


def main():
    """Train the spam filter, score it and write ``NBresults.txt``.

    The first ``train_size`` mails (in sorted file-name order) form the
    training set; the remaining mails form the test set.  Scores are printed
    for the training set, the test set and both combined.

    Raises:
        ValueError: If the number of labels differs from the number of mails.
    """
    train_size = 3000

    # Default paths for all the inputs; the user is asked for them when they
    # do not exist relative to the current directory.
    train = './trec07p/data'
    test = './trec07p/data'
    spam = './trec07p/full'

    # Getting user input if defaults are not valid
    print("Please make sure the script is in the same directory as the Training and testing folders.")
    if not (os.path.isdir(train) and os.path.isdir(test) and os.path.exists(spam)):
        print("Testing and training datasets not found: ")
        train = input("Enter training dataset path: ")
        test = input("Enter testing dataset path: ")
        spam = input("Enter labels file path: ")

    # Getting training and testing files.
    # NOTE(review): sorted() is lexicographic ("x.10" sorts before "x.2") and
    # labels are matched to files purely by position -- confirm the labels
    # file lists the mails in this same order.
    train_listing = sorted(os.path.join(train, name) for name in os.listdir(train))
    train_files = train_listing[:train_size]
    if os.path.abspath(train) == os.path.abspath(test):
        # One shared directory: the test set is whatever is left after the
        # training mails.  Listing the directory a second time put the
        # training mails into the file list twice, so it no longer matched
        # the labels.
        test_files = train_listing[train_size:]
    else:
        test_files = sorted(os.path.join(test, name) for name in os.listdir(test))
    files = train_files + test_files
    n_train = len(train_files)
    print("Found the datasets.")

    # Spam labels: first field of every non-blank line.
    with open(spam, 'r') as f:
        raw_labels = [line.split()[0] for line in f if line.strip()]
    if len(raw_labels) != len(files):
        raise ValueError("Found {} labels for {} mails".format(len(raw_labels), len(files)))

    # predict() answers '0' (spam) or '1' (ham); textual labels are converted
    # to the same codes so labels and predictions can be compared.
    label_codes = {'spam': '0', 'ham': '1'}
    labels = [label_codes.get(label.lower(), label) for label in raw_labels]
    train_labels = labels[:n_train]
    test_labels = labels[n_train:]

    # Training our model (with the labels exactly as read from the file).
    print("Training the model...")
    word_dict = training(train_files, raw_labels[:n_train])

    # Predicting labels for both training and testing data.
    print("Testing on both training and testing datasets...")
    predictions = predict(files, word_dict )
    train_predictions = predictions[:n_train]
    test_predictions = predictions[n_train:]

    # Get respective scores
    test_scores = get_scores(test_labels, test_predictions)
    train_scores = get_scores(train_labels, train_predictions)
    combined_scores = get_scores(labels, predictions)

    # Output results onto the console
    for heading, scores in (("\nTraining Scores:", train_scores),
                            ("\nTesting Scores: ", test_scores),
                            ("\nCombined Scores: ", combined_scores)):
        print(heading)
        for key, value in sorted(scores.items()):
            print("{:15} : {:.5}" .format(key, float(value)))

    # Creating a results file. Pandas object is used to help format our output.
    mails = {}
    mails['files'] = [os.path.split(file)[1] for file in files]
    mails['labels'] = labels
    mails['predictions'] = predictions
    df = pd.DataFrame(mails)
    df['result'] = np.where(df['predictions'] == df['labels'], "CORRECT", "WRONG")
    df.set_index('files', inplace=True)
    with open('NBresults.txt', 'w') as f:
        f.write(df.to_string())
    print("Results file created: {}" . format(os.path.abspath('NBresults.txt')))


if __name__ == "__main__":
    main()