# Basic Machine Learning for Cybersecurity

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Raw string: in a normal literal the "\U" of "C:\Users" is parsed as a unicode
# escape and the whole cell fails with a SyntaxError.
df = pd.read_csv(r'C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\north_korea_missile_test_database.csv')
y = df['Missile Name']
x = df.drop("Missile Name", axis = 1)


# train_test_split returns (x_train, x_test, y_train, y_test): both feature
# splits first, then both label splits.
# Step 1: hold out 20% of the rows as the test set.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 31)
# Step 2: carve a validation set (20% of the remaining rows) out of the training data.
xTrain, xVal, yTrain, yVal = train_test_split(xTrain, yTrain, test_size = 0.2, random_state = 31)

In [None]:
# Standardizing Data
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Raw string keeps the Windows backslashes literal; `sep` is a keyword argument
# of read_csv, not part of the file name.
data = pd.read_csv(r'C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\file_pe_headers.csv', sep = ",")
# Drop the identifier and the label so only the feature columns are scaled.
x = data.drop(['Name', 'Malware'], axis = 1).to_numpy()
xStand = StandardScaler().fit_transform(x)
print(xStand)

In [None]:
# Start by importing lib and reading the dataset
from sklearn.decomposition import PCA
import pandas as pd
# Raw string: "\U" and "\f" would otherwise be interpreted as escape sequences.
data = pd.read_csv(r'C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\file_pe_headers.csv', sep = ",")
x = data.drop(['Name', 'Malware'], axis = 1).to_numpy()

# Standardize the dataset before using PCA
from sklearn.preprocessing import StandardScaler
x_std = StandardScaler().fit_transform(x)

pca = PCA()
# Only the fitted model is needed here; the projected data is not kept.
pca.fit(x_std)
print(pca.explained_variance_ratio_)
# Share of the total variance captured by the first 40 principal components.
sum(pca.explained_variance_ratio_[0:40])

In [None]:
# Generate a markov chain
%pip install markovify
import markovify
import pandas as pd
from itertools import chain

df = pd.read_csv('C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\airport_reviews.csv')
n = 100
review_subset = df['content'][0:n]
text = "".join(chain.from_iterable(review_subset))
mark = markovify.Text(text)

for i in range(5):
    print(mark.make_sentence())
    
for i in range(3):
    print(mark.make_short_sentence(140))

In [None]:
# Performing Clustering using scikit-learn
import pandas as pd
import plotly.express as px

# Raw string: "\U" and "\f" would otherwise be interpreted as escape sequences.
df = pd.read_csv(r'C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\file_pe_headers.csv', sep = ",")
# Ground truth view: colour by the "Malware" label (the same column that is
# read as `df["Malware"]` below -- no leading space in the name).
fig = px.scatter_3d(df, 
                    x = 'SuspiciousImportFunctions', 
                    y = 'SectionLength',
                    z = "SuspiciousNameSection",  
                    color = "Malware")
fig.show()

y = df["Malware"]
x = df.drop(["Name", "Malware"], axis = 1).to_numpy()

from sklearn.cluster import KMeans
# One cluster per distinct label value.
est = KMeans(n_clusters= len(set(y)))
est.fit(x)

ypred = est.predict(x)
df["pred"] = ypred
# Categorical dtype makes plotly use discrete colours instead of a gradient.
df["pred"] = df["pred"].astype("category")

# Same projection, now coloured by the cluster assignment.
fig = px.scatter_3d(df, 
                    x = 'SuspiciousImportFunctions', 
                    y = 'SectionLength',
                    z = "SuspiciousNameSection",  
                    color = "pred")
fig.show()

In [None]:
# Training an XGBoost Classifier
import pandas as pd
# Raw string: "\U" and "\f" would otherwise be interpreted as escape sequences.
df = pd.read_csv(r'C:\Users\kenne\OneDrive\Desktop\Data Science-Development\DataScienceBootcamp\file_pe_headers.csv', sep = ",")
y = df["Malware"]
x = df.drop(["Name","Malware"], axis=1).to_numpy()

from sklearn.model_selection import train_test_split
# 70/30 train/test split (no fixed seed, so the split differs between runs).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

from xgboost import XGBClassifier
XGB_model = XGBClassifier()
XGB_model.fit(x_train, y_train)

from sklearn.metrics import accuracy_score
y_test_pred = XGB_model.predict(x_test)
accuracy = accuracy_score(y_test, y_test_pred)
# accuracy_score returns a fraction; scale it to a percentage for display.
print("Accuracy %.2f%%" %(accuracy * 100))

In [None]:
# Analyzing Time Series using Statsmodels
%pip installs statsmodels scipy
from random import random
time_series = [2*x+random() for i in range(1,100)]

import matplotlib.pyplot as plt
plt.plot(time_series)
plt.show()

from statsmodels.tsa.ar_model import AR
model = AR(time_series)
model_fit = model.fit()
y = model_fit.predict(len(time_series), len(time_series))

from statsmodel.tsa.arima_model import ARIMA
model = ARIMA(time_series, order = (0, 1))
model_fit= model.fit(disp = False)
y = model_fit.predict(len(time_series), len(time_series))
print(y)

from statsmodel.tsa.holwinters import SimpleExpSmoothing
model = SimpleExpSmoothing(time_series)
model_fit = model.fit()
y = model_fit.predict(len(time_series), len(time_series))
print(y)

In [None]:
# Anomaly detection with Isolation Forest Implementation
import numpy as np
import pandas as pd

# Single seeded generator; the draw order (train, test, outliers) is what makes
# the three data sets reproducible.
random_seed = np.random.RandomState(12)
_COLUMNS = ['x', 'y']


def _two_clusters(rng, n_points = 500):
    """Return 2*n_points rows: one Gaussian blob (std 0.5) around the origin
    and the same blob shifted by +3 on both axes (shifted copy first)."""
    blob = 0.5 * rng.randn(n_points, 2)
    return pd.DataFrame(np.concatenate([blob + 3, blob]), columns = _COLUMNS)


# "Normal" observations for fitting and for testing.
x_train = _two_clusters(random_seed)
x_test = _two_clusters(random_seed)

# Anomalies: 50 points spread uniformly over the square [-5, 5) x [-5, 5).
x_outliers = pd.DataFrame(
    random_seed.uniform(low=-5, high=5, size=(50,2)),
    columns = _COLUMNS,
)

import matplotlib.pyplot as plt
# Overview of the three data sets. Each scatter must plot its own frame,
# otherwise the legend describes data that is not on the figure.
p1 = plt.scatter(x_train['x'], x_train['y'], c = 'white', s = 50, edgecolor = "black")
p2 = plt.scatter(x_test['x'], x_test['y'], c = 'green', s = 50, edgecolor = "black")
p3 = plt.scatter(x_outliers['x'], x_outliers['y'], c = 'blue', s = 50, edgecolor = "black")

plt.xlim(-6, 6)
plt.ylim(-6, 6)
plt.legend(
    [p1, p2, p3],
    ['Training set', 'Normal Testing set', 'Anomalous Testing Data'],
    loc = 'lower left')

plt.show()

from sklearn.ensemble import IsolationForest
clf = IsolationForest()
# The forest has to be fitted on the normal training data before predict()
# can be called; an unfitted estimator raises NotFittedError.
clf.fit(x_train)
# predict() returns +1 for inliers and -1 for outliers.
y_pred_train = clf.predict(x_train)
y_pred_test = clf.predict(x_test)
y_pred_outliers = clf.predict(x_outliers)

x_outliers = x_outliers.assign(pred = y_pred_outliers)
x_outliers.head()

# --- How the forest labelled the true anomalies ---
# pred == -1: flagged as outlier (correct); pred == 1: missed anomaly.
p1 = plt.scatter(x_train['x'], x_train['y'], c = 'white', s = 50, edgecolor = "black")
p2 = plt.scatter(x_outliers.loc[x_outliers['pred'] == -1, 'x'], 
                 x_outliers.loc[x_outliers['pred'] == -1, 'y'],
                 c = "Blue", s = 50, edgecolor = "black")

p3 = plt.scatter(x_outliers.loc[x_outliers['pred'] == 1, 'x'], 
                 x_outliers.loc[x_outliers['pred'] == 1, 'y'],
                 c = "red", s = 50, edgecolor = "black")

plt.xlim(-6, 6)
plt.ylim(-6, 6)
plt.legend(
    [p1, p2, p3],
    ['Training obervations', 'Detected outliners', 'Incorrectly labled outliners'],
    loc = 'lower left')

plt.show()

x_test = x_test.assign(pred = y_pred_test)
x_test.head()


# --- How the forest labelled the normal test data ---
# Here the meaning of the prediction is reversed compared with the plot above:
# every test point is normal, so pred == -1 is a false alarm and pred == 1 is
# the correct label. The legend has to say so.
p1 = plt.scatter(x_train['x'], x_train['y'], c = 'white', s = 50, edgecolor = "black")
p2 = plt.scatter(x_test.loc[x_test['pred'] == -1, 'x'], 
                 x_test.loc[x_test['pred'] == -1, 'y'],
                 c = "Blue", s = 50, edgecolor = "black")

p3 = plt.scatter(x_test.loc[x_test['pred'] == 1, 'x'], 
                 x_test.loc[x_test['pred'] == 1, 'y'],
                 c = "red", s = 50, edgecolor = "black")

plt.xlim(-6, 6)
plt.ylim(-6, 6)
plt.legend(
    [p1, p2, p3],
    ['Training observations',
     'Normal test observations flagged as anomalies',
     'Correctly labeled normal test observations'],
    loc = 'lower left')

plt.show()

In [None]:
# Natural Language Processing using a Hashing Vectorizer and tf-idf Introduction
# NOTE(review): despite the title, the code below counts n-grams with
# CountVectorizer rather than hashing them.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Every line of the text file is treated as one document.
with open("anonops_short.txt", encoding = "utf8") as f:
    anonops_short = f.readlines()

# Unigram + bigram counts per document.
my_vector = CountVectorizer(input = "content", ngram_range = (1, 2))
x_train_counts = my_vector.fit_transform(anonops_short)

# Learn the idf weights from the counts, then re-weight the same matrix.
tf_transfomer = TfidfTransformer(use_idf = True,)
tf_transfomer.fit(x_train_counts)
x_train_tf = tf_transfomer.transform(x_train_counts)

# Shows the raw count matrix (not the tf-idf weighted one).
print(x_train_counts)

# Detecting Email Cybersecurity Threats with AI

In [None]:
# Simple Perceptron-based spam filter
import pandas as pd
import numpy as np

df = pd.read_csv('spam.csv')
# Label is the first column: 'spam' -> -1, everything else -> +1.
y = df.iloc[:, 0].values
y = np.where(y == 'spam', -1, 1)
# Single feature: the second column (the 1:2 slice keeps the array 2-D).
x = df.iloc[:, 1:2].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=0)

from sklearn.linear_model import Perceptron
# The learning-rate parameter of scikit-learn's Perceptron is called `eta0`.
p = Perceptron(max_iter = 40, eta0 = 0.1, random_state = 0)
p.fit(x_train, y_train)

y_pred = p.predict(x_test)

from sklearn.metrics import accuracy_score
# Format first, then print: calling .sum() on the result of print() fails
# because print() returns None.
print("Misclassified samples: %d" % (y_test != y_pred).sum())
print("Accuracy: %.2f" %(accuracy_score(y_test, y_pred)))

In [None]:
# Spam Detection with SVMs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('spam.csv')
# Label is the first column: 'spam' -> -1, everything else -> +1.
y = df.iloc[:, 0].values
y = np.where(y == 'spam', -1, 1)
# Single feature: the second column (the 1:2 slice keeps the array 2-D).
x = df.iloc[:, 1:2].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=0)

from sklearn.linear_model import Perceptron
# Baseline model (not used further in this cell). The learning-rate parameter
# of scikit-learn's Perceptron is called `eta0`.
p = Perceptron(max_iter = 40, eta0 = 0.1, random_state = 0)
p.fit(x_train, y_train)

from sklearn.svm import SVC
# The regularisation parameter is the upper-case `C`.
svm = SVC(kernel = 'linear', C = 1.0,random_state = 0)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

from defs import plot_decision_regions
x_combined = np.vstack((x_train, x_test))
y_combined = np.hstack((y_train, y_test))

# NOTE(review): test_idx = range(-15, 15) is passed through unchanged; confirm
# against defs.plot_decision_regions that this index range is what is intended.
plot_decision_regions(x_combined, y_combined, classifier = svm, test_idx = range(-15,15))
plt.xlabel('Suspect words')
plt.ylabel('Spam or ham')
plt.tight_layout()
plt.show()

from sklearn.metrics import accuracy_score
print("Misclassified samples: %d" % (y_test!= y_pred).sum())
# Compare against the predictions (not the full label vector `y`, which has a
# different length). accuracy_score is a fraction, so scale it to match the
# percent sign in the format string.
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100))

In [None]:
# Linear Regression for spam detection
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_csv('spam.csv')

# Feature matrix: only the second column, kept 2-D by slicing with 1:2.
x = df.iloc[:, 1:2].values
# Target: first column recoded to -1 for 'spam' and +1 for everything else.
y = np.where(df.iloc[:, 0].values == 'spam', -1, 1)

linear = LinearRegression()
linear.fit(x, y)

# score() of a regressor is the R^2 on the data it is given (here: the
# training data itself).
r_squared = linear.score(x, y)
print(r_squared)

In [None]:
# Logistic Regression implementation 
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# genfromtxt takes the element type through `dtype`; np.int32 is the type
# object (np.int(32) is not a dtype and np.int no longer exists in NumPy).
phishing = np.genfromtxt('phishing.csv', delimiter=',', dtype = np.int32)
# All columns except the last are features ...
samples = phishing[:,:-1]
# ... and the last COLUMN is the label. (phishing[:-1] would instead select
# every row but the last.)
targets = phishing[:, -1]

from sklearn.model_selection import train_test_split
train_samples, test_samples, train_targets, test_targets = train_test_split(samples, targets, test_size = 0.2, random_state = 0)
log_class = LogisticRegression()
log_class.fit(train_samples, train_targets)
predictions = log_class.predict(test_samples)
# Accuracy as a percentage.
accuracy = 100.0* accuracy_score(test_targets, predictions)
print("Logistic Regression Accuracy: " +  str(accuracy))

In [None]:
# Phishing Detection with Decision Trees
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# np.int32 is the dtype object; np.int(32) is not a dtype and np.int no longer
# exists in NumPy.
phishing = np.genfromtxt('phishing.csv', delimiter=',', dtype = np.int32)
# All columns except the last are features ...
samples = phishing[:,:-1]
# ... and the last COLUMN is the label. (phishing[:-1] would instead select
# every row but the last.)
targets = phishing[:, -1]

from sklearn.model_selection import train_test_split
train_samples, test_samples, train_targets, test_targets = train_test_split(samples, targets, test_size = 0.2, random_state= 0)


from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf = tree_clf.fit(train_samples, train_targets)
predictions = tree_clf.predict(test_samples)
# Accuracy as a percentage.
accuracy = 100.0* accuracy_score(test_targets, predictions)
print("Decision Tree accurcay" + str(accuracy))

In [None]:
# Natural Language Processing with Naive Bayes Classifier
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas as pd
import sklearn
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from defs import get_tokens
from defs import get_lemmas
nltk.download('punkt')
nltk.download('stopwords')

sms = pd.read_csv("spam_no_header.csv", sep = ",", names = ["type", "text"])
# Splitting two arrays yields four results: texts (train, test) and then
# labels (train, test).
text_train, text_test, type_train, type_test = train_test_split(sms["text"], sms['type'],test_size = 0.3)

# Bag-of-words on the training texts, then tf-idf weighting of those counts.
bow = CountVectorizer(analyzer = get_lemmas).fit(text_train)
sms_bow = bow.transform(text_train)
tfidf = TfidfTransformer().fit(sms_bow)
sms_tfidf = tfidf.transform(sms_bow)
spam_detector = MultinomialNB().fit(sms_tfidf, type_train)

# Spot check on a single message.
mgs = sms["text"][25]
mgs_bow = bow.transform([mgs])
# mgs_bow is already a (1, n_features) matrix; it must not be wrapped in a list.
mgs_tfidf = tfidf.transform(mgs_bow)
print("Predicted:" , spam_detector.predict(mgs_tfidf)[0])
print("Expected:", sms.type[25])

# Evaluate on the held-out messages. The split shuffles the rows, so
# predictions must be compared with the labels returned by the split, never
# with a positional slice of sms['type'].
test_tfidf = tfidf.transform(bow.transform(text_test))
predictions = spam_detector.predict(test_tfidf)
print("Accuracy: ", accuracy_score(type_test, predictions))
print(classification_report(type_test, predictions))

# Malware Threat Detection 

In [None]:
# Malware Detection with Decision Tree
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.metrics import accuracy_score

malware_data = pd.read_csv('malware_data.csv', delimiter = ",")
# Features: columns 0 and 4 of every row (row selector ":", column list [0, 4]).
samples = malware_data.iloc[:, [0,4]].values
# Label: column 8.
target = malware_data.iloc[:,8].values

from sklearn.model_selection import train_test_split
# Two input arrays produce four outputs: samples (train, test), labels (train, test).
train_samples, test_samples, train_targets, test_targets = train_test_split(samples, target,test_size = 0.2)

from sklearn.tree import DecisionTreeClassifier
tree_class = DecisionTreeClassifier()
# Fit on the training labels only; they have the same length as train_samples.
tree_class.fit(train_samples, train_targets)
predictions = tree_class.predict(test_samples)
# Accuracy compares true test LABELS with the predictions, as a percentage.
accuracy = 100* accuracy_score(test_targets, predictions)
print("Decision Tree Accuracy: "+ str(accuracy))

In [None]:
# Malware Detection with Random Forest
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.metrics import accuracy_score

malware_data = pd.read_csv('malware_data.csv', delimiter = ",")
# Features: columns 0 and 4 of every row (row selector ":", column list [0, 4]).
samples = malware_data.iloc[:, [0,4]].values
# Label: column 8.
target = malware_data.iloc[:,8].values

from sklearn.model_selection import train_test_split
# Split the labels defined in THIS cell (`target`), not a `targets` variable
# left over from another cell.
train_samples, test_samples, train_targets, test_targets = train_test_split(samples, target, test_size = 0.2, random_state= 0)

rfc = ensemble.RandomForestClassifier(n_estimators = 100)
rfc.fit(train_samples, train_targets)
# score() needs the labels that belong to test_samples.
accuracy = rfc.score(test_samples, test_targets)
print("Random Forest Classifier Accuracy: " + str(accuracy * 100))

In [None]:
# Malware Detection with K-Means
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

malware_data = pd.read_csv('malware_data.csv', delimiter = ",")
# Columns 1..4 are clustered; column 8 is kept aside as the reference label.
samples = malware_data.iloc[:, 1:5].values
targets = malware_data.iloc[:, 8].values

k_means = KMeans(n_clusters = 3, max_iter= 300)
k_means.fit(samples)

# Raw cluster index assigned to every sample.
print("k_Means labels: ", str(k_means.labels_))

# Contingency table: reference label (rows) against cluster index (columns).
contingency = pd.crosstab(
    targets,
    k_means.labels_,
    rownames = ["Oberverd"],
    colnames = ["Predicted"],
)
print("K-means clustering results ", contingency)

# Silhouette coefficient of the clustering, using Euclidean distances.
silhouette = silhouette_score(samples, k_means.labels_, metric = "euclidean")
print("Silhouette Coefficient: %0.3f"% silhouette)

# Advanced Malware Threat Detection with AI 

In [None]:
# Detecting obfuscated Javascript file
import os 
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

js_path = "JavascriptSamples"
obfuscated_js_path = "ObfuscatedJavascriptSamples"

corpus = []
labels = []
# Label 0 = plain JavaScript, label 1 = obfuscated JavaScript.
file_types_and_labels = [(js_path, 0), (obfuscated_js_path, 1)]

for files_path, label in file_types_and_labels:
    files = os.listdir(files_path)
    for file in files:
        # Build the path from the DIRECTORY being listed and the file name.
        file_path = os.path.join(files_path, file)
        try:
            with open(file_path, "r") as myfile:
                data = myfile.read().replace("\n", "")
        except (OSError, UnicodeDecodeError) as err:
            # Best effort: unreadable samples are skipped, but visibly, so a
            # systematic problem (e.g. a wrong directory) does not go unnoticed.
            print("Skipping", file_path, "-", err)
            continue
        corpus.append(str(data))
        labels.append(label)
    
x_train, x_test, y_train, y_test = train_test_split(corpus, labels, test_size = 0.3, random_state = 42)

# A Pipeline is a list of (name, estimator) pairs:
#   1. hash 1- to 3-grams into a fixed-size feature vector,
#   2. re-weight with tf-idf,
#   3. classify with a class-balanced random forest.
text_clf = Pipeline([
    ("vect", HashingVectorizer(input = "content", ngram_range = (1,3))),
    ("tfidf", TfidfTransformer(use_idf = True)),
    ("rf", RandomForestClassifier(class_weight = "balanced")),
])

text_clf.fit(x_train, y_train)
y_test_pred = text_clf.predict(x_test)
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

In [None]:
# Tracking Malware drift
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

# Share of each malware family per month (each month sums to 100).
month0 = {"Trojan":24, "CryptoMiner": 11, "other":36, "Worm": 29}
month1 = {"Trojan":28, "CryptoMiner": 25, "other":22, "Worm": 25}
month2 = {"Trojan":18, "CryptoMiner": 36, "other":41, "Worm": 5}
month3 = {"CryptoMiner":24, "Trojan": 33, "other":44, "Worm": 5}
months = [month0, month1, month2, month3]

# One time series per family, in month order.
trojan_time_series = []
crypto_miner_time_series = []
worm_time_series = []
other_time_series = []

for month in months:
    trojan_time_series.append(month["Trojan"])
    crypto_miner_time_series.append(month["CryptoMiner"])
    worm_time_series.append(month["Worm"])
    other_time_series.append(month["other"])

# (plot title, series, output file) -- one entry per family, so that every
# plot and every forecast uses the series that belongs to its label.
family_series = [
    ("Trojan", trojan_time_series, "TrojanGraph.png"),
    ("Crypto_miner", crypto_miner_time_series, "CryptoMinerGraph.png"),
    ("Worm", worm_time_series, "WormGraph.png"),
    ("Other", other_time_series, "OtherGraph.png"),
]

for family, series, graph_file in family_series:
    plt.title(family)
    plt.plot(series, label = family)
    # Save before show(): show() finishes the current figure.
    plt.savefig(graph_file)
    plt.show()

# Forecast the next month for every family with a moving-average model MA(1),
# written as ARIMA order (p, d, q) = (0, 0, 1). The old
# statsmodels.tsa.arima_model module was removed; the current fit() takes no
# `disp` argument.
predicted_next_month = {}
for family, series, _ in family_series:
    ts_model = ARIMA(series, order = (0, 0, 1))
    model_fit_to_data = ts_model.fit()
    # start == end == len(series): exactly one step past the observed data.
    forecast = model_fit_to_data.predict(len(series), len(series))
    predicted_next_month[family] = forecast[0]
    print("Predicted " + family + " count for next month: " + str(forecast[0]) + "%")

In [None]:
# Botnet Detection with Machine Learning
import numpy as np
import pandas as pd
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

dataset = pd.read_csv("botnet_data.csv")
samples = dataset.iloc[:, [1,2]].values
target = dataset.iloc["ANOMALY"].values
training_samples, testing_samples, training_target, testing_target = train_test_split(samples, target, test_size = 0.3, random_state = 0)

knc = KNeighborsClassifier(n_neighbors = 2)
knc.fit(training_samples, training_target)
knc_pred = knc.predict(testing_samples)
knc_accuracy = 100.0 * accuracy_score(testing_target, knc_pred)
print("KNeighborsClassifier accuracy: " + str(knc_accuracy))

dfc = DecisionTreeClassifier(random_state=0)
dfc.fit(training_samples, training_target)
dfc_pred = dfc.predict(testing_samples)
dfc_accuracy = 100.0 * accuracy_score(testing_target, dfc_pred)
print("DecisionTreeClassifier accuracy: " + str(dfc_accuracy))

gnb = GaussianNB()
gnb.fit(training_samples, training_target)
gnb_pred = gnb.predict(testing_samples)
gnb_accuracy = 100.0 * accuracy_score(testing_target, gnb_pred)
print("GaussianNB accuracy: " + str(gnb_accuracy))

In [None]:
# Anomaly Detection Module
import math
import numpy as np

class AnomalyDetector:
    """Gaussian anomaly detector that treats every feature as independent.

    On construction the per-feature mean and variance of `data` are estimated.
    `mulitvarible_gaussion` then scores examples with the product of the
    per-feature normal densities, and `select_threshold` picks the density
    cut-off (epsilon) with the best F1 score on labelled data.
    """

    def __init__(self, data):
        """Estimate the Gaussian parameters from `data`.

        Args:
            data: 2-D NumPy array, one row per example and one column per feature.
        """
        (self.mu_param, self.sigma_squared) = self.estimate_gaussian(data)
        
        self.data = data
        
    def mulitvarible_gaussion(self, data):
        """Return the density of every example under the fitted model.

        Args:
            data: 2-D NumPy array with the same number of columns as the
                training data.

        Returns:
            Array of shape (num_examples, 1); each entry is the product of the
            per-feature normal densities of that example.
        """
        mu_param = self.mu_param
        sigma_squared = self.sigma_squared
        (num_examples, num_features) = data.shape
        # Start from 1, the multiplicative identity: the loop below multiplies
        # the per-feature densities in, and starting from 0 would make every
        # result 0.
        probabilities = np.ones((num_examples, 1))
        
        for example_index in range(num_examples):
            for feature_index in range(num_features):
                # Exponent of the normal density: -(x - mu)^2 / (2 * sigma^2).
                power_divdend = (data[example_index, feature_index] - mu_param[feature_index]) ** 2
                power_divder = 2 * sigma_squared[feature_index]
                e_power = -1 * power_divdend / power_divder
                
                # Normalisation factor: 1 / sqrt(2 * pi * sigma^2).
                probabilities_prefix = 1/math.sqrt(2 * math.pi * sigma_squared[feature_index])
                probability = probabilities_prefix * math.exp(e_power)
                probabilities[example_index] *= probability
        
        return probabilities
    
    def estimate_gaussian(self, data):
        """Return the per-feature mean and (population) variance of `data`.

        Args:
            data: 2-D NumPy array, one row per example.

        Returns:
            Tuple (mu_param, sigma_squared), each of length num_features.
        """
        num_examples = data.shape[0]
        mu_param = (1 / num_examples) * np.sum(data, axis = 0)
        sigma_squared = (1 / num_examples) * np.sum((data - mu_param) ** 2, axis = 0)
        
        return (mu_param, sigma_squared)
    
    @staticmethod
    def select_threshold(labels, probabilities):
        """Search for the density threshold with the best F1 score.

        An example is predicted anomalous when its probability is strictly
        below epsilon. 1000 evenly spaced epsilon values between the smallest
        and the largest probability are tried.

        Declared static because it uses no instance state; it can be called on
        the class or on an instance.

        Args:
            labels: array-like of 0/1 ground truth (1 = anomaly).
            probabilities: array-like of densities, one per label. Both inputs
                are flattened, so (n,) and (n, 1) shapes can be mixed.

        Returns:
            Tuple (best_epsilon, best_fl, precision_history, recall_history,
            fl_history). The histories hold one entry per epsilon for which
            precision and recall were both defined. If all probabilities are
            equal no threshold can be searched and (0, 0, [], [], []) is
            returned.
        """
        best_epsilon = 0
        best_fl = 0
        
        precision_history = []
        recall_history = []
        fl_history = []
        
        # Flatten both inputs so that an (n, 1) probability column compared
        # with an (n,) label vector cannot broadcast to an (n, n) matrix.
        labels = np.asarray(labels).ravel()
        probabilities = np.asarray(probabilities).ravel()
        
        min_probability = np.min(probabilities)
        max_probability = np.max(probabilities)
        step_size = (max_probability - min_probability) / 1000
        
        # All probabilities identical: np.arange cannot step by 0.
        if step_size == 0:
            return best_epsilon, best_fl, precision_history, recall_history, fl_history
        
        for epsilon in np.arange(min_probability, max_probability, step_size):
            predictions = probabilities < epsilon
            false_positives = np.sum((predictions == 1) & (labels == 0))
            false_negatives = np.sum((predictions == 0) & (labels == 1))
            true_positives = np.sum((predictions == 1) & (labels == 1))
            
            # Precision or recall undefined for this epsilon: skip it.
            if (true_positives + false_positives) == 0 or (true_positives + false_negatives) == 0:
                continue
            
            precision = true_positives / (true_positives + false_positives)
            recall = true_positives / (true_positives + false_negatives)
            # No true positives means precision == recall == 0; define F1 as 0
            # instead of dividing 0 by 0.
            if (precision + recall) == 0:
                fl_score = 0.0
            else:
                fl_score = 2 * precision * recall / (precision + recall)
            
            precision_history.append(precision)
            recall_history.append(recall)
            fl_history.append(fl_score)
                
            if fl_score  > best_fl:
                best_epsilon = epsilon
                best_fl = fl_score   
                
        return best_epsilon, best_fl, precision_history, recall_history, fl_history

In [None]:
# Gaussion Anomaly Detection Implementation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

dataset = pd.read_csv('Social_Network_Ads.csv')
hist_dist = dataset[["LATENCY", "THROUGHPUT"]].hist(grid = False, figsize = (10, 4))

data = dataset[["LATENCY", "THROUGHPUT"]].values
plt.scatter(data[:, 0], data[:, 1], alpha = 0.6)
plt.xlabel("Latency (ms)")
plt.ylabel("THROUGHPUT")
plt.title("Date Flow")
plt.show()

from guassian_anomaly_detection import GuassianAnomalyDetection
gaussian_anomaly_detection = GuassianAnomalyDetection(data)
print("mu param estimation: ")
print(gaussian_anomaly_detection.mu_param)

# Securing Users Authentication

In [None]:
# Keystroke Demonstration 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier

pwd_data = pd.read_csv("pwd_data.csv", header = 0)
DD = [dd for dd in pwd_data.columns if dd.startswuth("DD")]
plot = pwd_data[DD]
plot["subject"] = pwd_data["subject"].Values
plot = plot.groupby("subject").mean()
plot.iloc[:6].T.plot(figsize = (8,6), title = "Average Keystroke Latency per Subjects")

data_train, data_test = train_test_split(pwd_data, test_size = 0.2, random_state = 0)
x_train = data_train[pwd_data.columns[2:]]
y_train = data_train["subject"]
x_test = data_test[pwd_data.columns[2:]]
y_test = data_test["subject"]

knc = KNeighborsClassifier()
knc.fit(x_train, y_train)
y_pred = knc.predict(x_test)
knc_accuracy = metrics.accuracy_score(y_test, y_pred)
print("KNeighborsClassifier Accuracy: " + str(knc_accuracy))

svc = svm.SVC(kernal = "linear")
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
svc_accuracy = metrics.accuracy_score(y_test, y_pred)
print("SVC Accuracy: " + str(svc_accuracy))

mlpc = MLPClassifier()
mlpc.fit(x_train, y_train)
y_pred = mlpc.predict(x_test)
mlpc_accuracy = metrics.accuracy_score(y_test, y_pred)
print("MLPClassifier Accuracy: " + str(mlpc_accuracy))

from sklearn.metrics import confusion_matrix
labels = list(pwd_data['subject'].unique())
cm = confusion_matrix(y_test, y_pred, labels)
figure = plt.figure()
axes = figure.add_subplot(111)
figure.colorvar(axes.matshow(cm))
axes.set_xticklabels([''] + labels)
axes.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# Eigenfaces Implementation
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Labeled Faces in the Wild, restricted to people with at least 100 pictures.
lfw = fetch_lfw_people(min_faces_per_person = 100)
x_data, y_target, names = lfw.data, lfw.target, lfw.target_names

x_train, x_test, y_train, y_test = train_test_split(x_data, y_target, test_size = 0.3)

# Learn 150 whitened principal components on the training faces only, then
# project both splits onto them.
pca = PCA(n_components = 150, whiten = True)
pca.fit(x_train)
pac_train = pca.transform(x_train)
pca_test = pca.transform(x_test)

# Classify the projected faces with a multi-layer perceptron.
mlpc = MLPClassifier()
mlpc.fit(pac_train, y_train)
y_pred = mlpc.predict(pca_test)

report = classification_report(y_test, y_pred, target_names = names)
print(report)

# Automatic Intrusion Detection

In [None]:
# Credit Card Fraud Detection Implementation
import numpy as np
import pandas as pd

fraud_df = pd.read_csv("FinancialFraudDB.csv", index_col = None)
card_replacement_cost = 5
customer_freeze_cost = 3

cost_matrix = np.zeros(len(fraud_df.index), 4)
cost_matrix[:, 0] = card_replacement_cost * np.ones(len(fraud_df.index))
cost_matrix[:, 1] = fraud_df["Amount"].values
cost_matrix[:, 2] = card_replacement_cost * np.ones(len(fraud_df.index))

y = fraud_df.pop("Class").values
x = fraud_df.values

from sklearn.model_selection import train_test_split
sets = train_test_split(x, y, cost_matrix, test_size = 0.25, random_state = 11)
x_train, x_test, y_train, y_test, cost_matrix_train, cost_matrix_test = sets

from sklearn import tree
y_pred_test_dt = tree.DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)

%pip install costcla
from costcla.models import CostSenitiveDecisionTreeClassifier
y_pred_test_csdt = CostSenitiveDecisionTreeClassifier().fit(x_train, y_train, cost_matrix_train).predict(x_test)

from costcla.metrics import savings_score
print(savings_score(y_test, y_pred_test_dt, cost_matrix_test))
print(savings_score(y_test, y_pred_test_csdt, cost_matrix_test))

In [None]:
# Counterfeit Banknote Detection Implementation
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# The file has no header row: four feature columns followed by the label.
df = pd.read_csv("data_banknote_authentication.txt", header = None)
df.columns = ["0", "1", "2", "3", "label"]

df_train, df_test = train_test_split(df)

# pop() removes the label column from the frame, so what is left in
# df_train / df_test afterwards is the feature matrix.
y_train = df_train.pop("label").values
x_train = df_train.values
y_test = df_test.pop("label").values
x_test = df_test.values

clf = RandomForestClassifier()
clf.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
test_accuracy = clf.score(x_test, y_test)
print(test_accuracy)

In [None]:
# Ad Blocking using Machine Learning Implementation
import pandas as pd
df = pd.read_csv("ad.data", header = None)
# The last column (index 1558) holds the class label.
df = df.rename(columns = {1558: "label"})

# Missing values are encoded as "?" (possibly padded with blanks). Find every
# row that contains at least one of them with a vectorised comparison instead
# of a Python loop over every cell.
is_missing = df.astype(str).apply(lambda column: column.str.strip() == "?")
improper_rows = list(df.index[is_missing.any(axis = 1)])

# Drop by index LABEL, which is what improper_rows contains.
df = df.drop(index = improper_rows)

def label_to_numeric(row):
    """Binarize the label of one row: 1 for an advertisement, 0 otherwise.

    The raw dataset writes the classes with a trailing period ("ad." and
    "nonad."), so surrounding blanks and the trailing period are removed
    before comparing; a bare "ad" is accepted as well.
    """
    label = str(row['label']).strip().rstrip(".")
    if label == "ad":
        return 1
    else:
        return 0
    
# Only the label column is needed by label_to_numeric; passing the one-column
# frame avoids building a 1500+ element row object for every record.
df["label"] = df[["label"]].apply(label_to_numeric, axis = 1) 

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df)
# pop() removes the label, leaving only feature columns in the frames.
y_train = df_train.pop("label").values
y_test = df_test.pop("label").values
x_train =  df_train.values
x_test = df_test.values

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
# Mean accuracy on the held-out rows (shown as the cell output).
clf.score(x_test, y_test)   

In [None]:
# Wireless Indoor Localization Implementation
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix    

# Tab-separated file without a header; column 7 is renamed to "room" and used
# as the label.
df = pd.read_csv("wifi_localization.txt", sep = "\t",header = None)
df = df.rename(columns = {7: "room"})

df_train, df_test = train_test_split(df)

# pop() takes the label out of each frame; the remaining columns are the features.
y_train = df_train.pop("room").values
y_test = df_test.pop("room").values
x_train = df_train.values
x_test = df_test.values

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Overall accuracy, followed by the per-room confusion matrix.
test_accuracy = accuracy_score(y_test, y_pred)
room_confusion = confusion_matrix(y_test, y_pred)
print(test_accuracy)
print(room_confusion)

In [None]:
# IOT Device Type Identification using Machine Learning Implementation
import pandas as pd
import os
training_data = pd.read_csv("iot_device_type_training_data.csv")
testing_data = pd.read_csv("iot_device_type_test_data.csv")

# Features are every column EXCEPT the label column "device_category".
# (Filtering on a different name would leave the label inside the features.)
x_train, y_train = (training_data.loc[: , training_data.columns != "device_category"].values,
                    training_data["device_category"])

x_test, y_test = (testing_data.loc[: , testing_data.columns != "device_category"].values,
                    testing_data["device_category"])

# Encode the category names as integers; the encoder is fitted on the
# categories present in the training data.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(training_data["device_category"].unique())
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(x_train, y_train_encoded)
# Mean accuracy on the test set (shown as the cell output).
model.score(x_test, y_test_encoded)

In [None]:
# Deepfake Recognition using Machine Learning Implementation
from mesonet_classifiers import *
from keras_preprocessing.image import ImageDataGenerator

# Build the Meso4 network and load its weights from "weights/Meso4_DF".
MesoNet_classifier = Meso4()
MesoNet_classifier.load("weights/Meso4_DF")

# Read images from the "mesonet_test_images" folder, scaling pixel values by 1/255.
image_data_generator = ImageDataGenerator(rescale = 1./255)
data_generator = image_data_generator.flow_from_directory(
    "",
    classes = ["mesonet_test_images"],
)

# Rounded network output -> human-readable label.
nue_to_label = {1: "real", 0: "fake"}

# Take one batch from the generator and classify it.
x,y = data_generator.next()
probilities = MesoNet_classifier.predict(x)

# The first entry of each output row is rounded to 0 or 1 and mapped to a label.
rounded_outputs = [round(output[0]) for output in probilities]
predictions = [nue_to_label[value] for value in rounded_outputs]
print(predictions)

# Securing and Attacking Data with Machine Learning

In [None]:
# Assessing Password Security using Machine Learning Implementation
import pandas as pd
df = pd.read_csv("passwordDataset.csv", dtype = {"password": str, "strength": int}, index_col = None)
# Shuffle the rows.
df = df.sample(frac = 1)

# 80% of the ROWS for training, the last 20% for testing. The fractions must
# be applied to the number of rows; int(1 * 0.8) is 0 and would produce an
# empty training set.
n_rows = len(df.index)
train_df = df.head(int(n_rows * 0.8))
test_df = df.tail(int(n_rows * 0.2))

y_train = train_df.pop("strength").values
y_test = test_df.pop("strength").values
# After pop() only the password column is left; flatten to a 1-D array of strings.
x_train = train_df.values.flatten()
x_test = test_df.values.flatten()

def charater_tokens(input_string):
    """Tokenizer for the vectorizer: split a password into its single characters."""
    return [x for x in input_string]

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

# Character-level tf-idf features followed by a gradient-boosted classifier.
password_clf = Pipeline([("vect", TfidfVectorizer(tokenizer = charater_tokens)), ("clf", XGBClassifier()),])

password_clf.fit(x_train, y_train)
# Print the accuracy: a bare expression in the middle of a cell is not displayed.
print(password_clf.score(x_test, y_test))

common_password = "qwerty"
strong_computer_generated_password = "a1b2c3d4e5f6g7h8i9j0"

# Predicted strength class for the two example passwords (shown as the cell output).
password_clf.predict([common_password, strong_computer_generated_password])

In [None]:
# Machine Learning-based Steganalysis Implementation
# NOTE(review): "boosbase.fea" looks like a misspelling of "bossbase.fea"
# (compare the second file name) -- confirm against the file on disk.
boss_features_path = "boosbase.fea"
bossbase_lsb_features_path = "bossbase_lsb.fea"

# The two files must carry DIFFERENT labels, otherwise the classifier only
# ever sees one class: 0 = features of the first file, 1 = features of the
# LSB file.
feature_with_labels = [(boss_features_path, 0), (bossbase_lsb_features_path, 1)]

x = []
y = []

# Every line of a feature file is one whitespace-separated feature vector.
for feature_path, label in feature_with_labels:
    with open(feature_path, "r") as f:
        for line in f:
            fv = line.split()
            x.append(fv)
            y.append(label)
            
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 11)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf = clf.fit(x_train, y_train)
# Mean accuracy on the held-out feature vectors.
print(clf.score(x_test, y_test))

In [None]:
# HIPAA Data Breaches - Data Exploration and Visualization
import pandas as pd
df = pd.read_csv("HIPAA-Breach-report-2000-to-2017.csv")
df = df.dropna()
df.head()

%matplotlib inline
def_fig_size = (15, 6)
df["Individuals Affected"].plot(kind = "hist", 
                                figsize = def_fig_size,
                                log = True,
                                title = "Distribution of Individuals Affected")

df.groupby("Coverd Entity Type").mean().plot(kind = "bar", 
                                figsize = def_fig_size,
                                log = True,
                                title = "Average Breach Size by Entity Type")

df.groupby("State").sum().nlargest(20, "Individuals Affected").plot.pie(y = "Individuals Affected",
                                                                        firgsize = def_fig_size,
                                                                        legend = False)

df.groupby("Type of Breach").mean().plot(kind = "bar", 
                                figsize = def_fig_size,
                                log = True,
                                title = "Average Breach Size by Type of Breach")

from sklearn.feature_extraction.text import tdidfVectorizer
vect = TfidfVectorizer()

df["Web Description"] = df["Web Description"].str.replace("\r", " ")
x = df["web Description"].values
x_transformed = vect.fit_transform(x)                                                                   