## Implementing various machine learning algorithms to detect anomalies in a computer network.

# Logistic regression.
### Importing necessary libraries.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Importing the dataset

In [2]:
# Read the full CSV, then work with its first 10,000 rows only.
dataset = pd.read_csv('malicious_phish.csv')
# Features: every column except the last one.
X = dataset.head(10000).iloc[:, :-1].values
# Target: the last column.
y = dataset.head(10000).iloc[:, -1].values

In [3]:
dataset

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [4]:
import sklearn
def encode_string_array(string_array):
  """One-hot encode an array of categorical values.

  Args:
    string_array: array-like passed straight to
      ``OneHotEncoder.fit_transform``; the caller in this notebook passes a
      2-D array of shape (n_samples, n_columns).

  Returns:
    A dense array with one indicator column per distinct value found in
    each input column.
  """
  # Import locally so the function does not depend on `sklearn.preprocessing`
  # having been loaded as a side effect of some earlier import.
  from sklearn.preprocessing import OneHotEncoder

  try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False)
  except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False)

  return encoder.fit_transform(string_array)


# NOTE(review): X is overwritten here, so re-running this cell without first
# re-running the loading cell would encode the already-encoded array. The
# encoder is also fitted on all rows before the train/test split.
X = encode_string_array(X)



In [5]:
# Map the class labels to integer codes; `le` holds the fitted encoder.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [6]:
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
y

array([3, 0, 0, ..., 0, 0, 0])

### Splitting the dataset into training and testing parts

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [9]:
from sklearn.preprocessing import StandardScaler
# Learn the per-column scaling from the training rows only, then apply that
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [10]:
print(X_train)

[[-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]
 [-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]
 [-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]
 ...
 [-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]
 [-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]
 [-0.01154778 -0.01154778 -0.01154778 ... -0.01154778 -0.01154778
   0.        ]]


In [11]:
print (y_test)

[1 0 0 ... 0 0 0]


## Training the logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression
# Build a logistic regression model with a fixed seed and fit it on the
# scaled training split. The fit call is left as the last expression so the
# cell still displays the fitted estimator.
classifier = LogisticRegression(random_state= 0)
classifier.fit(X_train,y_train)

## Predicting the result

In [13]:
# Predict the held-out rows and show predictions next to the true labels:
# column 0 is the predicted class, column 1 is the actual class.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[0 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Confusion matrix: true labels are passed first, predictions second, so
# rows are the actual classes and columns are the predicted classes.
print(confusion_matrix(y_test,y_pred))
# Fraction of test rows classified correctly (shown as the cell's output).
accuracy_score(y_test,y_pred)

[[1826    0    0    0]
 [ 456    0    0    0]
 [  57    0    9    0]
 [ 152    0    0    0]]


0.734

## Now with Random forest classifier

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the full CSV, then work with its first 1,000 rows only.
dataset = pd.read_csv('malicious_phish.csv')
# Features: every column except the last one.
X = dataset.head(1000).iloc[:, :-1].values
# Target: the last column.
y = dataset.head(1000).iloc[:, -1].values

In [3]:
from sklearn.preprocessing import OneHotEncoder
def encode_string_array(string_array):
  """One-hot encode an array of categorical values.

  Args:
    string_array: array-like passed straight to
      ``OneHotEncoder.fit_transform``; the caller in this notebook passes a
      2-D array of shape (n_samples, n_columns).

  Returns:
    A dense array with one indicator column per distinct value found in
    each input column.
  """
  try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False)
  except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False)

  return encoder.fit_transform(string_array)


# NOTE(review): X is overwritten here, so re-running this cell without first
# re-running the loading cell would encode the already-encoded array.
X = encode_string_array(X)



In [4]:
from sklearn.preprocessing import LabelEncoder
# Map the class labels to integer codes; `le` holds the fitted encoder.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [6]:
print(X_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
print(y_train)

[1 0 0 1 0 0 0 0 3 2 0 1 0 0 0 3 0 3 0 0 0 0 1 1 0 0 0 3 0 0 0 0 0 1 3 1 0
 1 2 0 1 0 0 0 0 3 0 0 0 0 3 1 0 0 0 0 0 3 0 2 0 1 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 2 0 2 0 1
 0 0 1 0 0 1 0 0 0 0 0 3 0 0 1 0 0 0 0 0 0 3 0 1 1 2 0 3 0 0 0 2 1 1 0 0 0
 0 0 0 0 1 3 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 2 0 0 0 0 1 0 1 0 0
 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 3 0 0 0 2 1
 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 1 0 3 0 1 0 0 1 0
 0 0 0 1 1 1 0 1 0 1 1 1 2 0 0 1 0 2 0 2 1 0 0 0 0 0 0 0 2 3 0 3 0 1 0 3 0
 0 0 1 3 1 0 0 0 3 2 1 0 0 0 1 3 1 0 1 0 0 0 0 0 3 1 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 3 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1
 3 3 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0
 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 3 0 1 0 3 0 0 0 0 0 1 0
 1 0 0 0 3 1 0 1 0 0 0 0 

In [8]:
print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
print(y_test)

[0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 0 0
 0 0 0 0 0 0 0 3 0 2 0 0 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 2 1 0 1 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 2 2 3 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0
 0 0 0 1 0 1 0 0 0 3 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 3 2 1 0 0 0 0 0 1 0
 0 0 0 0 1 0 1 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 2 1 1 0 0 0 0 0 2
 0 2 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 0 3 1 0 0
 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 3 1 0 0 1 0]



## Feature scaling 

In [10]:
from sklearn.preprocessing import StandardScaler
# Learn the per-column scaling from the training rows only, then apply that
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [11]:
print(X_train)

[[-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 ...
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]]


In [12]:
print(X_test)

[[-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 ...
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]
 [-0.0365392 -0.0365392 -0.0365392 ... -0.0365392 -0.0365392 -0.0365392]]


## Training of Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Build a random forest of 5 trees that split on the entropy criterion, with
# a fixed seed, and fit it on the scaled training split. The fit call is left
# as the last expression so the cell still displays the fitted estimator.
classifier = RandomForestClassifier(n_estimators = 5, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the test results

In [14]:
# Predict the held-out rows and show predictions next to the true labels:
# column 0 is the predicted class, column 1 is the actual class.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 3]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 3]
 [0 0]
 [0 2]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 3]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 2]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 2]
 [0 2]
 [0 3]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 2]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 3]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 3]
 [2 2]
 [0 1]
 [0 0]
 [0 0]

## Making the Confusion Matrix

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix: true labels are passed first, predictions second, so
# rows are the actual classes and columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Fraction of test rows classified correctly (shown as the cell's output).
accuracy_score(y_test, y_pred)

[[189   0   0   0]
 [ 41   0   0   0]
 [  9   0   1   0]
 [ 10   0   0   0]]


0.76

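## Using the Random Forest classifier, the accuracy is about 2.6 percentage points higher (0.76 vs. 0.734, roughly 3.5% in relative terms). Note that the two runs are not directly comparable: logistic regression used the first 10,000 rows and the random forest only the first 1,000, and both confusion matrices show that almost every test sample is predicted as class 0.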