# NLP Supervised Learning Using Naive Bayes

## Importing the libraries

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [17]:
# Read the labelled log lines from the CSV export, then show a preview of the
# first rows followed by the (rows, columns) shape.
dataset = pd.read_csv('alfa_dataset.csv - Sheet1.csv', sep = ',')
for summary in (dataset.head(), dataset.shape):
    print(summary)

                                    Log file entries  KB index
0  [Mon Feb 12 13:33:43.537781 2018] [autoindex:e...         0
1  [Tue Feb 13 21:57:17.848217 2018] [auth_basic:...         1
2  [Wed Apr 11 02:20:30.821939 2018] [:error] [pi...         2
3  [Fri Apr 20 13:57:53.308061 2018] [:error] [pi...         3
4  [Sun Apr 22 18:58:53.006422 2018] [authz_core:...         4
(60, 2)


## Cleaning the text

In [18]:
import re
import nltk
from nltk.stem.porter import PorterStemmer

# Noise stripped from every raw log line before tokenising. Compiled once
# instead of being re-parsed for each row. The alternatives remove, among
# others: an opening bracket followed by word/space/colon/dot characters,
# closing brackets, slash-delimited paths, http/https URL prefixes,
# parenthesised fragments, "line <n>" markers, referer fields, every
# character that is not a letter/whitespace/'+', digits, *.php file names and
# a few literal tokens.
# NOTE(review): the bare "za" alternative also matches inside ordinary words
# (e.g. "authorization" ends up as "authori tion" in the corpus) - confirm
# whether that is intended before changing the pattern.
NOISE_PATTERN = re.compile(r"\[[(\w+\d+\s+:\.)]+|\]|/(\w+/)+|(http(://(\w+\.)+))+|(https(://(\w+\.)+))+|(\([\w+\.|\w+,|\w+\)|\w+\\|\.]+)|line(\s+\d+)|referer(:\w+)+|[^a-zA-Z\s+]|\d+|\w+(\-|_|\w+)*\.php|AH|referer|COS|za")

# A single stemmer instance is shared by all rows; it does not need to be
# rebuilt on every iteration.
ps = PorterStemmer()


def clean_log_entry(raw_entry):
    """Return one log line as a space-joined string of stemmed tokens.

    Parameters
    ----------
    raw_entry : str
        Raw text of a single log file entry.

    Returns
    -------
    str
        The entry with every NOISE_PATTERN match replaced by a space, split
        on whitespace, each token Porter-stemmed, and re-joined with single
        spaces.
    """
    tokens = NOISE_PATTERN.sub(" ", raw_entry).split()
    return ' '.join(ps.stem(token) for token in tokens)


# Iterate over the column itself rather than a hard-coded range(0, 60), so the
# cell keeps working when the CSV gains or loses rows.
corpus = [clean_log_entry(entry) for entry in dataset['Log file entries']]

print(corpus)

['cannot serv directori No match directoryindex found and server gener directori index forbidden by option direct', 'user u authent failur for password mismatch', 'php warn mysqli connect access deni for user root localhost password NO in on', 'php pars error syntax error unexpect return in on', 'user u authori tion failur for', 'php warn requir onc fail to open stream No such file or directori in on', 'php fatal error requir onc fail open requir dbcon xef xac php php in on', 'php warn requir onc fail to open stream No such file or directori in on', 'php notic undefin variabl salt in on', 'php notic undefin index email in on', 'script not found or unabl to stat', 'php fatal error uncaught error call to undefin function mysqli in nstack trace n includ requir onc studen n main n thrown in on', 'php warn includ fail to open stream No such file or directori in on', 'php warn includ fail open for inclus php php in on', 'php notic use of undefin constant consol assum consol in on', 'php warn

## Creating the Bag of Words model

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the cleaned corpus into a dense document-term count matrix; the
# vocabulary is capped via max_features.
countVectorizer = CountVectorizer(max_features = 1500)
term_counts = countVectorizer.fit_transform(corpus)
X = term_counts.toarray()

# Targets are taken from the last column of the dataset.
last_column = dataset.iloc[ :, -1]
y = last_column.values

## Splitting the dataset into training and test sets

In [20]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for evaluation; the fixed random_state
# makes the split repeatable between runs.
split_parts = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

## Training the Naive Bayes model on the training set

In [21]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training split. The fit call stays
# the cell's last expression so the fitted estimator is displayed.
classifier = GaussianNB()
classifier.fit(X = X_train, y = y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the test set results

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict labels for the held-out rows and print them next to the true
# labels: first column is the prediction, second column is the ground truth.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

# Confusion matrix (true labels first, predictions second), followed by the
# overall accuracy as the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[10 10]
 [11 11]
 [ 8  8]
 [ 9  9]
 [11 11]
 [ 2  2]
 [ 3  3]
 [ 8  8]
 [ 0  0]
 [ 9 22]
 [ 4  4]
 [10 10]
 [10 10]
 [ 1  1]
 [21 21]]
[[1 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 3 0 0 0]
 [0 0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 0]]


0.9333333333333333