In [1]:
import io
import os

import numpy
from pandas import DataFrame, concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
#Function to read files (emails) from the local directory
def readFiles(path):
    """Yield ``(file_path, message_body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as latin1 text. Every line up to and including the first blank line
    (a line consisting of a single newline) is skipped; the lines after it
    form the message body.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Yields
    ------
    tuple of (str, str)
        The file path (root joined with the file name) and the body text.
        Body lines keep their own trailing newline and are additionally
        joined with a newline, so consecutive body lines end up separated
        by a blank line. A file with no blank line yields an empty body.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate local name so the `path` argument is not rebound.
            filePath = os.path.join(root, filename)

            inBody = False
            lines = []
            # `with` closes the handle even if reading raises or the consumer
            # stops iterating the generator early.
            with io.open(filePath, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':
                        # First blank line: everything after it is the body.
                        inBody = True
            yield filePath, '\n'.join(lines)


In [3]:
def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file yielded by ``readFiles(path)``.

    Parameters
    ----------
    path : str
        Directory passed through to ``readFiles``.
    classification : object
        Label stored in the 'class' column of every row.

    Returns
    -------
    DataFrame
        Indexed by the file path of each message, with a 'message' column
        holding the body text and a 'class' column holding ``classification``.
    """
    # Materialise the generator once so index and rows come from the same pass.
    emails = list(readFiles(path))
    index = [filename for filename, _ in emails]
    rows = [{'message': body, 'class': classification} for _, body in emails]
    return DataFrame(rows, index=index)

In [12]:
#Build the labelled corpus ('message' and 'class' columns) in a single concat.
#DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
#the previous append-based version raises AttributeError on current pandas.
data = concat([
    dataFrameFromDirectory('./spam', 'spam'),
    dataFrameFromDirectory('./ham/', 'non-spam'),
])

#Tail of 'data' (a bare data.head() in the middle of a cell displays nothing,
#so only the tail is shown here)
print(data.tail())


                                                                                        message  \
./ham/00293.d74734bf5cf13ea138e4ee4df7555008  Looks and sounds a hell of a lot like Clare's ...   
./ham/00431.7f3adeb8cda736429bbe7ee757a07232  \n\nA groys gesheft zol er hobn mit shroyre vu...   
./ham/00310.66b26e342f980a716f81bb66e9b28c92  R. A. Hettinga wrote:\n\n> And then there was ...   
./ham/00476.7133902476448f294ee064117d96c988  On Fri, 6 Sep 2002, Jim Whitehead wrote:\n\n--...   
./ham/00267.332a04bf60ba54e3a303ee253fb977ee  At 12:05 PM 10/4/28 -0400, Stephen D. Williams...   

                                                 class  
./ham/00293.d74734bf5cf13ea138e4ee4df7555008  non-spam  
./ham/00431.7f3adeb8cda736429bbe7ee757a07232  non-spam  
./ham/00310.66b26e342f980a716f81bb66e9b28c92  non-spam  
./ham/00476.7133902476448f294ee064117d96c988  non-spam  
./ham/00267.332a04bf60ba54e3a303ee253fb977ee  non-spam  


In [13]:
#Fit the vectoriser on every message body and build the document-term count matrix
vectoriser = CountVectorizer()
count = vectoriser.fit_transform(data['message'].values)
#Prints the non-zero entries as "(row, column) count"
print(count)



  (0, 23912)	3
  (0, 23057)	2
  (0, 44971)	2
  (0, 30980)	2
  (0, 23920)	1
  (0, 18685)	3
  (0, 2686)	118
  (0, 1736)	1
  (0, 45873)	1
  (0, 1668)	161
  (0, 14133)	1
  (0, 44605)	3
  (0, 12827)	1
  (0, 2876)	1
  (0, 812)	1
  (0, 43398)	2
  (0, 29310)	2
  (0, 5009)	1
  (0, 15494)	4
  (0, 33091)	3
  (0, 5037)	2
  (0, 7231)	1
  (0, 5010)	2
  (0, 5040)	2
  (0, 47886)	1
  :	:
  (1400, 44768)	1
  (1400, 43188)	1
  (1400, 44772)	2
  (1400, 7247)	1
  (1400, 36955)	1
  (1400, 30196)	1
  (1400, 67)	1
  (1400, 1168)	1
  (1400, 11426)	1
  (1400, 49136)	1
  (1400, 48083)	1
  (1400, 138)	1
  (1400, 49410)	1
  (1400, 139)	2
  (1400, 15872)	1
  (1400, 49854)	1
  (1400, 36473)	1
  (1400, 14741)	1
  (1400, 43151)	1
  (1400, 16295)	2
  (1400, 46176)	3
  (1400, 41784)	1
  (1400, 16130)	1
  (1400, 35246)	1
  (1400, 1588)	1


In [14]:
#Class labels ('spam' / 'non-spam') as an array, in the same row order as 'data'
target = data['class'].values
print(target)


['spam' 'spam' 'spam' ... 'non-spam' 'non-spam' 'non-spam']


In [15]:
#Create the multinomial Naive Bayes classifier and fit it on the word counts
#and their labels (fit returns the fitted estimator itself)
classifier = MultinomialNB().fit(count, target)
print(classifier)



MultinomialNB()


In [16]:
#Unseen example messages to classify
exampleInput = [
    "Hey. This is John Cena. You can't see me",
    "Free Viagra boys!!",
    "Please reply to get this offer",
]

#Encode them with the already-fitted vectoriser (transform only, no refitting)
excount = vectoriser.transform(exampleInput)
print(excount)



  (0, 12204)	1
  (0, 23257)	1
  (0, 26099)	1
  (0, 27043)	1
  (0, 30784)	1
  (0, 41457)	1
  (0, 44788)	1
  (0, 51389)	1
  (1, 11252)	1
  (1, 20756)	1
  (1, 47753)	1
  (2, 21634)	1
  (2, 33898)	1
  (2, 36335)	1
  (2, 39554)	1
  (2, 44788)	1
  (2, 45122)	1


In [17]:
#Predict a class label for each vectorised example message
prediction = classifier.predict(excount)
print(prediction)

['non-spam' 'spam' 'spam']


In [18]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

#Training data: six 2-D feature rows and one label (10 or 20) per row
#NOTE: this rebinds 'target', which earlier held the spam/non-spam labels;
#re-run that earlier cell before refitting 'classifier'.
features = np.array([
    [1, 2], [3, 4], [5, 6],
    [-1, -2], [-3, -4], [-5, -6],
])
target = np.array([10, 20, 20, 10, 20, 20])

#Create the Gaussian classifier and train it on the training data
#(fit returns the fitted estimator itself)
model = GaussianNB().fit(features, target)

#Predict the labels of two new points
predictedOutput = model.predict([[3, 2], [1, 3]])
print(predictedOutput)

#Output: ([20 10])


[20 10]
