In [None]:
import io,os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [2]:
def read_path(path):
    for root,folders,files in os.walk(path):
        for file in files:
            filename = root+'\\'+file
            f = io.open(filename,encoding='latin1')
            lines = f.readlines()
            data = []
            body = False
            for line in lines:
                if line == '\n':
                    body = True
                if body:
                    data.append(line)
            msg = '\n'.join(data)
            clean = re.findall('[\w]+',msg)
            msg = ' '.join(clean)
            yield msg,filename

In [3]:
def df(path,classification):
    index = []
    row = []
    for message,filename in read_path(path):
        row.append({'message':message,'class':classification})
        index.append(filename)
    return pd.DataFrame(data=row,index=index)

In [4]:
dataset = df('emails/ham','ham')
dataset = dataset.append(df('emails/spam','spam'))

In [5]:
def textPreprocessing(dataset):
    stopwords_list = stopwords.words('english')
    stopwords_list.extend([',','.','-','!','@'])
    wnet = WordNetLemmatizer()
    tfidf = TfidfVectorizer()
    messageList = []
    for i in range(len(dataset)):
        tokens = word_tokenize(dataset['message'][i])
        temp = []
        for token in tokens:
            if token.lower() not in stopwords_list:
                temp.append(token.lower())
        for i in range(len(temp)):
            temp[i] = wnet.lemmatize(temp[i],pos='v')
        msg = ' '.join(temp)
        messageList.append(msg)
    vect_array = tfidf.fit_transform(messageList)
    labels = np.zeros(len(dataset))
    for i in range(len(dataset)):
        if dataset['class'][i] == 'ham':
            labels[i] = float(0)
        if dataset['class'][i] == 'spam':
            labels[i] = float(1)
    return vect_array.toarray(),labels,tfidf

In [6]:
vect_array,labels,tfidf = textPreprocessing(dataset)

In [8]:
print(vect_array)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.06511003 0.         0.         ... 0.         0.         0.        ]
 [0.         0.03460931 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [9]:
print(len(vect_array))

3000


In [10]:
print(labels)

[0. 0. 0. ... 1. 1. 1.]


In [11]:
print(len(labels))

3000


In [12]:
x_train,x_test,y_train,y_test = train_test_split(vect_array,labels,test_size=0.25)

In [13]:
logreg = LogisticRegression()

In [15]:
logreg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
ypred = logreg.predict(x_test)

In [18]:
print(accuracy_score(ypred,y_test))

0.9533333333333334


In [19]:
nb = GaussianNB()

In [20]:
nb.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [21]:
ypred = nb.predict(x_test)

In [22]:
print(accuracy_score(ypred,y_test))

0.9693333333333334


In [25]:
row = {'message':"""Go until jurong point, crazy.. 
                    Available only in bugis n great world la e buffet... 
                    Cine there got amore wat..."""}
dataset_2 = pd.DataFrame(data=row,index=[0])

In [27]:
print(dataset_2)

                                             message
0  Go until jurong point, crazy.. \n             ...


In [28]:
vect = tfidf.transform([dataset_2['message'][0]])

NameError: name 'tfidf' is not defined