In [3]:
import pandas as pd
import numpy as np 
from sklearn.datasets import load_files

import time

# Text cleaning and precprcessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
X, y = [], []
email = load_files("../input/enron-spam/enron1")
X = np.append(X, email.data)
y = np.append(y, email.target)    

### Let's create Dataframe with text and target feature

In [5]:
df_all = pd.DataFrame(columns=['text', 'target'])
df_all['text'] = [x for x in X]
df_all['target'] = [t for t in y]

In [6]:
df_all

Unnamed: 0,text,target
0,b'Subject: nesa / hea \' s 24 th annual meetin...,0.0
1,b'Subject: meter 1431 - nov 1999\r\ndaren -\r\...,0.0
2,"b""Subject: investor here .\r\nfrom : mr . rich...",1.0
3,"b""Subject: hi paliourg all available meds . av...",1.0
4,b'Subject: january nominations at shell deer p...,0.0
...,...,...
5167,"b""Subject: check it out\r\nyou have to know th...",1.0
5168,b'Subject: re : noms / actual vols for 5 / 18 ...,0.0
5169,"b'Subject: oct prod est - revision\r\ndaren ,\...",0.0
5170,b'Subject: enron / hpl actuals for february 21...,0.0


In [7]:
df_X = df_all.drop(['target'], axis=1)
df_y = df_all['target']

In [8]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
start_time = time.time()

# Create corpus
corpus = []
for i in range(0, len(df_X)):
    # Remove special symbols
    review = re.sub(r'\\r\\n', ' ', str(df_X['text'][i]))
    # Remove all symbols except letters
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Replacing all gaps with spaces 
    review = re.sub(r'\s+', ' ', review)                    
    # Remove 'b' in the beginning of each text
    review = re.sub(r'^b\s+', '', review)       

    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#tf = TfidfVectorizer()

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [None]:
# Splitting data on train and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=9, test_size=0.2)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

model = MultinomialNB().fit(X_train, y_train)
pred = model.predict(X_test)

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
conf_m = confusion_matrix(y_test, pred)

print(f"accuracy: %.3f" %accuracy)
print(f"precision: %.3f" %precision)
print(f"recall: %.3f" %recall)
print(f"confusion matrix: ")
print(conf_m)
print("--- %s seconds ---" % (time.time() - start_time))