In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer



KeyboardInterrupt: 

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
df = pd.read_csv('../data/dataset.csv')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
df.head()

Unnamed: 0,text_type,text
0,spam,naturally irresistible your corporate identity...
1,spam,the stock trading gunslinger fanny is merrill ...
2,spam,unbelievable new homes made easy im wanting to...
3,spam,4 color printing special request additional in...
4,spam,do not have money get software cds from here s...


In [None]:
corpus = []
for i in range(len(df)):
  review = df.text[i]
  review = review.lower()
  review = word_tokenize(review)
  review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
  review = ' '.join(review)
  corpus.append(review)

In [None]:
corpus

['naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easier promise havinq ordered iogo company automaticaily become world ieader isguite ciear without good product effective business organization practicable aim hotat nowadays market promise marketing effort become much effective list clear benefit creativeness hand made original logo specially done reflect distinctive company image convenience logo stationery provided format easy use content',
 'stock trading gunslinger fanny merrill muzo colza attainder penultimate like esmark perspicuous ramble segovia group try slung kansa tanzania yes chameleon continuant clothesman libretto chesapeake tight waterway herald hawthorn like chisel morristown superior deoxyribonucleic clockwork try hall incredible mcdougall yes hepburn einsteinian earmark sapling boar duane plain palfrey inflexible like huz

In [None]:
X = corpus
y = pd.get_dummies(df['text_type']).astype(int)
y = y.iloc[:,1].values

In [None]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
X_train

['offer rakish sp vince norma called said rakish requested stock option instead signing bonus suggested giving 30 k worth option would vest 3 year period told sure would fine cost u less offered cash bonus 20 k stinson',
 '𝖨 ’ 𝗆 𝗌𝗈 𝖾𝗑𝖼𝗂𝗍𝖾𝖽 𝗐𝗁𝖾𝗇 𝖨 𝗈𝗉𝖾𝗇𝖾𝖽 𝗆𝗒 𝗐𝖺𝗅𝗅𝖾𝗍 𝖺𝗇𝖽 𝖨 𝗌𝖺𝗐 𝗆𝗒 𝗉𝖺𝗒𝗆𝖾𝗇𝗍 𝗈𝖿 $ 68959 𝗇𝗈𝗍 𝗆𝗈𝗋𝖾 𝗍𝗁𝖺𝗇 𝗍𝗐𝖾𝗇𝗍𝗒 𝗆𝗂𝗇𝗎𝗍𝖾𝗌 𝖺𝗀𝗈 𝗐𝗁𝖺𝗍 𝖺 𝖻𝗈𝗈𝗌𝗍 @ expertreader 𝖸𝗈𝗎 𝖺𝗋𝖾 𝗍𝗁𝖾 𝗍𝗒𝗉𝖾 𝗈𝖿 𝗆𝖺𝗇𝖺𝗀𝖾𝗋 𝖨 𝗁𝖺𝗏𝖾 𝖺𝗅𝗐𝖺𝗒𝗌 𝗐𝖺𝗇𝗍𝖾𝖽 𝖳𝗁𝖺𝗇𝗄𝗌 𝗌𝗈 𝗆𝗎𝖼𝗁 𝖿𝗈𝗋 𝗍𝗁𝖾 𝗉𝖺𝗒𝗆𝖾𝗇𝗍 𝖬rs william astird 👇👇👇👇👇',
 'delivery status notification recipient message processed mail server orlandi enrico inwind failed 5 2 2 mailbox full remote mta ims 9 libero smtp diagnostic 552 rcpt mailbox disk quota exceeded',
 'free ltci policy comparison software long term care insurance worksite marketing system take advantage current information available concerning group ltci market developed month exhaustive research agent interview worksite marketing system resource successful group enrollment included order agent manual info including implementation s

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500, ngram_range=(1,2))
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [None]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y_train.shape , X_train.shape

((16278,), (16278, 1500))

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
confusion_m = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Confusion matrix: ', confusion_m)
print('Classification report: ', classification_rep)
print('Accuracy: ', accuracy)

Confusion matrix:  [[2301  578]
 [  97 1094]]
Classification report:                precision    recall  f1-score   support

           0       0.96      0.80      0.87      2879
           1       0.65      0.92      0.76      1191

    accuracy                           0.83      4070
   macro avg       0.81      0.86      0.82      4070
weighted avg       0.87      0.83      0.84      4070

Accuracy:  0.8341523341523341
