In [None]:
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk import PorterStemmer, WordNetLemmatizer, FreqDist
from nltk.corpus import stopwords

from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

### Preprocessing 

In [None]:
# Load the SMS corpus: a tab-separated file with no header row, read into two
# columns named 'labels' and 'sms'.
# NOTE(review): absolute, machine-specific path — this cell will only run where
# that exact location exists; consider a configurable data directory.
df = pd.read_csv(
    'D:\\Phyton\\SMSSpamCollection',
    sep='\t',
    header=None,
    names=['labels', 'sms'],
)

print(df.head())

In [None]:
# Encoders for the first pipeline:
#   numeric_to_label - binarizes the string class labels
#   text_to_numeric  - turns raw SMS text into sparse TF-IDF rows, dropping
#                      scikit-learn's built-in English stop words
numeric_to_label = LabelBinarizer()
text_to_numeric = TfidfVectorizer(stop_words='english')

numeric_label = numeric_to_label.fit_transform(df['labels'])

# Split the RAW messages before vectorizing. Fitting TF-IDF on the full corpus
# would let test messages shape the vocabulary and IDF weights (data leakage)
# and inflate the reported accuracy. The row assignment is the same as
# splitting the vectorized matrix with the same random_state.
sms_train, sms_test, y_train, y_test = train_test_split(
    df['sms'].values, numeric_label, test_size=.25, random_state=0)

# Learn vocabulary / IDF from the training messages only, then project the
# held-out messages into that same feature space.
x_train = text_to_numeric.fit_transform(sms_train)
x_test = text_to_numeric.transform(sms_test)

# Whole corpus in the train-fitted feature space; kept so the name stays
# available for ad-hoc inspection.
numeric_text = text_to_numeric.transform(df['sms'].values)

### XGB Classifier model

In [None]:
# Train an XGBoost classifier with its default hyperparameters on the TF-IDF
# features, then predict the held-out rows.
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Spot-check the first five test rows: decode each prediction back to its
# label and each feature row back to the vocabulary terms it contains.
first_predictions = y_pred[:5]
first_rows = x_test[:5]
for pred, sms in zip(first_predictions, first_rows):
    print(f'Pred: {numeric_to_label.inverse_transform(pred)} - SMS {text_to_numeric.inverse_transform(sms)} \n')

In [None]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Accuracy score %.2f'% accuracy)

In [None]:
# Second preprocessing pipeline: binary labels + stemmed bag-of-words counts.
# NOTE: this cell overwrites df['labels'] / df['sms'] and the train/test split
# produced by the earlier TF-IDF cell.

def change_labels(x):
    """Encode a label as 1 for spam and 0 otherwise.

    An already-encoded 1 is accepted as well, so re-running this cell on the
    mutated frame does not silently reset every label to 0.
    """
    return 1 if x in ('spam', 1) else 0


df['labels'] = df['labels'].apply(change_labels)


def remove_non_alphabets(x):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub(r'[^a-zA-Z]', ' ', x)


def tokenize(x):
    """Split a string into tokens with NLTK's word_tokenize."""
    return word_tokenize(x)


ps = PorterStemmer()


def stem(w):
    """Porter-stem every token in the list ``w``."""
    return [ps.stem(x) for x in w]


df['sms'] = df['sms'].apply(remove_non_alphabets)
df['sms'] = df['sms'].apply(tokenize)
df['sms'] = df['sms'].apply(stem)
# Re-join the stems with a space. Joining on the empty string would glue the
# whole message into one unbroken string, so CountVectorizer would see no word
# boundaries and could not count individual terms.
df['sms'] = df['sms'].apply(' '.join)

print(df.head())

# Vocabulary cap for CountVectorizer; also reused as the Embedding input size
# in the neural-network cell.
max_words = 10000

cv = CountVectorizer(max_features=max_words, stop_words='english')
# Densified here because this matrix is passed straight to model.fit in the
# later cells.
sparse_matrix = cv.fit_transform(df['sms']).toarray()

# Fixed seed so the split (and every score computed from it) is reproducible
# across kernel restarts, consistent with the earlier TF-IDF split.
x_train, x_test, y_train, y_test = train_test_split(
    sparse_matrix, np.array(df['labels']), random_state=0)

### Multinomial Naive Bayes model

In [None]:
# Multinomial naive Bayes with default settings, trained on the bag-of-words
# count features; fit() returns the estimator itself, so it can be chained.
model = MultinomialNB().fit(x_train, y_train)

y_pred = model.predict(x_test)

In [None]:
# Accuracy of the predictions currently held in y_pred against the held-out
# labels of the current split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Accuracy score %.2f'% accuracy)

### Artificial Neural Network model

In [None]:
# Width of each embedding vector.
embedding_dim = 100

# Embedding -> Flatten -> single sigmoid unit for the binary spam/ham target.
# NOTE(review): x_train here is the CountVectorizer count matrix, so the
# Embedding layer looks rows up by count value rather than by word id —
# confirm this is intended (a Dense layer on the counts, or integer word-id
# sequences, may be what was meant).
# NOTE(review): `input_length` appears to be deprecated in newer Keras
# releases — confirm against the installed version.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=x_train.shape[1]),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# 20% of the training rows are held back by Keras for per-epoch validation.
training_ann = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=.2,
)

In [None]:
# Score the neural network itself. Without this prediction step, y_pred would
# still hold the MultinomialNB predictions from the earlier cell and the number
# printed here would not describe the network at all.
# The network ends in a single sigmoid unit, so predict() returns one
# probability per row; threshold at 0.5 and flatten to 1-D hard labels.
y_pred = (model.predict(x_test) > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy score %.2f'% accuracy)