In [1]:
import nltk
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the SMS dataset, keep only the two columns used below, and rename them:
# 'v1' becomes 'label' and 'v2' becomes 'text'.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [3]:
# NLTK resource setup.
# 'stopwords' feeds the stop-word set built below. The tokenizer, POS-tagger and
# WordNet resources are what word_tokenize, pos_tag and WordNetLemmatizer load
# when alternative_review_messages is called.
# NOTE(review): the tokenizer/tagger resource ids differ between NLTK releases
# ('punkt' vs 'punkt_tab', 'averaged_perceptron_tagger' vs
# 'averaged_perceptron_tagger_eng'); both spellings are requested here --
# confirm against the installed NLTK version and trim the list if desired.
for resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# The corpus reader is reached through the package path because this cell
# rebinds the bare name `stopwords` to a plain set of English stop words
# (later cells test membership against that set). Going through
# `nltk.corpus` keeps the cell re-runnable without a second import.
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devhr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def review_messages(msg):
    """Return ``msg`` converted to lowercase.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    str
        The lowercased message.
    """
    return msg.lower()


In [5]:
def alternative_review_messages(msg):
    """Lowercase, lemmatize and stop-word-filter a message.

    Heavier alternative to ``review_messages``. Relies on the module-level
    ``lemmatizer`` and ``stopwords`` set defined in the NLTK setup cell.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces. A string (rather than a
        list of tokens) is returned so the result has the same type as
        ``review_messages`` and can be used with ``data['text'].apply(...)``
        ahead of the TF-IDF vectorizer.
    """
    # NLTK's tagger emits Penn-Treebank-style tags while the WordNet lemmatizer
    # expects one of 'a', 'n', 'r', 'v'. Only the first letter of the tag is
    # needed for the translation; anything unrecognised is treated as a noun.
    first_letter_to_wordnet = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}

    # Tokenize and tag once and reuse the (token, tag) pairs; the message was
    # previously tokenized and tagged twice to get tokens and tags separately.
    tagged_tokens = pos_tag(word_tokenize(msg.lower()))

    lemmas = [
        lemmatizer.lemmatize(token, first_letter_to_wordnet.get(tag[:1], 'n'))
        for token, tag in tagged_tokens
    ]

    # removing stopwords (after lemmatization, so the lemma is what is checked)
    kept = [lemma for lemma in lemmas if lemma not in stopwords]

    return " ".join(kept)

In [6]:
# Processing text messages
# (lowercasing an already-lowercased column is a no-op, so re-running is safe)
data['text'] = data['text'].apply(review_messages)

# train test split: 10% of the rows are held out; the fixed random_state keeps
# the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.1, random_state=1
)

# training vectorizer (fitted on the training rows only)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

# training the classifier
# The estimator is built from the directly imported `SVC` class. The variable
# is still called `svm` because later cells use that name, but the cell no
# longer calls `svm.SVC` on a name it rebinds itself, which made a second run
# of this cell fail with AttributeError.
svm = SVC(C=1000)
svm.fit(X_train, y_train)

# testing against testing set
# X_test is replaced by its vectorized form; later cells expect that.
X_test = vectorizer.transform(X_test)
y_pred = svm.predict(X_test)
print(confusion_matrix(y_test, y_pred))


[[490   0]
 [  5  63]]


In [7]:
# Score the fitted classifier on the vectorized held-out rows and their labels.
print(svm.score(X=X_test, y=y_test))


0.9910394265232975


In [8]:
# test against new messages 
def pred(msg):
    """Classify one raw message with the fitted vectorizer and classifier.

    Uses the module-level ``vectorizer`` and ``svm`` objects created in the
    training cell.

    Parameters
    ----------
    msg : str
        Message text to classify.

    Returns
    -------
    The first (and only) element of the classifier's prediction for ``msg``.
    """
    # transform() expects an iterable of documents, hence the one-element list
    features = vectorizer.transform([msg])
    return svm.predict(features)[0]

In [9]:
# Spot-check the classifier on hand-written messages.
# These were previously ten separate copy-pasted cells, executed out of order.
# The trailing comment on each entry is the label the notebook displayed when
# that message was last run in its own cell.
sample_messages = [
    'Congratulations! You’ve won a $500 gift card to Target. Click here to claim your reward.',  # 'spam'
    "Hello dave, your shipment from UPS will arrive today. Click here to track your package.",  # 'ham'
    "Your Wells Fargo account has been locked for suspicious activity. Please log in here and verify your account.",  # 'ham'
    "Hey, this is dave. I’m in a meeting, but I need you to order 5 Amazon gift cards ASAP. I’ll reimburse you once you send them to this email address.",  # 'ham'
    "Congratulations! What a surprise, You’ve won a $200 gift card to Target. Click here to claim your reward.",  # 'spam'
    "You’re owed a refund of 500$.",  # 'ham'
    "Congratulations! You’ve Won! take your $100. ",  # 'ham'
    "Enjoy 10% off — text “CUPCAKE” to [Number] to unlock instant discounts.",  # 'ham'
    "Hi Nick. This is to remind you about the $75 minimum payment on your credit card ending in XXXX. Payment is due on 01/01. Pls visit order.com to make your payment.",  # 'ham'
    "England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND .20 POBOXox3650 16+",  # 'spam'
]

# One line per message: predicted label first, then the message text.
for message in sample_messages:
    print(f"{pred(message)}\t{message}")