In [None]:
import pandas as pd
import numpy as np
from google.colab import files
import io
from io import BytesIO
import nltk


In [None]:

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
data=files.upload()

Saving spam.csv to spam.csv


In [None]:
df=pd.read_csv(io.StringIO(data['spam.csv'].decode('ISO-8859-1')))

In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.shape

(5572, 5)

In [None]:
null=df.isnull().sum()
null

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
# Check class distribution
classes=df['v1']
print(classes.value_counts())

ham     4825
spam     747
Name: v1, dtype: int64


In [None]:
# convert class labels to binary value 0=ham , 1=spam

from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
Y=encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: v1, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [None]:
# store message

text_messages=df['v2']
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: v2, dtype: object


In [None]:
# Use regular expression to replace unneccesry data

# Replace email address
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

# Replace urls
processed=processed.str.replace(r'http\://[a-zA-Z0-9\-\.]+[a-zA-Z]{2,3}(\/S*)?$','webaddress')

# Replace money symbol
processed=processed.str.replace(r'\|$','moneysymb')

# Replace phone number
processed=processed.str.replace(r'^\[\d]{3}\?[\s-]?[\d]{3}?[\d]{4}','phonenumbr')

# Replace number
processed=processed.str.replace(r'\d+[\.\d+]?','numbr')

In [None]:
# Replace punctation
processed=processed.str.replace(r'[^\w\d\s]',' ')

# Replace whitespace between terms with single space
processed=processed.str.replace(r'\s+',' ')

# Replace loading and trailing space
processed=processed.str.replace(r'^\s+|\s+?$','')

In [None]:
# Change word into lower
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                 will ì_ b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: v2, Length: 5572, dtype: object


In [None]:
# Remove stop words 
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x: ''.join(term for term in x.split() if term not in stop_words))

In [None]:
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x: ''.join(ps.stem(term) for term in x.split()))

In [None]:
print(processed)

0       gojurongpointcrazyavailablebugisngreatworldlae...
1                                      oklarjokingwifuoni
2       freeentrynumbrwklycompwinfacupfinaltktsnumbrst...
3                             udunsayearlyhorucalreadysay
4                        nahthinkgoesusflivesaroundthough
                              ...                        
5567    numbrndtimetriednumbrcontactuuånumbrpoundprize...
5568                               ì_bgoingesplanadefrhom
5569                                      pitymoodsuggest
5570    guybitchingactedlikeinterestedbuyingsomethinge...
5571                                          rofltruenam
Name: v2, Length: 5572, dtype: object


In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
all_words=[]

for message in processed:
  words=word_tokenize(message)
  for w in words:
    all_words.append(w)

all_words=nltk.FreqDist(all_words)

In [None]:
print('Number of words {}'.format(len(all_words)))
print('Most common words {} '.format(all_words.most_common(15)))

Number of words 5075
Most common words [('sorrycalllat', 30), ('ok', 20), ('cantpickphonerightplssendmessag', 12), ('oki', 7), ('privatenumbraccountstatementshowsnumbrunredeemedpointscallnumbridentifiercodenumbrexpiresnumbrnumbrnumbr', 5), ('oklor', 5), ('sorrycalllatermeet', 4), ('pleasecallcustomerservicerepresentativefreephonenumbrnumbrnumbrnumbramnumbrpmguaranteedånumbrcashånumbrpr', 4), ('wenurlovablebcumsangrywidudnttakeseriouslycozangrychildishntruewayshowingdeepaffectioncarenluvkettodamandanicedayda', 4), ('opinionnumbrnumbrjadanumbrkusruthinumbrlovablenumbrsilentnumbrsplcharacternumbrmaturednumbrstylishnumbrsimpleplsrepli', 4), ('privatenumbraccountstatementnumbrshowsnumbrunredeemedpointscallnumbridentifiercodenumbrexpiresnumbrnumbrnumbr', 4), ('late', 4), ('usecretadmirerlookingnumbrmakecontactufindrrevealthinksurspecialcallnumbr', 4), ('pleasecallnumbrimmediatelyurgentmessagewait', 4), ('numbrwondersworldnumbrthnumbrthurstylenumbrthursmilenumbrthurpersonalitynumbrrdurnaturen

In [None]:
# use 1500 words as common features
word_features= list(all_words.keys())[:1500]

In [None]:
# Define find feature function
def find_feature(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
      features[word]=(word in words)
      
    return features

In [None]:
# Lets see example
features=find_feature(processed[0])

for key, value in features.items():
  if value==True:
    print(key)

gojurongpointcrazyavailablebugisngreatworldlaebuffetcinegotamorewat


In [None]:
# Find feature of all messages
messages=list(zip(processed,Y))

# Define a seed for reproducibility
seed=1
np.random_seed=seed
np.random.shuffle(messages)

# call find feature for each message
featureset=[(find_feature(text),label) for (text,label) in messages]

In [None]:
# split training and testing data
from sklearn.model_selection import  train_test_split

training,testing=train_test_split(featureset,test_size=0.25,random_state=seed)

In [None]:
print("training is {}".format(len(training)))
print("testing is {}".format(len(testing)))

training is 4179
testing is 1393


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
Names= ['KNN', 'Decision Tree','Random Forest','Logistic Regression','SGD Classifier',
        'MultinomialNB','SVC']

classifier=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=108),
    MultinomialNB(),
    SVC(kernel='linear')
]

models=zip(Names,classifier)


In [None]:
print(models)

<zip object at 0x7fec43173a88>


In [None]:
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
  nltk_model=SklearnClassifier(model)
  nltk_model.train(training)
  accuracy=nltk.classify.accuracy(nltk_model,testing)*100
  print("{} :Accuracy {}".format(name, accuracy))

KNN :Accuracy 87.58076094759511
Decision Tree :Accuracy 89.37544867193108
Random Forest :Accuracy 89.37544867193108
Logistic Regression :Accuracy 87.07824838478105
SGD Classifier :Accuracy 89.37544867193108
MultinomialNB :Accuracy 87.07824838478105
SVC :Accuracy 89.37544867193108


In [None]:
from sklearn.ensemble import VotingClassifier
from nltk.classify.scikitlearn import SklearnClassifier

Names= ['KNN', 'Decision Tree','Random Forest','Logistic Regression','SGD Classifier',
        'MultinomialNB','SVC']

classifier=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=108),
    MultinomialNB(),
    SVC(kernel='linear')
]

models=list(zip(Names,classifier))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print("Ensemble method :Accuracy {}".format(accuracy))

Ensemble method :Accuracy 89.37544867193108


In [None]:
txt_features,labels=zip(*testing)
prediction=nltk_ensemble.classify_many(txt_features)

In [None]:
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels,prediction),
    index=[['actual','actual'],['ham','spam']],
    columns=[['prediction','prediction'],['ham','spam']]
    )

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1212
           1       1.00      0.18      0.31       181

    accuracy                           0.89      1393
   macro avg       0.95      0.59      0.63      1393
weighted avg       0.91      0.89      0.86      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1212,0
actual,spam,148,33


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer=Tokenizer()
predict_msg = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",
          "Ok lar... Joking wif u oni...",
          "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [None]:
def predict_spam(predict_msg):
    new_seq = tokenizer.texts_to_sequences(predict_msg)
    padded = pad_sequences(new_seq, maxlen =1500
                      )
    return (model.predict(padded))
predict_spam(predict_msg)

array([0, 0, 0])