<a href="https://colab.research.google.com/github/tecXworld/Natural-Language-Processing/blob/main/SpamClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
# Fetch only the NLTK resources this notebook uses (the English stop-word list
# and WordNet for the lemmatizer) instead of the very large 'all' collection.
# 'omw-1.4' is included because some NLTK releases need it to load WordNet.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Mount Google Drive into the Colab filesystem so files stored there can be read.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the tab-separated SMS data; the file has no header row, so the two
# columns are named explicitly.
import csv

# QUOTE_NONE: message bodies can contain unbalanced double-quote characters.
# With the default quoting the parser treats them as field quotes and can run
# past the end of a line, silently merging several messages into one row.
messages = pd.read_csv('/content/gdrive/MyDrive/SMSSpamCollection', sep='\t',
                       names=["label", "message"], quoting=csv.QUOTE_NONE)

In [None]:
# Peek at the first five rows to confirm the two columns parsed as expected.
print(messages.head(n=5))

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
#Data cleaning and preprocessing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Text normalisers available to the cleaning step, plus the list that will
# collect one cleaned string per message.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()
corpus = list()

In [None]:
# Build the cleaned corpus: one normalised string per message.
#
# The stop-word set is created once here; rebuilding it inside the
# comprehension repeated the same work for every word of every message.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list in this cell so that re-running it does not append
# a second copy of every message to an already populated corpus.
corpus = []

# Iterate over the column values directly instead of looking rows up by the
# labels 0..len-1, which only works while the index is the default one.
for raw_message in messages['message']:
    # Replace everything except ASCII letters with a space, then lower-case
    # and split into words.
    review = re.sub('[^a-zA-Z]', ' ', raw_message)
    review = review.lower()
    review = review.split()
    # Drop stop words and lemmatize what remains.
    # (Stemming alternative: ps.stem(word) in place of wordnet.lemmatize(word).)
    review = [wordnet.lemmatize(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Show a short sample of the cleaned corpus; printing every message floods the
# notebook output and bloats the saved file.
print(*corpus[:10], sep = "\n")

In [None]:
# Bag Of Words Model
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the cleaned corpus and turn the resulting
# document-term count matrix into a dense array.
cv = CountVectorizer()
bow_matrix = cv.fit_transform(corpus)
X = bow_matrix.toarray()

#TF-IDF Model
#from sklearn.feature_extraction.text import TfidfVectorizer
#cv = TfidfVectorizer()
#X = cv.fit_transform(corpus).toarray()

In [None]:
# One-hot encode the label column: one indicator column per distinct label.
label_column = messages['label']
y = pd.get_dummies(label_column)
print(y)

      ham  spam
0       1     0
1       1     0
2       0     1
3       1     0
4       1     0
...   ...   ...
5567    0     1
5568    1     0
5569    1     0
5570    1     0
5571    1     0

[5572 rows x 2 columns]


In [None]:
# Binary target vector: 1 where the label is 'spam', 0 otherwise.
# It is derived from the label column itself rather than from the second
# column of the previous value of y: `y = y.iloc[:, 1].values` rebinds y to an
# array, so running the cell a second time fails, and it depends on the
# position of the 'spam' column.
y = (messages['label'] == 'spam').astype(int).values
print(y)

[0 0 1 ... 0 0 0]


In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split the
# same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Candidate classifiers, all with library defaults except for the seed.
# The decision tree and the random forest are randomised learners; without a
# fixed random_state their confusion matrices and accuracies change between
# runs. The seed matches the one used for train_test_split.
svc = SVC()
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(random_state=0)
lrc = LogisticRegression()
rfc = RandomForestClassifier(random_state=0)

# Short display name -> estimator; insertion order is the evaluation order.
clfs = {'SVC' : svc,'KN' : knc, 'NB': mnb, 'DT': dtc, 'LR': lrc, 'RF': rfc}

def train(clf, features, targets):
    """Fit ``clf`` in place by calling ``clf.fit(features, targets)``.

    Nothing is returned; the caller keeps using the same ``clf`` object.
    """
    clf.fit(features, targets)
    
def predict(clf, features):
    """Return the result of ``clf.predict(features)`` unchanged."""
    predictions = clf.predict(features)
    return predictions
# Per-classifier results, each stored as a (name, [value]) pair in the order
# the classifiers are evaluated.
pred_scores_word_vectors = []
cm = []

# Fit every candidate on the training split, predict the held-out split, and
# record its confusion matrix and accuracy.
for name, model in clfs.items():
    train(model, X_train, y_train)
    pred = predict(model, X_test)
    print("--------------", model, "-------------")
    print("Test", y_test)
    print("Pred", pred)
    matrix = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    cm.append((name, [matrix]))
    pred_scores_word_vectors.append((name, [accuracy]))

-------------- SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) -------------
Test [0 1 0 ... 0 1 0]
Pred [0 1 0 ... 0 1 0]
-------------- KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') -------------
Test [0 1 0 ... 0 1 0]
Pred [0 1 0 ... 0 1 0]
-------------- MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) -------------
Test [0 1 0 ... 0 1 0]
Pred [0 1 0 ... 0 1 0]
-------------- DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_

In [None]:
    # One (name, [confusion matrix]) entry per line.
    print("\n".join(str(entry) for entry in cm))

('SVC', [array([[955,   0],
       [ 19, 141]])])
('KN', [array([[955,   0],
       [ 91,  69]])])
('NB', [array([[936,  19],
       [  7, 153]])])
('DT', [array([[949,   6],
       [ 21, 139]])])
('LR', [array([[955,   0],
       [ 17, 143]])])
('RF', [array([[954,   1],
       [ 23, 137]])])


In [None]:
    # One (name, [accuracy]) entry per line.
    print("\n".join(str(entry) for entry in pred_scores_word_vectors))

('SVC', [0.9829596412556054])
('KN', [0.9183856502242153])
('NB', [0.9766816143497757])
('DT', [0.9757847533632287])
('LR', [0.9847533632286996])
('RF', [0.97847533632287])


In [None]:
def find(x):
    """Print a readable verdict for a predicted label.

    Prints "Message is SPAM" when ``x == 1`` and "Message is NOT Spam" for
    any other value. Nothing is returned.
    """
    verdict = "Message is SPAM" if x == 1 else "Message is NOT Spam"
    print(verdict)

In [None]:
# Classify one hand-written sample with the fitted Naive Bayes model.
# NOTE(review): the sample goes straight into cv.transform without the
# cleaning/lemmatizing applied to the training corpus; confirm this is intended.
text = ["Free tones Hope you enjoyed your new content"]
integers = cv.transform(text)
(x,) = mnb.predict(integers)  # exactly one sample, so exactly one prediction
find(x)

Message is SPAM


In [None]:
# Classify one hand-written sample with the fitted Naive Bayes model.
# NOTE(review): the sample goes straight into cv.transform without the
# cleaning/lemmatizing applied to the training corpus; confirm this is intended.
text = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]
integers = cv.transform(text)
(x,) = mnb.predict(integers)  # exactly one sample, so exactly one prediction
find(x)

Message is NOT Spam


In [None]:
# Classify one hand-written sample with the fitted Naive Bayes model.
# NOTE(review): the sample goes straight into cv.transform without the
# cleaning/lemmatizing applied to the training corpus; confirm this is intended.
text = ["Lol your always so convincing."]
integers = cv.transform(text)
(x,) = mnb.predict(integers)  # exactly one sample, so exactly one prediction
find(x)

Message is NOT Spam


In [None]:
# Classify one hand-written sample with the fitted Naive Bayes model.
# NOTE(review): the sample goes straight into cv.transform without the
# cleaning/lemmatizing applied to the training corpus; confirm this is intended.
text = ["Hey, What should I cook for dinner?"]
integers = cv.transform(text)
(x,) = mnb.predict(integers)  # exactly one sample, so exactly one prediction
find(x)

Message is NOT Spam


In [None]:
# Classify one hand-written sample with the fitted Naive Bayes model.
# NOTE(review): the sample goes straight into cv.transform without the
# cleaning/lemmatizing applied to the training corpus; confirm this is intended.
text = ["Winner! Hola you recieved 500 credit points."]
integers = cv.transform(text)
(x,) = mnb.predict(integers)  # exactly one sample, so exactly one prediction
find(x)

Message is SPAM
