# Spam Detection

In [2]:
import pandas as pd

from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv(r"DataSets\spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
lem = WordNetLemmatizer()

def text_preprocessor(text):
    text = "".join([i.lower() for i in text if i not in punctuation])
    tokens = word_tokenize(text)
    lst = [i for i in tokens if i not in stopwords.words("english")]
    text = " ".join([lem.lemmatize(i,"v") for i in lst])
    return text

In [5]:
df["Message"].apply(func=text_preprocessor)

0       go jurong point crazy available bugis n great ...
1                                   ok lar joke wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4                nah dont think go usf live around though
                              ...                        
5567    2nd time try 2 contact u u £750 pound prize 2 ...
5568                             ü b go esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitch act like id interest buy something e...
5571                                       rofl true name
Name: Message, Length: 5572, dtype: object

In [6]:
df["Category"] = df["Category"].replace(["ham","spam"],[0,1])
df

  df["Category"] = df["Category"].replace(["ham","spam"],[0,1])


Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [7]:
cv = CountVectorizer()
x_vector = cv.fit_transform(df["Message"])
x_vector

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 74098 stored elements and shape (5572, 8709)>

In [8]:
y = df["Category"]

In [9]:
y.value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [10]:
ros = RandomOverSampler()
x_res,y_res = ros.fit_resample(x_vector,y)
y_res.value_counts()

Category
0    4825
1    4825
Name: count, dtype: int64

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x_res,y_res,test_size=0.3,random_state=42)

In [12]:
models = [SVC(),MultinomialNB()]
for i in models:
    print(i)
    i.fit(x_train,y_train)
    y_pred = i.predict(x_test)
    print(classification_report(y_test,y_pred))
    print(cross_val_score(i,x_train,y_train).mean())
    print()

SVC()
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1469
           1       1.00      1.00      1.00      1426

    accuracy                           1.00      2895
   macro avg       1.00      1.00      1.00      2895
weighted avg       1.00      1.00      1.00      2895

0.9951147298297556

MultinomialNB()
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1469
           1       0.99      0.98      0.99      1426

    accuracy                           0.99      2895
   macro avg       0.99      0.99      0.99      2895
weighted avg       0.99      0.99      0.99      2895

0.981939304219097



In [13]:
svc = SVC()
svc.fit(x_train,y_train)
svc.predict(x_test)
print(cross_val_score(svc,x_train,y_train).mean())

0.9951147298297556


In [14]:
def check_spam(text):
    tp = text_preprocessor(text)
    vector = cv.transform([tp])
    op = svc.predict(vector)[0]
    if op == 1:
        print("Spam")
    else:
        print("Not Spam")

In [15]:
text = "I love programming in Python! It's amazing, and I learn new things every day."
check_spam(text)

Not Spam


In [16]:
text = "free free free"
check_spam(text)

Spam
