In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pickle
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stude\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stude\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\stude\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
spam = pd.read_csv("spam.csv", encoding='utf-8')
X = spam['Message']
y = spam[['Category']]
spam

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Удалить все специальные символы
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # удалить все одиночные символы
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Удалить отдельные символы с начала
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Замена нескольких пробелов одним пробелом
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Удаление префикса «б»
    document = re.sub(r'^b\s+', '', document)
    
    # Преобразование в нижний регистр
    document = document.lower()
    
    # лемматизация - процесс приведения словоформы к лемме — её нормальной (словарной) форме
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)
    
documents[:10]

['go until jurong point crazy available only in bugis great world la buffet cine there got amore wat',
 'ok lar joking wif oni',
 'free entry in 2 wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate c apply 08452810075over18 s',
 'u dun say so early hor c already then say',
 'nah don think he go to usf he life around here though',
 'freemsg hey there darling it been 3 week now and no word back d like some fun you up for it still tb ok xxx std chgs to send 1 50 to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press 9 to copy your friend callertune',
 'winner a valued network customer you have been selected to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour only',
 'had your mobile 11 month or more r entitled to update to the latest colour mobile with 

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=13, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
print(X)
print(*X[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
0 0 0 0 0 1 0 0 0 0 0 0 0


In [5]:
codes = {
    'ham': 0,
    'spam': 1
}
y = y.replace({'Category': codes})
y

Unnamed: 0,Category
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), test_size=0.2, random_state=0)

In [7]:
# К ближайших соседей
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.884304932735426

In [8]:
# Деревья решений
model = DecisionTreeClassifier(max_depth=4, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9022421524663677

In [9]:
# Метод опорных векторов SVC
model = SVC(C=1, gamma=1, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9094170403587444

In [10]:
# Случайный лес
model = RandomForestClassifier(n_estimators=5, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9103139013452914

In [11]:
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(model,picklefile)

In [12]:
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred)) 

[[920  35]
 [ 65  95]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       955
           1       0.73      0.59      0.66       160

    accuracy                           0.91      1115
   macro avg       0.83      0.78      0.80      1115
weighted avg       0.90      0.91      0.91      1115

0.9103139013452914


In [14]:
from bs4 import BeautifulSoup
import requests

url = 'https://blog.textedly.com/spam-text-message-examples'

page = requests.get(url)

print(page.status_code)

200


In [15]:
soup = BeautifulSoup(page.text, "html.parser")
allTexts = soup.findAll('p')
print(allTexts)

[<p>Have you ever received a text message from a company or person you recognized, but it didn’t seem quite “right”? </p>, <p>Spam text messages (also known as phishing or “smishing” – SMS phishing) trick consumers into providing personal data to criminals who pose as a familiar business, organization or family member. Reviewing spam text message examples can help avoid falling victim to these dangerous schemes.</p>, <p>Criminals use phishing text messages to attain usernames and passwords, social security numbers, credit card numbers and PINs to commit fraud or identity theft. Other attacks focus on duping people into downloading viruses or malware by clicking seemingly innocent links.</p>, <p>If you don’t think you’d ever fall for a phishing text scam, think again. <a href="https://www.robokiller.com/spam-text-insights" rel="noopener" target="_blank"><span>11.94 billion spam texts were sent in May 2022</span></a> alone – that's almost 43 spam texts for every person in the U.S. In add

In [16]:
filteredTexts = []

for data in allTexts:
    filteredTexts.append(data.text)
        
filteredTexts[:20]

['Have you ever received a text message from a company or person you recognized, but it didn’t seem quite “right”?\xa0',
 'Spam text messages (also known as phishing or “smishing” – SMS phishing) trick consumers into providing personal data to criminals who pose as a familiar business, organization or family member. Reviewing spam text message examples can help avoid falling victim to these dangerous schemes.',
 'Criminals use phishing text messages to attain usernames and passwords, social security numbers, credit card numbers and PINs to commit fraud or identity theft. Other attacks focus on duping people into downloading viruses or malware by clicking seemingly innocent links.',
 "If you don’t think you’d ever fall for a phishing text scam, think again. 11.94 billion spam texts were sent in May 2022 alone – that's almost 43 spam texts for every person in the U.S. In addition, while the Federal Trade Commission (FTC) noted a drop in fraudulent calls, it also saw a 145% increase in re

In [17]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(filteredTexts)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(filteredTexts[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    if document != '':
        documents.append(document)
    
documents

['have you ever received text message from company or person you recognized but it didn seem quite right',
 'spam text message also known a phishing or smishing sm phishing trick consumer into providing personal data to criminal who pose a familiar business organization or family member reviewing spam text message example can help avoid falling victim to these dangerous scheme',
 'criminal use phishing text message to attain usernames and password social security number credit card number and pin to commit fraud or identity theft other attack focus on duping people into downloading virus or malware by clicking seemingly innocent link',
 'if you don think you ever fall for phishing text scam think again 11 94 billion spam text were sent in may 2022 alone that almost 43 spam text for every person in the s in addition while the federal trade commission ftc noted drop in fraudulent call it also saw 145 increase in report of scam texting',
 'today we ll share ten common spam text message ex

In [18]:
X = vectorizer.fit_transform(documents).toarray()
print(X)

[[0 1 0 0 0 0 0 1 0 0 0 0 1]
 [1 0 0 0 1 0 0 2 1 2 0 2 2]
 [0 0 0 0 0 0 0 1 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 2 2 3]
 [0 0 0 0 0 0 0 1 0 1 1 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 1 2]
 [1 0 1 0 0 0 0 0 0 1 0 0 1]
 [0 0 2 0 0 0 1 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 1 0 1 0 0]
 [0 1 0 1 0 1 1 0 1 0 0 0 2]
 [0 0 0 0 0 1 1 1 1 0 0 0 2]
 [0 1 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 2 1 0 0 1 0 0 0 1 1]
 [0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 2 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 2 2 0 0 2 0 0 0 0 1 0 1]
 [1 1 0 0 0 0 1 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 3 0 1 0 1 1]
 [0 0 0 0 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 0 1 0 0 1 1 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 1 2]
 [0 0 0 1 0 2 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 1]
 [1 0 0 0 0 0 0 1 0 1 1 0 1]
 [1 0 0 0 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 2 0 0 1 0 0 0 0 2]]


In [19]:
y_pred = model.predict(X)
print(y_pred[:20])

[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0]


In [20]:
spams = []

for sen in range(0, len(documents)):
    if y_pred[sen] == 1:
        spams.append(documents[sen])
        
spams

['winning an unexpected prize sound great in theory however being notified of winning contest you didn enter is dead giveaway of phishing text if you re unsure whether an offer is authentic contact the business directly to verify',
 'with delivery from amazon and fedex so commonplace now text message regarding package or order would be easy to overlook while shipper send legitimate shipping update text they ll never ask for personal information or money to complete delivery',
 'one of the most disturbing spam text message lead you to believe that family member is in trouble and need immediate financial help the sender try to convince you that wiring money is necessary to prevent financial or medical emergency or that loved one is involved in kidnapping',
 'people are increasingly aware of phishing text message scam if you re business owner or marketer your challenge is to ensure your sm marketing campaign look professional']