In [57]:
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import SelectKBest

In [16]:
data = pd.read_csv('../data/spam.csv', encoding = 'latin-1')
data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [17]:
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [19]:
data.drop(columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace = True)

In [20]:
data.rename(columns = {'v1': 'type', 'v2': 'original_text'}, inplace = True)

In [21]:
data['processed_text'] = data['original_text'].apply(
    lambda x: re.sub('[^a-zA-Z]', ' ', x).lower()   
)

Токенезация

In [23]:
nlp = spacy.load("en_core_web_sm")

data['tokens'] = data['processed_text'].apply(lambda f: [token for token in nlp(f)])

In [24]:
data.head(10)

Unnamed: 0,type,original_text,processed_text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling it s been week s n...,"[freemsg, hey, there, darling, it, s, been, ..."
6,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me ...,"[even, my, brother, is, not, like, to, speak, ..."
7,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnamin...,"[as, per, your, request, , melle, melle, , o..."
8,spam,WINNER!! As a valued network customer you have...,winner as a valued network customer you have...,"[winner, , as, a, valued, network, customer,..."
9,spam,Had your mobile 11 months or more? U R entitle...,had your mobile months or more u r entitle...,"[had, your, mobile, , months, or, more, , ..."


Удаление стоп-слов

In [25]:
stopwords = nlp.Defaults.stop_words

In [27]:
data['purified_tokens'] = data['tokens'].apply(lambda f: [token for token in f if token.lower_ not in stopwords])

In [28]:
data.head(10)

Unnamed: 0,type,original_text,processed_text,tokens,purified_tokens
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail...","[jurong, point, , crazy, , available, bugis..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joking, wif, u, oni, ]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ...","[free, entry, , wkly, comp, win, fa, cup, fi..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea...","[u, dun, early, hor, , u, c, ]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ...","[nah, don, t, think, goes, usf, , lives]"
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling it s been week s n...,"[freemsg, hey, there, darling, it, s, been, ...","[freemsg, hey, darling, s, , week, s, word, ..."
6,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me ...,"[even, my, brother, is, not, like, to, speak, ...","[brother, like, speak, , treat, like, aids, p..."
7,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnamin...,"[as, per, your, request, , melle, melle, , o...","[request, , melle, melle, , oru, minnaminung..."
8,spam,WINNER!! As a valued network customer you have...,winner as a valued network customer you have...,"[winner, , as, a, valued, network, customer,...","[winner, , valued, network, customer, select..."
9,spam,Had your mobile 11 months or more? U R entitle...,had your mobile months or more u r entitle...,"[had, your, mobile, , months, or, more, , ...","[mobile, , months, , u, r, entitled, updat..."


Лемматизация

In [30]:
data['lematized_tokens'] = data['purified_tokens'].apply(lambda f: [word.lemma_ for word in f])

In [31]:
data.head()

Unnamed: 0,type,original_text,processed_text,tokens,purified_tokens,lematized_tokens
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail...","[jurong, point, , crazy, , available, bugis...","[jurong, point, , crazy, , available, bugis..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joke, wif, u, oni, ]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ...","[free, entry, , wkly, comp, win, fa, cup, fi...","[free, entry, , wkly, comp, win, fa, cup, fi..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea...","[u, dun, early, hor, , u, c, ]","[u, dun, early, hor, , u, c, ]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ...","[nah, don, t, think, goes, usf, , lives]","[nah, don, t, think, go, usf, , live]"


Vectorizer

In [33]:
tokens = []
for i in data['lematized_tokens']:
    tokens.append(' '.join([word for word in i]))
print(tokens[:2])

['jurong point   crazy    available bugis n great world la e buffet     cine get amore wat   ', 'ok lar     joke wif u oni   ']


In [41]:
count_vectorizer = CountVectorizer()
bow = count_vectorizer.fit_transform(tokens)
bow.shape

(5572, 6379)

In [42]:
bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

TF-IDF

In [43]:
TFIDF = TfidfVectorizer()
tfidf_vectorizer = TfidfVectorizer()
TFIDF = tfidf_vectorizer.fit_transform(tokens)
tfidf_vectorizer.vocabulary_

{'jurong': 2868,
 'point': 4148,
 'crazy': 1193,
 'available': 383,
 'bugis': 738,
 'great': 2273,
 'world': 6220,
 'la': 2971,
 'buffet': 736,
 'cine': 971,
 'get': 2159,
 'amore': 187,
 'wat': 6052,
 'ok': 3809,
 'lar': 2998,
 'joke': 2834,
 'wif': 6142,
 'oni': 3834,
 'free': 2036,
 'entry': 1710,
 'wkly': 6187,
 'comp': 1069,
 'win': 6154,
 'fa': 1827,
 'cup': 1245,
 'final': 1920,
 'tkts': 5628,
 'st': 5184,
 'text': 5515,
 'receive': 4476,
 'question': 4389,
 'std': 5213,
 'txt': 5783,
 'rate': 4435,
 'apply': 262,
 'dun': 1592,
 'early': 1609,
 'hor': 2504,
 'nah': 3585,
 'don': 1520,
 'think': 5561,
 'go': 2199,
 'usf': 5904,
 'live': 3111,
 'freemsg': 2042,
 'hey': 2436,
 'darle': 1294,
 'week': 6088,
 'word': 6213,
 'like': 3080,
 'fun': 2086,
 'tb': 5456,
 'xxx': 6287,
 'chgs': 929,
 'send': 4812,
 'rcv': 4445,
 'brother': 712,
 'speak': 5118,
 'treat': 5728,
 'aids': 125,
 'patent': 3989,
 'request': 4557,
 'melle': 3355,
 'oru': 3884,
 'minnaminunginte': 3410,
 'nurungu': 

In [44]:
TFIDF.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

LDA

In [46]:
n_samples = 2000
n_features = 1000
n_topics = 15
n_top_words = 30

In [47]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [48]:
LDA = LatentDirichletAllocation(n_components=n_topics, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(TFIDF)

In [49]:
print_top_words(LDA, tfidf_vectorizer.get_feature_names_out(), 9)

Topic #0:
later sorry ll meeting account point aight sir private
Topic #1:
room pretty small machan available mark num befor activity
Topic #2:
teach possible training ah password ride gautham wana stupid
Topic #3:
hrs valid line urgent try guarantee weekend prize land
Topic #4:
look secret think reveal admirer special contact find ur
Topic #5:
surf sleep prepare bowl beer kick unsold amp church
Topic #6:
ok come go not good get home time lor
Topic #7:
person library story dude sup howz realy yoga hide
Topic #8:
regard change tat request callertune convey download kano maga
Topic #9:
special update half swing mth check price rental rakhesh
Topic #10:
na angry song wine chennai deep wen basically wid
Topic #11:
yup sec sent simple opinion savamob esplanade discount silent
Topic #12:
lt gt know yeah like sure ll minute awesome
Topic #13:
win late customer txt gift prize draw double question
Topic #14:
free mobile text stop txt reply tone claim ur


Понижение размерности

In [52]:
data["type"] = data['type'].map({"ham": 0, "spam": 1})
y = data['type']

In [63]:
SKB_bow = SelectKBest(k=35)
x_bow = SKB_bow.fit_transform(bow, y)
x_train_bow, x_test_bow, y_train_bow, y_test_bow = train_test_split(x_bow,y, test_size=0.3)

In [64]:
SKB_tfidf = SelectKBest(k=35)
x_dfidf = SKB_tfidf.fit_transform(TFIDF, y)
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(x_dfidf, y, test_size=0.3)

Обучение

In [66]:
#Bow
Bagging = BaggingClassifier().fit(x_train_bow, y_train_bow)
print(classification_report(y_test_bow, Bagging.predict(x_test_bow)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1467
           1       0.93      0.82      0.88       205

    accuracy                           0.97      1672
   macro avg       0.95      0.91      0.93      1672
weighted avg       0.97      0.97      0.97      1672



In [67]:
KNN = KNeighborsClassifier().fit(x_train_bow, y_train_bow)
print(classification_report(y_test_bow, KNN.predict(x_test_bow)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1467
           1       0.96      0.69      0.80       205

    accuracy                           0.96      1672
   macro avg       0.96      0.84      0.89      1672
weighted avg       0.96      0.96      0.96      1672



In [68]:
#TF-IDF
Bagging = BaggingClassifier().fit(x_train_tf, y_train_tf)
print(classification_report(y_test_tf, Bagging.predict(x_test_tf)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1456
           1       0.88      0.81      0.84       216

    accuracy                           0.96      1672
   macro avg       0.93      0.89      0.91      1672
weighted avg       0.96      0.96      0.96      1672



In [69]:
KNN = KNeighborsClassifier().fit(x_train_tf, y_train_tf)
print(classification_report(y_test_tf, KNN.predict(x_test_tf)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1456
           1       0.89      0.72      0.79       216

    accuracy                           0.95      1672
   macro avg       0.93      0.85      0.88      1672
weighted avg       0.95      0.95      0.95      1672



In [None]:
#LDA
#TFIDF
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [73]:
LDA_tfidf = LatentDirichletAllocation(n_components=9, max_iter=20,
                                learning_method='online',
                                learning_offset=50).fit(TFIDF)
print_top_words(LDA_bow, tfidf_vectorizer.get_feature_names_out(), 9)

Topic #0:
lol number happy fine hour drive new com smile
Topic #1:
good love time know need day get like want
Topic #2:
ok go come not lor home get da tell
Topic #3:
text ll free stop min message phone sorry later
Topic #4:
gt lt ur send win free dear txt mobile


In [74]:
#BOW
LDA_bow = LatentDirichletAllocation(n_components=9, max_iter=20,
                                learning_method='online',
                                learning_offset=50).fit(bow)
print_top_words(LDA_bow, count_vectorizer.get_feature_names_out(), 9)

Topic #0:
love think lor go ask like home day miss
Topic #1:
win number claim prize ur cash contact award txt
Topic #2:
point account mobile update free show dude half mth
Topic #3:
da leave dear good morning day happy nice night
Topic #4:
come get ll not time ok know need want
Topic #5:
hi drive miss ur call nite aight bit yes
Topic #6:
free text stop reply ur txt send min msg
Topic #7:
gt lt cool office buy decimal post email bill
Topic #8:
pls town want book okie yup tel course trip
