In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw = list(punctuation) + stopwords.words('indonesian')

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Import Data

In [2]:
df = pd.read_csv('spam.csv')
df.head()word_tokenize

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


In [18]:
df['label'].value_counts()

1    574
0    569
Name: label, dtype: int64

In [None]:
## Splitting Data

In [6]:
X = df['Teks']
y = df['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state = 42)

In [8]:
## Training Model

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
pipe_RF = Pipeline([
    ("prep", TfidfVectorizer(ngram_range=(1, 2), stop_words=sw, tokenizer=word_tokenize)),
    ("algo", RandomForestClassifier())
])

In [15]:
pipe_RF.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 stop_words=['!', '"', '#', '$', '%', '&', "'",
                                             '(', ')', '*', '+', ',', '-', '.',
                                             '/', ':', ';', '<', '=', '>', '?',
                                             '@', '[', '\\', ']', '^', '_', '`',
                                             '{', '|', ...],
                                 tokenizer=<function word_tokenize at 0x0000021F30B3DD30>)),
                ('algo', RandomForestClassifier())])

In [17]:
y_base = pipe_RF.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, accuracy_score

In [20]:
print(classification_report(y_test, y_base))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       114
           1       0.97      0.97      0.97       115

    accuracy                           0.97       229
   macro avg       0.97      0.97      0.97       229
weighted avg       0.97      0.97      0.97       229



In [21]:
accuracy_score(y_test, y_base)

0.9737991266375546

In [33]:
RF = Pipeline([
    ("prep", TfidfVectorizer(ngram_range=(1, 2), stop_words=sw, tokenizer=word_tokenize)),
    ("algo", RandomForestClassifier(n_jobs=-1, random_state=42))
])

In [None]:
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

In [29]:
param_RF = {
    "algo__n_estimators" : [100, 200, 300, 400],
    "algo__max_depth" : [None, 5, 10, 15],
    "algo__min_samples_leaf" : [1, 5, 10,15]
}

In [34]:
RF.get_params()

{'memory': None,
 'steps': [('prep', TfidfVectorizer(ngram_range=(1, 2),
                   stop_words=['!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                               '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                               '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', ...],
                   tokenizer=<function word_tokenize at 0x0000021F30B3DD30>)),
  ('algo', RandomForestClassifier(n_jobs=-1, random_state=42))],
 'verbose': False,
 'prep': TfidfVectorizer(ngram_range=(1, 2),
                 stop_words=['!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                             '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                             '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', ...],
                 tokenizer=<function word_tokenize at 0x0000021F30B3DD30>),
 'algo': RandomForestClassifier(n_jobs=-1, random_state=42),
 'prep__analyzer': 'word',
 'prep__binary': False,
 'prep__decode_error': 'strict',
 'pr

In [26]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [35]:
RF_Tuned = GridSearchCV(RF, param_RF, cv=skf, scoring='accuracy', n_jobs=-1, verbose=1)

In [36]:
RF_Tuned.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  4.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words=['!', '"',
                                                                    '#', '$',
                                                                    '%', '&',
                                                                    "'", '(',
                                                                    ')', '*',
                                                                    '+', ',',
                                                                    '-', '.',
                                                                    '/', ':',
                                                                    ';', '<',
                                                                    '=', '>',
                   

In [37]:
y_Tuned = RF_Tuned.predict(X_test)

In [38]:
accuracy_score(y_test, y_Tuned)

0.9737991266375546

In [40]:
test_1 = ["(INFO RESMI) No anda trpilih meraih dana bantuan bulan suci ramadhan Dr PT pertamina persero kode ID (479KL27) cek ID anda di www.infopertamina.ml"]

In [41]:
RF_Tuned.predict(test_1)

array([1], dtype=int64)

In [42]:
test_2 = ["""MELAYANI PESUGIHAN
DANA GOIB TANPA TUMBAL
PELET&SANTET
DI JAMIN BERHASIL
TANPA ADA RESIKO
silahkan WA:085233943119
"""]

In [43]:
RF_Tuned.predict(test_2)

array([0], dtype=int64)

In [45]:
RF_Tuned.predict_proba(test_2)

array([[0.84, 0.16]])

In [46]:
test_3 = [
"""4lhamdulillah sy4 LISNA
brkat p3sug1han h4lal
smua hutang sy4
sdh lunas,yng
mau sprti sya lihat
bukti v1di0 di www.duitg0ib.info
"""
]

In [47]:
RF_Tuned.predict_proba(test_3)

array([[0.77, 0.23]])

In [53]:
y_test

311     1
704     0
8       1
638     0
1009    0
       ..
708     0
911     0
169     1
843     0
595     0
Name: label, Length: 229, dtype: int64

In [58]:
pd.DataFrame(y_test)

Unnamed: 0,label
311,1
704,0
8,1
638,0
1009,0
...,...
708,0
911,0
169,1
843,0


In [60]:
pred = RF_Tuned.predict(X_test)
act = y_test
df_check = pd.DataFrame(y_test)
df_check['Prediction'] = pred
df_check['Text'] = X_test
df_check[df_check['label'] != df_check['Prediction']]

Unnamed: 0,label,Prediction,Text
582,0,1,2016/08/21 12:27:32 PIN baru TCash anda adalah...
212,1,0,"Sticker LINE yang lucu-lucu banyak banget loh,..."
229,1,0,"Use my invite code, nxmvf722ue, and get a free..."
900,0,1,"Maaf, Keyword yang anda masukkan ke 234 salah...."
992,0,1,"Pemakaian internet kamu sdh mencapai batasFUP,..."
316,1,0,Ingin Tahu Angka Jitu! Yg Keluar Putaran Pada ...


In [50]:
X_test.shape

(229,)

In [61]:
test_4 =["""Assalamu Alaikum wr,wb
         BPK/IBU Mohon Maaf mengganggu waktunya,
         Jika butuh MODAL Usaha dan lainnya, Silahkan chat kami ke WA : 08152790362"""]

In [62]:
RF_Tuned.predict(test_4)

array([0], dtype=int64)

In [63]:
pipe_RF.predict(test_4)

array([0], dtype=int64)