In [3]:
import numpy as np 
import os 
import pandas as pd
import re

In [8]:
train_data = pd.read_csv("Datasets/SMS_train.csv",encoding="cp1252")
test_data = pd.read_csv("Datasets/SMS_test.csv",encoding="cp1252")

In [10]:
classes = {"Non-Spam":0,"Spam":1}
train_data['Label'] = train_data['Label'].map(classes)
test_data['Label'] = test_data['Label'].map(classes)
train_data.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,0
1,2,The guy did some bitching but I acted like i'd...,0
2,3,"Pity, * was in mood for that. So...any other s...",0
3,4,Will ü b going to esplanade fr home?,0
4,5,This is the 2nd time we have tried 2 contact u...,1


In [13]:
train_data.drop("S. No.",inplace=True,axis=1)

In [16]:
print(train_data.shape)
test_data.drop("S. No.",inplace=True,axis=1)
print(test_data.shape)

(957, 2)
(125, 2)


In [27]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

train_data['Message_body'] = train_data['Message_body'].apply(preprocessor)
test_data['Message_body'] = test_data['Message_body'].apply(preprocessor)

In [20]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [21]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\infra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [23]:
X_train = train_data['Message_body'].values
y_train = train_data['Label'].values

X_test = test_data['Message_body'].values
y_test = test_data['Label'].values

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer,
                        stop_words=stop,
                        ngram_range=(1,1),
                        norm="l2")

lr = LogisticRegression(random_state=1,solver="liblinear",C=10.0)

pipeline = Pipeline([("vect",tfidf),
                    ("clf",lr)])
pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 TfidfVectorizer(lowercase=False,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function tokenizer at 0x000001EA6CF4BD30>)),
                ('clf',
                 LogisticRegression(C=10.0, random_state=1,
                                    solver='liblinear'))])

In [50]:
print("Training score: {}".format(pipeline.score(X_train,y_train)))
print("Testing score: {}".format(pipeline.score(X_test,y_test)))

Training score: 1.0
Testing score: 0.84


In [51]:
from sklearn.metrics import classification_report
prediction = pipeline.predict(X_test)
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83        69
           1       0.74      1.00      0.85        56

    accuracy                           0.84       125
   macro avg       0.87      0.86      0.84       125
weighted avg       0.88      0.84      0.84       125



In [52]:
from sklearn.model_selection import GridSearchCV

param_grid = [{"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
              "vect__tokenizer":[tokenizer,tokenizer_porter],
              "clf__penalty":['l1','l2'],
              "clf__C":[1.0,10.0,100.0]},
             {"vect__ngram_range":[(1,1)],
             "vect__stop_words":[stop,None],
             "vect__tokenizer":[tokenizer,tokenizer_porter],
             "clf__penalty":["l1","l2"],
             "clf__C":[1.0,10.0,100.0],
             "vect__norm":[None],
             "vect__use_idf":[False]}]
grid = GridSearchCV(pipeline,param_grid,
                   scoring="accuracy",
                   cv=5,
                   verbose=2,
                   n_jobs=-1)

In [53]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   12.5s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
     

In [55]:
print("Best parameter: {}".format(grid.best_params_))
print("Accuracy: {}".format(grid.best_score_))

Best parameter: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x000001EA6CF4B8B0>}
Accuracy: 0.981184554973822


In [58]:
clf = grid.best_estimator_
print("test accuracy: {}".format(clf.score(X_test,y_test)))

test accuracy: 0.856


In [59]:
pred = clf.predict(X_test)
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84        67
           1       0.76      1.00      0.87        58

    accuracy                           0.86       125
   macro avg       0.88      0.87      0.86       125
weighted avg       0.89      0.86      0.85       125

