In [10]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

plt.style.use("fivethirtyeight")

df = pd.read_csv("../Dataset_5971.csv")
df.LABEL = df.LABEL.str.lower()

### Pre-processing function

In [11]:
def preprocess(text, 
            phone_token = ' <PHONE> ',
            email_token = ' <EMAIL> ',
            url_token = ' <URL> ',
            num_token = ' <NUM> ',):
    
    #Capitalization removal
    text = text.lower()

    #PHONE NUMBER token substitution
    text = re.sub(r'(\(\d{2}\))\s?\d{8,}|\d{10,}', 
                  phone_token, text, flags=re.MULTILINE)
    #EMAIL token substitution
    text = re.sub("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+", 
                  email_token, text, flags=re.MULTILINE)
    #URL token substitution
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 
                  url_token, text, flags=re.MULTILINE)
    #NUMERIC token substitution
    text = re.sub(r'[0-9]+', 
                  num_token, text, flags=re.MULTILINE)
    #Special characters removal
    text = re.sub(r'([^\w\s<>])|(_)', 
                  " ", text, flags=re.MULTILINE)
    #Multiple space removal
    text = re.sub(r'\s+', 
                  " ", text, flags=re.MULTILINE)

    return text.strip()

In [12]:
df['PREPROCESS'] = df['TEXT'].apply(preprocess)

In [17]:
df.LABEL = df.LABEL.apply(lambda x: x if x=="smishing" else "legitimate")

In [18]:
train, dev = train_test_split(df, test_size=0.2, stratify=df.LABEL, random_state=0)

### Pipeline
##### I decided to approach the problem as a basic TEXT -> VECTOR -> CLASSIFIER pipeline

In [19]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SGDClassifier()),
])

### Grid Search
- For the vectorizer, the search involves three parameters (Minimum document frequency, Maximum document frequency and NGRAM range), for those ichoose values that have worked well for me in the past.
- As for classifier, we consider 4 well known classifiers for machine learning problems and allow variation of their main parameter mostly.

In [20]:
parameters = [
    {'vectorizer': (CountVectorizer(), TfidfVectorizer()),
     'vectorizer__min_df': (0.01, 0.05, 0.1),
     'vectorizer__max_df': (0.5, 0.75, 1.0),
     'vectorizer__ngram_range': ((1,1), (1,2), (1,3)),
     'classifier': (SGDClassifier(),),
     'classifier__penalty': ('l2', 'l1', 'elasticnet')
    },
    {'vectorizer': (CountVectorizer(), TfidfVectorizer()),
     'vectorizer__min_df': (0.01, 0.05, 0.1),
     'vectorizer__max_df': (0.5, 0.75, 1.0),
     'vectorizer__ngram_range': ((1,1), (1,2), (1,3)),
     'classifier': (GaussianNB(),),
     'classifier__var_smoothing': (1e-10, 1e-9, 1e-8)
    },
    {'vectorizer': (CountVectorizer(), TfidfVectorizer()),
     'vectorizer__min_df': (0.01, 0.05, 0.1),
     'vectorizer__max_df': (0.5, 0.75, 1.0),
     'vectorizer__ngram_range': ((1,1), (1,2), (1,3)),
     'classifier': (LogisticRegression(),),
     'classifier__C': (0.1, 1, 10)
    },
    {'vectorizer': (CountVectorizer(), TfidfVectorizer()),
     'vectorizer__min_df': (0.01, 0.05, 0.1),
     'vectorizer__max_df': (0.5, 0.75, 1.0),
     'vectorizer__ngram_range': ((1,1), (1,2), (1,3)),
     'classifier': (RandomForestClassifier(),),
     'classifier__max_depth': (None, 5, 10, 20),
    },
]

##### The search is performed using a 5 fold crossvalidation strategy which is built in the search object from SKLEARN

In [21]:
search = GridSearchCV(pipeline, parameters, cv=5, return_train_score=True, verbose=1)

In [22]:
search.fit(train.PREPROCESS,smishing_model/.LABEL)

Fitting 5 folds for each of 702 candidates, totalling 3510 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

#### Best classifier

In [23]:
search.best_estimator_

- Prediction

In [24]:
best_classifier = search.best_estimator_
dev["PREDICTION"] = best_classifier.predict(dev.PREPROCESS)

- Accuracy

In [25]:
precision_score(dev.LABEL=="smishing", dev.PREDICTION=="smishing")

0.8839285714285714

In [26]:
recall_score(dev.LABEL=="smishing", dev.PREDICTION=="smishing")

0.7734375

In [27]:
f1_score(dev.LABEL=="smishing", dev.PREDICTION=="smishing")

0.825