# 1. Load Libraries

In [19]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# 2. Load Dataset

In [2]:
# Read the labelled training set from CSV. The preview printed below shows
# a text column `content` and a boolean column `label`.
dataframe = pd.read_csv("training_data_full.csv")

# Plain array of the rows, used afterwards for positional column slicing.
dataset = dataframe.values

preview = dataframe.head()
print(preview)

                                             content  label
0  NY&C ONLINE: $20 Puffer Vests Back Tonight! Al...  False
1  MATCH ALERT: For the NEXT HOUR get an 800%-MAT...  False
2  SMS passcodes: 1266502  2079684  3401452  4762...  False
3  A new email has been received from notify@elea...  False
4  af0603beaf848c829834343539393735306a4c2d74008d...  False


# 3. Feature Engineering

In [3]:
# Split the raw array into model inputs and targets:
# column 0 is the message text, column 1 is the label.
X, Y = dataset[:, 0], dataset[:, 1]

In [4]:
# Map the class labels to integer codes. The fitted `encoder` is kept so
# that integer predictions can be mapped back with inverse_transform().
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
print(encoded_Y)

[0 0 0 ... 1 1 1]


In [5]:
# Data Preprocessing
# Turn every raw message in X into a lowercase, lemmatized, space-separated
# token string and collect the results in `documents`.
documents = []
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
# The name is kept so that any other cell referring to it keeps working.
stemmer = WordNetLemmatizer()

for raw_text in X:
    # Remove all the special characters (every non-word character becomes a space)
    document = re.sub(r'\W', ' ', str(raw_text))

    # Remove single letters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter from the start of the text.
    # The pattern must use an unescaped '^' (start-of-string anchor); the
    # previous r'\^...' looked for a literal caret, which can never be present
    # after the \W substitution above, so this step silently did nothing.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    # (no IGNORECASE flag: it has no effect on a whitespace-only pattern)
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization: split on whitespace, lemmatize each token, re-join.
    # split() also discards any leading/trailing space left by the steps above.
    document = ' '.join(stemmer.lemmatize(word) for word in document.split())

    documents.append(document)

In [6]:
# Build TF-IDF features from the cleaned documents, keeping at most 1000
# terms and dropping NLTK's English stop words.
# NOTE(review): the vectorizer is fitted on all documents before the
# train/test split performed later, so test rows influence the vocabulary
# and IDF weights. Also note that X is rebound here from raw text to the
# dense feature matrix.
english_stopwords = stopwords.words('english')
tfidfconverter = TfidfVectorizer(max_features=1000, stop_words=english_stopwords)
tfidf_matrix = tfidfconverter.fit_transform(documents)
X = tfidf_matrix.toarray()
print(X.shape)

(45000, 1000)


# 4. Training and Testing Sets

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_Y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Configure a random forest of 100 trees with a fixed seed.
rf_params = {'n_estimators': 100, 'random_state': 0}
classifier = RandomForestClassifier(**rf_params)

# Train on the training split. Left as the last expression so the fitted
# estimator is echoed as the cell output.
classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [14]:
# Predict integer class codes for the held-out rows, then map them back to
# the original labels with the fitted encoder.
y_pred = classifier.predict(X_test)
label_pred = encoder.inverse_transform(y_pred)

# Show the encoded predictions first, then the decoded labels.
for predictions in (y_pred, label_pred):
    print(predictions)

[1 0 0 ... 0 1 1]
[True False False ... False True True]


# 5. Evaluating the Model

In [15]:
# Report, in order: the confusion matrix, the per-class
# precision/recall/F1 table and the overall accuracy on the test split.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[6014   15]
 [   6 2965]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6029
           1       0.99      1.00      1.00      2971

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000

0.9976666666666667


In [12]:
# Persist the trained random forest to disk as a pickle file.
# NOTE(review): only the classifier is written here; the fitted
# TfidfVectorizer and LabelEncoder are not. If this file is meant to be
# used on new raw text, those would need to be saved as well.
with open('binary_classifier.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(classifier))

In [16]:
# Read the serialized classifier back from disk into `model`.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('binary_classifier.pkl', 'rb') as model_file:
    model = pickle.loads(model_file.read())

In [17]:
# Print the estimator restored from the pickle file to confirm it loaded.
print(model)

RandomForestClassifier(random_state=0)


In [20]:
# Train a logistic-regression classifier on the same training split.
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)

# `model` is rebound to the fitted logistic regression (fit() returns the
# estimator itself), replacing the random forest loaded from the pickle.
model = log_reg

In [21]:
# Predict integer class codes with the logistic regression, then map them
# back to the original labels. This overwrites the random forest's
# y_pred / label_pred.
y_pred = log_reg.predict(X_test)
label_pred = encoder.inverse_transform(y_pred)

# Show the encoded predictions first, then the decoded labels.
for predictions in (y_pred, label_pred):
    print(predictions)

[1 0 0 ... 0 1 1]
[True False False ... False True True]


In [22]:
# Same three reports as for the random forest, now on the logistic
# regression predictions: confusion matrix, per-class report, accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[5994   35]
 [  12 2959]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6029
           1       0.99      1.00      0.99      2971

    accuracy                           0.99      9000
   macro avg       0.99      1.00      0.99      9000
weighted avg       0.99      0.99      0.99      9000

0.9947777777777778
