# 1. Load Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# 2. Load Dataset

In [2]:
# Read the labelled training set, preview it, and keep a raw NumPy view for slicing
dataframe = pd.read_csv("training_data_full.csv")
print(dataframe.head())
dataset = dataframe.values

                                             content  label
0  NY&C ONLINE: $20 Puffer Vests Back Tonight! Al...  False
1  MATCH ALERT: For the NEXT HOUR get an 800%-MAT...  False
2  SMS passcodes: 1266502  2079684  3401452  4762...  False
3  A new email has been received from notify@elea...  False
4  af0603beaf848c829834343539393735306a4c2d74008d...  False


# 3. Feature Engineering

In [3]:
# Column 0 holds the message text (features), column 1 the label (target)
X, Y = dataset[:, 0], dataset[:, 1]

In [4]:
# Map the class labels to integer codes; the fitted encoder is kept so that
# predictions can be converted back to the original labels later
encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)
print(encoded_Y)

[0 0 0 ... 1 1 1]


In [5]:
# Data Preprocessing
def preprocessing(X):
    """Clean and lemmatize raw text documents ahead of TF-IDF vectorization.

    Every entry is passed through ``str`` first, so non-string values
    (e.g. NaN) are tolerated. Non-word characters and isolated single letters
    are removed, whitespace is collapsed, the text is lower-cased and each
    token is lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    X : sequence
        Iterable of raw documents (a list or 1-D array).

    Returns
    -------
    list of str
        One cleaned, space-joined document per input entry, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    documents = []

    for raw in X:
        # Replace every non-word character (punctuation, symbols) with a space
        document = re.sub(r'\W', ' ', str(raw))

        # Drop single letters that are surrounded by whitespace
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Drop a single letter at the very start of the document. The caret must
        # be an unescaped anchor: an escaped '\^' looks for a literal '^', which
        # cannot exist after the substitution above. This also covers the
        # leading 'b' left over from a stringified bytes literal.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

        # Collapse runs of whitespace into a single space
        document = re.sub(r'\s+', ' ', document)

        # Lower-case, tokenize on whitespace and lemmatize each token
        tokens = document.lower().split()
        documents.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))

    return documents

In [6]:
# Generate TF-IDF feature values (at most 1000 terms) with English stop words removed.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test
# split in the next section, so the held-out rows influence the vocabulary and IDF
# weights; consider splitting first and fitting on the training rows only.
# NOTE(review): X is reassigned from raw text to the feature matrix, so re-running
# this cell without re-running the cells above would vectorize numeric rows.
tfidfconverter = TfidfVectorizer(max_features=1000, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(preprocessing(X)).toarray()
# Persist the fitted vectorizer; the context manager guarantees the file handle
# is flushed and closed even if pickling fails
with open("tfidfconverter.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidfconverter, vectorizer_file)
print(X.shape)

(45000, 1000)


# 4. Training and Testing Sets

In [7]:
# Hold out 20% of the samples for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_Y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Create a random forest classifier with 100 trees; random_state=0 fixes the seed
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit the classifier on the training split. Kept as the cell's last expression,
# so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train) 

RandomForestClassifier(random_state=0)

In [9]:
# predict 
def predict(classifier, X_test):
    """Return whatever ``classifier.predict`` yields for the rows in ``X_test``."""
    return classifier.predict(X_test)

# inverse transform
def inverse_transform(encoder, y_pred):
    """Convert encoded predictions ``y_pred`` back to labels via ``encoder.inverse_transform``."""
    return encoder.inverse_transform(y_pred)

# 5. Evaluating the Model

In [10]:
# Score the random forest on the held-out split
y_pred = predict(classifier, X_test)
# Print the confusion matrix, per-class report and accuracy, one after another
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)


[[6014   15]
 [   6 2965]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6029
           1       0.99      1.00      1.00      2971

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000

0.9976666666666667


In [11]:
# Serialize the trained classifier to disk
with open('binary_classifier.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [12]:
# Restore the classifier from its pickle file.
# NOTE: unpickling executes code embedded in the file; only load trusted pickles.
with open('binary_classifier.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)

# 6. Predict the new unlabeled Dataset

In [20]:
# Read the unlabeled data, keep a raw NumPy view for slicing, and show the frame
test_dataframe = pd.read_csv("test_data_10000.csv")
test_dataset = test_dataframe.values
print(test_dataframe)

                                                 content  label
0      NY&C ONLINE: $20 Puffer Vests Back Tonight! Al...    NaN
1      Hi, thanks for visiting Core Physicians. Pleas...    NaN
2      There is a new VET available from Sep 30, 2020...    NaN
3      Pres. Trump has requested your input on nomina...    NaN
4      NY&C ONLINE: $20 Puffer Vests Back Tonight! Al...    NaN
...                                                  ...    ...
9996   Thomas Podmajersky paid you $100.00 Payment fo...    NaN
9997   9/28 from being an @Home Learner then we would...    NaN
9998   Time is running out! Complete the 2020 Census ...    NaN
9999   Pres. Trump has requested your input on nomina...    NaN
10000  Pres. Trump has requested your input on nomina...    NaN

[10001 rows x 2 columns]


In [21]:
# Clean the text column and project it onto the already-fitted TF-IDF vocabulary
# (transform only; the vectorizer is not refitted here)
test_documents = preprocessing(test_dataset[:, 0])
X_test_data = tfidfconverter.transform(test_documents).toarray()
print(len(X_test_data))

10001


In [22]:
# Predict encoded classes for the new data, then map them back to the original labels
y_pred = predict(classifier, X_test_data)
label_pred = inverse_transform(encoder, y_pred)
print(len(y_pred))
print(label_pred)

10001
[False False False ... False False False]


In [23]:
# Overwrite the 'label' column of test_dataframe with the predicted labels
# (this mutates the frame in place), then write the frame to a CSV file
# without the index column.
test_dataframe['label'] = label_pred
test_dataframe.to_csv("test_data_10000_1_test.csv", index=False)

In [None]:
# Train a logistic-regression baseline on the same training split.
# fit() returns the estimator itself, so `model` is an alias of `log_reg`.
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)
model = log_reg

In [None]:
# Predict the held-out split with logistic regression and decode to the original labels
y_pred = predict(log_reg, X_test)
label_pred = inverse_transform(encoder, y_pred)
print(y_pred, label_pred, sep="\n")

In [None]:
# Print the confusion matrix, per-class report and accuracy for the current y_pred
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)