In [1]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.6.2-cp36-cp36m-manylinux2010_x86_64.whl (458.3 MB)
     |████████████████████████████████| 458.3 MB 15 kB/s              
Collecting termcolor~=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting typing-extensions~=3.7.4
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting opt-einsum~=3.3.0
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     |████████████████████████████████| 65 kB 1.1 MB/s             
[?25hCollecting gast==0.4.0
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorflow-estimator<2.7,>=2.6.0
  Downloading tensorflow_estimator-2.6.0-py2.py3-none-any.whl (462 kB)
     |████████████████████████████████| 462 kB 40.2 MB/s            
[?25hCollecting keras<2.7,>=2.6.0
  Downloading keras-2.6.0-py2.py3-none-any.whl (1.3 MB)
     |████

In [16]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import precision_recall_cutoff as prc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [17]:
## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'spambase_csv.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

spam = pd.read_csv(file_content_stream)

spam.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [18]:
spam.shape

(4601, 58)

In [19]:
spam.dropna().shape

(4601, 58)

In [20]:
spam['class'].value_counts() / spam.shape[0]

0    0.605955
1    0.394045
Name: class, dtype: float64

## Splitting the data

In [21]:
## Defining input and target variables
X = spam.drop(columns = ['class'])
Y = spam['class']

## Splitting the data
X_training, X_testing, Y_training, Y_testing = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Feature Selection

In [24]:
## Performing feature selection to limit the number of input variables in the final model

## Defining empty lists to store results
variable_support = list()

## Repeating the process 10 times
for i in tqdm(range(0, 25)):
    
    ## Building the RFECV model
    rf_rfecv = RFECV(estimator = RandomForestClassifier(max_depth = 3), step = 1, min_features_to_select = 2,
                    cv = 3, scoring = 'accuracy', n_jobs = -1).fit(X_training, Y_training)
    
    ## Appenind RFECV results
    variable_support.append(rf_rfecv.support_)
    
    
## Extracting variable selection results
support = pd.DataFrame(variable_support, columns = X_training.columns)
support_2 = 100 * support.apply(np.sum, axis = 0) / support.shape[0]
support_3 = pd.DataFrame({'Variable': support_2.index, 'Score': support_2.values})

100%|██████████| 25/25 [11:12<00:00, 26.89s/it]


In [25]:
## Printing results
support_3.sort_values('Score', ascending = False).head(10)

Unnamed: 0,Variable,Score
56,capital_run_length_total,100.0
54,capital_run_length_average,100.0
26,word_freq_george,100.0
25,word_freq_hpl,100.0
24,word_freq_hp,100.0
23,word_freq_money,100.0
22,word_freq_000,100.0
15,word_freq_free,100.0
51,char_freq_%21,100.0
52,char_freq_%24,100.0


## Redefining input variables

In [26]:
X_training = X_training[['capital_run_length_total', 'capital_run_length_average', 'word_freq_george', 
                         'word_freq_hpl', 'word_freq_hp', 'word_freq_money', 'word_freq_000', 'word_freq_free', 
                         'char_freq_%21', 'char_freq_%24']]
X_testing = X_testing[['capital_run_length_total', 'capital_run_length_average', 'word_freq_george', 
                         'word_freq_hpl', 'word_freq_hp', 'word_freq_money', 'word_freq_000', 'word_freq_free', 
                         'char_freq_%21', 'char_freq_%24']]

## Modeling

In [49]:
## Building a Decision Tree Classifier model

tree_md = DecisionTreeClassifier(max_depth = 3).fit(X_training, Y_training)

## Predicting on the testing set
model_preds = tree_md.predict_proba(X_testing)[:, 1]

## Using precision-recall curve to find optimal cutoff value
model_labels = prc.precision_recall_cutoff(Y_testing, model_preds)

## Computing the classification report for the DTC model
print(classification_report(Y_testing, model_labels))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       558
           1       0.83      0.84      0.84       363

    accuracy                           0.87       921
   macro avg       0.86      0.86      0.86       921
weighted avg       0.87      0.87      0.87       921



In [47]:
## Building a multi-layer perceptron Neural Network model

nn_md = tf.keras.models.Sequential([tf.keras.layers.Dense(4, input_dim = 10, activation = 'tanh'), 
                                    tf.keras.layers.Dense(2, activation = 'softmax')])

nn_md.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])

## Fitting the model and using the model to predict on the test dataset
nn_md.fit(X_training, tf.keras.utils.to_categorical(Y_training, num_classes = 2), epochs = 100, batch_size = 500, 
                  validation_data = (X_testing, tf.keras.utils.to_categorical(Y_testing, num_classes = 2)), verbose = 0)


## Predicting on the testing set
model_preds = nn_md.predict(X_testing)[:, 1]

## Using precision-recall curve to find optimal cutoff value
model_labels = prc.precision_recall_cutoff(Y_testing, model_preds)

## Computing the classification report for the NN model
print(classification_report(Y_testing, model_labels))

              precision    recall  f1-score   support

           0       0.87      0.43      0.58       558
           1       0.51      0.90      0.65       363

    accuracy                           0.62       921
   macro avg       0.69      0.67      0.61       921
weighted avg       0.73      0.62      0.61       921

