In [3]:
import numpy as np 
import pandas as pd 
#import data and cleaning
data = pd.read_csv('spam.csv', encoding = 'latin1')
data1 = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [48]:
from collections import Counter
def structures_counter(data1):
    structures = Counter()
    for text in data1:
        structure = get_email_structure(text)
        structures[structure] += 1
    return structures

In [35]:
#train test val split
from sklearn.model_selection import train_test_split
X = np.array(data1['v2'])
y = np.array((data1['v1'] == 'spam') *1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [49]:
from sklearn.base import BaseEstimator, TransformerMixin

import re
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for text in X:
            if self.lower_case:
                text = text.lower()
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [50]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'i': 2, 'm': 2, 'no': 1, 'in': 1, 'the': 1, 'same': 1, 'boat': 1, 'still': 1, 'here': 1, 'at': 1, 'my': 1, 'mom': 1, 'check': 1, 'me': 1, 'out': 1, 'on': 1, 'yo': 1, 'half': 1, 'nake': 1}),
       Counter({'number': 3, 'bank': 1, 'of': 1, 'granit': 1, 'issu': 1, 'strong': 1, 'buy': 1, 'explos': 1, 'pick': 1, 'for': 1, 'our': 1, 'member': 1, 'up': 1, 'over': 1, 'nasdaq': 1, 'symbol': 1, 'cdgt': 1, 'that': 1, 'is': 1, 'a': 1, 'per': 1}),
       Counter({'they': 1, 'r': 1, 'give': 1, 'a': 1, 'second': 1, 'chanc': 1, 'to': 1, 'rahul': 1, 'dengra': 1})],
      dtype=object)

In [51]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [52]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [53]:
X_few_vectors.toarray()

array([[11,  0,  2,  2,  0,  1,  1,  1,  1,  1,  1],
       [19,  3,  0,  0,  1,  0,  0,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0]], dtype=int64)

In [54]:
vocab_transformer.vocabulary_

{'number': 1,
 'i': 2,
 'm': 3,
 'a': 4,
 'no': 5,
 'in': 6,
 'the': 7,
 'same': 8,
 'boat': 9,
 'still': 10}

In [55]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ....................... , score=0.9851951547779273, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9845222072678331, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9824915824915825, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


0.9840696481791144

In [57]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 98.54%
Recall: 90.00%
