In [1]:
import pickle
import re
import string
import tarfile

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

import sklearn
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,\
                            roc_curve, auc, roc_auc_score, f1_score, precision_recall_curve,\
                            precision_score, average_precision_score, recall_score
from sklearn.metrics.scorer import make_scorer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")
import scikitplot 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

### data 1: read from spam.csv

In [3]:
# Only the first two columns of spam.csv are read; they are renamed to the
# same 'label' / 'text' names that read_label_text uses for the Enron rows.
sms_columns = ['label', 'text']
data1 = pd.read_csv('spam.csv', encoding='latin-1', usecols=[0, 1])
data1.columns = sms_columns
data1.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### data 2: add Enron spam and ham emails (Enron-Spam dataset, pre-processed form)
- Link: http://www2.aueb.gr/users/ion/data/enron-spam/

In [288]:
def _label_from_member_name(name):
    """Return 'ham' or 'spam' for an archive member path, or None if unlabeled.

    The path is split on '/' and '.', and the last token that is exactly
    'ham' or 'spam' decides the label (e.g. 'enron1/spam/0002.graham.spam.txt'
    -> 'spam').  Matching whole tokens instead of substrings keeps a name
    such as 'graham' from being mistaken for a ham message.
    """
    for token in reversed(name.replace('.', '/').split('/')):
        if token in ('ham', 'spam'):
            return token
    return None


def read_label_text(file_name):
    """
    Read every ham/spam message in a gzipped tar archive into a data frame.

    Parameters
    ----------
    file_name : str
        Path to a ``.tar.gz`` archive whose member paths carry a ``ham`` or
        ``spam`` path/file-name token (as in the Enron-Spam archives).

    Returns
    -------
    pandas.DataFrame
        One row per labeled regular file, with columns ``label``
        ('ham' or 'spam') and ``text`` (the file's lines joined by single
        spaces).  The columns are present even when no member matched.

    Notes
    -----
    Errors raised by ``tarfile.open`` (missing or corrupt archive) propagate
    to the caller; the archive is closed in every case.
    """
    rows = []
    with tarfile.open(file_name, "r:gz") as tar:
        for member in tar.getmembers():
            label = _label_from_member_name(member.name)
            if label is None:
                continue
            f = tar.extractfile(member)
            # extractfile returns None for directories and other non-file members.
            if f is None:
                continue
            with f:
                row = b' '.join(f.read().splitlines())
            # Undecodable bytes are dropped rather than aborting the whole load.
            row = row.decode(encoding='utf-8', errors='ignore')
            rows.append({'label': label, 'text': row})
    return pd.DataFrame(rows, columns=['label', 'text'])

In [289]:
def read_enron_dataset(file_names):
    """
    Read each archive in `file_names` and stack the results into one data frame.

    Parameters
    ----------
    file_names : iterable of str
        Paths of the ``.tar.gz`` archives, each passed to ``read_label_text``.

    Returns
    -------
    pandas.DataFrame
        All rows from all archives, in input order, under a fresh 0..n-1
        index.  An empty `file_names` yields an empty frame with the
        ``label`` and ``text`` columns.
    """
    # Collect the per-archive frames first and concatenate once; growing a
    # frame inside the loop copies all previous rows on every iteration.
    frames = [read_label_text(name) for name in file_names]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=['label', 'text'])
    return pd.concat(frames, ignore_index=True)

In [290]:
# Archive names enron1.tar.gz ... enron6.tar.gz, expected in the working directory.
file_names = ["enron%d.tar.gz" % part for part in range(1, 7)]

In [291]:
# Read every archive listed in file_names and combine them into one frame.
data2 = read_enron_dataset(file_names)

### data = data1 + data2 

In [292]:
# Stack the spam.csv rows on top of the Enron rows under a fresh 0..n-1 index.
frames = [data1, data2]
data = pd.concat(frames, axis=0, ignore_index=True)

In [293]:
# (rows, columns) of the combined frame.
data.shape

(39288, 2)

In [294]:
# Per-label summary of the text column: count, unique, top and freq.
data.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,21370,20106,"Sorry, I'll call later",30
spam,17918,15201,Subject:,51


In [295]:
# Add the character count of each message and a numeric target column.
# Note the encoding direction: ham is mapped to 1 and spam to 0.
label_to_cat = {'ham': 1, 'spam': 0}
data['length'] = data['text'].str.len()
data['cat'] = data['label'].map(label_to_cat)
data.head()

Unnamed: 0,label,text,length,cat
0,ham,"Go until jurong point, crazy.. Available only ...",111,1
1,ham,Ok lar... Joking wif u oni...,29,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,0
3,ham,U dun say so early hor... U c already then say...,49,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,1


In [298]:
# A single character class covering every character in string.punctuation,
# compiled once here so each call only pays for the substitution.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Stop words held in a set so membership checks are constant-time.
stop_words = set(stopwords.words('english'))


def test_re(s):  # From Vinko's solution, with fix.
    """Return `s` with every character found in string.punctuation removed."""
    return regex.sub('', s)

In [297]:
def remove_punc_stop2(text):
    """
    Strip punctuation, lower-case, and drop English stop words from `text`.

    Parameters
    ----------
    text : str (or an iterable of str fragments, which are concatenated)
        The raw message.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces; an empty
        string when nothing is left.
    """
    if not isinstance(text, str):
        # Earlier versions iterated over `text` and joined the pieces, so an
        # iterable of fragments is still accepted.
        text = "".join(text)
    # `regex` is a single-character class, so one pass over the whole string
    # gives the same result as substituting character by character.
    words = test_re(text).split()
    lowered = (word.lower() for word in words)
    # Use the prebuilt `stop_words` set: calling stopwords.words("english")
    # per word rebuilds the list each time and scans it linearly.
    return " ".join(word for word in lowered if word not in stop_words)

### clean data

In [300]:
# Run the punctuation / stop-word cleaner over every message.
data['cleaned text'] = data['text'].apply(remove_punc_stop2)

In [303]:
# Number of cleaned messages (length of the new column).
data['cleaned text'].shape

(39288,)

### store cleaned data

In [329]:
## store cleaned data
# Write the cleaned text and the numeric target to separate pickle files in
# the working directory.
for pickle_path, column in (('data_cleaned.pkl', 'cleaned text'),
                            ('data_cat.pkl', 'cat')):
    with open(pickle_path, 'wb') as f:
        pickle.dump(data[column], f)

In [333]:
## load the cleaned data back from the pickle files
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that this notebook wrote itself.
with open('data_cleaned.pkl', 'rb') as f:
    data_text = pickle.load(f)

with open('data_cat.pkl', 'rb') as f:
    data_cate = pickle.load(f)