# Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


loading the dataset into a pandas dataframe

In [3]:
df = pd.read_csv('data/spam/spam.csv', encoding="latin-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


remove column 2, 3 and 4 as they have no useful information

In [4]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df['body'] = df['v2']
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
df.drop(['v1', 'v2'], axis=1, inplace=True)
train_data = df[:4400]
test_data = df[4400:]
df.head()

Unnamed: 0,body,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
def perform(classifiers, vectorizers, train_data, test_data):
    max_score = 0
    max_name = 0
    for classifier in classifiers:
        for vectorizer in vectorizers:
        
            # train
            vectorize_text = vectorizer.fit_transform(train_data.body)
            classifier.fit(vectorize_text, train_data.label)

            # score
            vectorize_text = vectorizer.transform(test_data.body)
            score = classifier.score(vectorize_text, test_data.label)
            name = classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__ 
            print(name, score)
        if score > max_score:
            max_score = score
            max_name = name
    print ('===========================================')
    print ('===========================================')
    print (max_name, max_score)
    print ('===========================================')
    print ('===========================================')

list of various classifiers we are going to use

In [6]:
classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]

list of various vectorizers we are going to use

In [7]:
vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]

perform classification and save results to a new dataframe

In [8]:
perform(
    classifiers,
    vectorizers,
    train_data,
    test_data
)

BernoulliNB with CountVectorizer 0.9778156996587031
BernoulliNB with TfidfVectorizer 0.9778156996587031
BernoulliNB with HashingVectorizer 0.8728668941979523


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


RandomForestClassifier with CountVectorizer 0.9735494880546075


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


RandomForestClassifier with TfidfVectorizer 0.9795221843003413


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


RandomForestClassifier with HashingVectorizer 0.9675767918088737
AdaBoostClassifier with CountVectorizer 0.9718430034129693
AdaBoostClassifier with TfidfVectorizer 0.9692832764505119
AdaBoostClassifier with HashingVectorizer 0.9735494880546075
BaggingClassifier with CountVectorizer 0.9667235494880546
BaggingClassifier with TfidfVectorizer 0.9573378839590444
BaggingClassifier with HashingVectorizer 0.9701365187713311


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


ExtraTreesClassifier with CountVectorizer 0.9795221843003413


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


ExtraTreesClassifier with TfidfVectorizer 0.9752559726962458


  if _joblib.__version__ >= LooseVersion('0.12'):


ExtraTreesClassifier with HashingVectorizer 0.9641638225255973
GradientBoostingClassifier with CountVectorizer 0.9709897610921502
GradientBoostingClassifier with TfidfVectorizer 0.9658703071672355
GradientBoostingClassifier with HashingVectorizer 0.9718430034129693
DecisionTreeClassifier with CountVectorizer 0.9667235494880546
DecisionTreeClassifier with TfidfVectorizer 0.962457337883959
DecisionTreeClassifier with HashingVectorizer 0.9598976109215017




CalibratedClassifierCV with CountVectorizer 0.9872013651877133




CalibratedClassifierCV with TfidfVectorizer 0.985494880546075




CalibratedClassifierCV with HashingVectorizer 0.9820819112627986
DummyClassifier with CountVectorizer 0.7815699658703071
DummyClassifier with TfidfVectorizer 0.7790102389078498
DummyClassifier with HashingVectorizer 0.78839590443686




PassiveAggressiveClassifier with CountVectorizer 0.9837883959044369
PassiveAggressiveClassifier with TfidfVectorizer 0.9863481228668942
PassiveAggressiveClassifier with HashingVectorizer 0.9829351535836177
RidgeClassifier with CountVectorizer 0.9820819112627986
RidgeClassifier with TfidfVectorizer 0.9829351535836177
RidgeClassifier with HashingVectorizer 0.9820819112627986
RidgeClassifierCV with CountVectorizer 0.9829351535836177
RidgeClassifierCV with TfidfVectorizer 0.9837883959044369
RidgeClassifierCV with HashingVectorizer 0.9803754266211604




SGDClassifier with CountVectorizer 0.985494880546075
SGDClassifier with TfidfVectorizer 0.9880546075085325
SGDClassifier with HashingVectorizer 0.9803754266211604
OneVsRestClassifier with CountVectorizer 0.9863481228668942
OneVsRestClassifier with TfidfVectorizer 0.9880546075085325
OneVsRestClassifier with HashingVectorizer 0.9829351535836177




OneVsRestClassifier with CountVectorizer 0.9837883959044369




OneVsRestClassifier with TfidfVectorizer 0.9752559726962458
OneVsRestClassifier with HashingVectorizer 0.9692832764505119
KNeighborsClassifier with CountVectorizer 0.924061433447099
KNeighborsClassifier with TfidfVectorizer 0.962457337883959
KNeighborsClassifier with HashingVectorizer 0.9607508532423208
PassiveAggressiveClassifier with HashingVectorizer 0.9829351535836177


In [9]:
# train the classifier with best accuracy
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
Vectorizer = TfidfVectorizer()
vectorize_text = Vectorizer.fit_transform(train_data.body)
Classifier.fit(vectorize_text, train_data.label)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          n_jobs=None)

In [10]:
Body = ' won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C'
vectorize_message = Vectorizer.transform([Body])
predict = Classifier.predict(vectorize_message)[0]

In [11]:
if predict == 0:
    print ('ham')
else:
    print ('spam')

spam
