In [1]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
import pandas


def perform(classifiers, vectorizers, train_data, test_data):
    """Fit and score every classifier/vectorizer pairing, printing one line each.

    For each classifier (outer loop) and each vectorizer (inner loop) the
    vectorizer is fitted on ``train_data.v2``, the classifier is fitted on
    the resulting features against ``train_data.v1``, and the pair is then
    scored on ``test_data.v2`` / ``test_data.v1``.

    Args:
        classifiers: iterable of objects exposing ``fit(X, y)`` and
            ``score(X, y)``.
        vectorizers: iterable of objects exposing ``fit_transform(docs)``
            and ``transform(docs)``.
        train_data: object with ``v1`` (labels) and ``v2`` (texts) attributes
            used for fitting.
        test_data: object with ``v1`` and ``v2`` attributes used for scoring.

    Returns:
        None. Results are reported on stdout only.
    """
    for model in classifiers:
        model_name = model.__class__.__name__
        for extractor in vectorizers:
            extractor_name = extractor.__class__.__name__

            # train: the vectorizer is (re)fitted on the training texts only
            train_features = extractor.fit_transform(train_data.v2)
            model.fit(train_features, train_data.v1)

            # score: test texts go through the already-fitted vectorizer
            test_features = extractor.transform(test_data.v2)
            score_value = model.score(test_features, test_data.v1)

            print('{} with {}. Has score: {}'.format(
                model_name, extractor_name, str(score_value)))

# Load the labelled messages and cut them into a fixed, positional
# train/test split (no shuffling).
data = pandas.read_csv('spam.csv', encoding='latin-1')
learn, test = data.iloc[:4400], data.iloc[4400:]  # first 4400 rows train; the rest (1172 per the original note) test

# One seed shared by every estimator that exposes `random_state`, so the
# printed scores are repeatable across kernel restarts instead of drifting
# from run to run.
SEED = 42

# Compare every classifier below against every vectorizer on the same split.
# Estimators without a `random_state` argument here (BernoulliNB,
# CalibratedClassifierCV, RidgeClassifierCV, KNeighborsClassifier) are left
# exactly as they were.
perform(
    [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=SEED),
        AdaBoostClassifier(random_state=SEED),
        BaggingClassifier(random_state=SEED),
        ExtraTreesClassifier(random_state=SEED),
        GradientBoostingClassifier(random_state=SEED),
        DecisionTreeClassifier(random_state=SEED),
        CalibratedClassifierCV(),
        DummyClassifier(random_state=SEED),
        PassiveAggressiveClassifier(random_state=SEED),
        RidgeClassifier(random_state=SEED),
        RidgeClassifierCV(),
        SGDClassifier(random_state=SEED),
        OneVsRestClassifier(SVC(kernel='linear', random_state=SEED)),
        OneVsRestClassifier(LogisticRegression(random_state=SEED)),
        KNeighborsClassifier()
    ],
    [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ],
    learn,
    test
)

BernoulliNB with CountVectorizer. Has score: 0.9778156996587031
BernoulliNB with TfidfVectorizer. Has score: 0.9778156996587031
BernoulliNB with HashingVectorizer. Has score: 0.8728668941979523
RandomForestClassifier with CountVectorizer. Has score: 0.9761092150170648
RandomForestClassifier with TfidfVectorizer. Has score: 0.9761092150170648
RandomForestClassifier with HashingVectorizer. Has score: 0.9709897610921502
AdaBoostClassifier with CountVectorizer. Has score: 0.947098976109215
AdaBoostClassifier with TfidfVectorizer. Has score: 0.9539249146757679
AdaBoostClassifier with HashingVectorizer. Has score: 0.9505119453924915
BaggingClassifier with CountVectorizer. Has score: 0.9641638225255973
BaggingClassifier with TfidfVectorizer. Has score: 0.9667235494880546
BaggingClassifier with HashingVectorizer. Has score: 0.9658703071672355
ExtraTreesClassifier with CountVectorizer. Has score: 0.9778156996587031
ExtraTreesClassifier with TfidfVectorizer. Has score: 0.9778156996587031
ExtraTr

In [1]:
# We now know that OneVsRestClassifier works best on our dataset.
# The vectorizer is TfidfVectorizer.
# Now we try to get each prediction in a more detailed manner.
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
import pandas
import csv

# Model/vectorizer pair named in the note at the top of this cell.
vectorizer = TfidfVectorizer()
classifier = OneVsRestClassifier(SVC(kernel='linear'))

# Reload the corpus and rebuild the positional train/test split.
data = pandas.read_csv('spam.csv', encoding='latin-1')
train_data, test_data = data.iloc[:4400], data.iloc[4400:]  # 4400 training rows; the rest (1172 per the original note) held out

# train: fit the vectorizer on the training texts, then the classifier on
# the resulting features against the training labels
vectorize_text = vectorizer.fit_transform(train_data['v2'])
classifier.fit(vectorize_text, train_data['v1'])

# score
# vectorize_text = vectorizer.transform(test_data.v2)
# score = classifier.score(vectorize_text, test_data.v1)
# print(score) # 98,8


# Build one report row per held-out message:
#   [running number, text, true label, predicted label, 'right' | 'wrong'].
#
# The whole test set is vectorized and predicted in a single call rather than
# once per row inside an iterrows() loop, and the label/text columns are read
# by name (v1 / v2) to match how the training code reads them, instead of by
# position.
test_texts = list(test_data.v2)
test_answers = list(test_data.v1)

if test_texts:
    test_predictions = classifier.predict(vectorizer.transform(test_texts))
else:
    # An empty hold-out set yields an empty report, as the row loop did.
    test_predictions = []

csv_arr = [
    [row_number, text, answer, predict, 'right' if predict == answer else 'wrong']
    for row_number, (text, answer, predict)
    in enumerate(zip(test_texts, test_answers, test_predictions))
]


# Dump the per-message report to disk: semicolon-separated, with fields
# quoted only when they need it. newline='' lets the csv module control
# line endings itself.
with open('test_score.csv', 'w', newline='', encoding='utf-8') as report_file:
    report_writer = csv.writer(
        report_file,
        delimiter=';',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    # header first, then every collected row in order
    report_writer.writerow(['#', 'text', 'answer', 'predict', 'result'])
    report_writer.writerows(csv_arr)