In [14]:
from base64 import b64decode
from codecs import open as codopen
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

In [15]:
TRAIN_FILE = "data/kaggle_train_data_tab.csv"
TEST_FILE = "data/kaggle_test_data_tab.csv"
SOLUTION_FILE = "data/prediction.csv"

In [16]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'html'])

In [17]:
def load_docs(input_file):
    docs = []
    with codopen(input_file, mode="r", encoding="utf-8") as documents:
        documents.readline()
        for document in tqdm(documents):
            doc_id_str, spam_str, url, text64 = document.strip().split("\t")
            docs.append(DocItem(int(doc_id_str), int(spam_str), b64decode(text64)))
    return docs


def write_predictions(docs, predictions):
    with codopen(SOLUTION_FILE, mode="w", encoding="utf-8") as fout:
        writer = csv.writer(fout)
        writer.writerow(['Id','Prediction'])
        for doc, prediction in zip(docs, predictions):
            writer.writerow([doc.doc_id, prediction])

In [28]:
class DocVectorizer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(decode_error='ignore')
    
    
    @staticmethod
    def __get_htmls(docs):
        return list(map(lambda doc: doc.html, docs))
    
    
    def fit_transform(self, docs):
        return self.vectorizer.fit_transform(self.__get_htmls(docs))
    
    
    def transform(self, docs):
        return self.vectorizer.transform(self.__get_htmls(docs))

In [19]:
train_docs = load_docs(TRAIN_FILE)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [20]:
test_docs = load_docs(TEST_FILE)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [29]:
vectorizer = DocVectorizer()

In [33]:
X_train = vectorizer.fit_transform(train_docs)
X_train.shape

(7044, 1012986)

In [34]:
y_train = list(map(lambda doc: doc.is_spam, train_docs))

In [35]:
classifier = SGDClassifier()
classifier.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [36]:
X_test = vectorizer.transform(test_docs)
X_test.shape

(16039, 1012986)

In [37]:
predictions = classifier.predict(X_test)

In [38]:
write_predictions(test_docs, predictions)