In [2]:
from __future__ import division
from importlib import reload

import base64
import csv
import gzip
import zlib

from collections import namedtuple
import nltk 
from nltk.corpus import stopwords
nltk.download("stopwords")
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/glebdrozdov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
TRACE_NUM = 1000
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

def trace(items_num, trace_num=TRACE_NUM):
    if items_num % trace_num == 0: logging.info("Complete items %05d" % items_num)

In [4]:
from bs4 import BeautifulSoup

def bs_parsertext(text):
    return BeautifulSoup(text, "html.parser").get_text()

In [5]:
html2text = bs_parsertext

In [6]:
def easy_tokenizer(text):
    word = str()
    for symbol in text:
        if symbol.isalnum(): word += symbol
        elif word:
            yield word
            word = str()
    if word : yield word

PYMORPHY_CACHE = {}
MORPH = None
def get_lemmatizer():
    import pymorphy2
    global MORPH
    if MORPH is None: MORPH = pymorphy2.MorphAnalyzer()
    return MORPH

def pymorphy_tokenizer(text):
    global PYMORPHY_CACHE
    for word in easy_tokenizer(text):
        word_hash = hash(word)
        if word_hash not in PYMORPHY_CACHE:
            PYMORPHY_CACHE[word_hash] = get_lemmatizer().parse(word)[0].normal_form            
        yield PYMORPHY_CACHE[word_hash]

In [7]:
def html2word(raw_html, to_text=html2text, tokenizer=pymorphy_tokenizer):
    return tokenizer(raw_html.lower())

In [8]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'page_text'])

def load_csv(input_file_name):    
    with gzip.open(input_file_name, 'rt', encoding='utf-8') if input_file_name.endswith('gz') else open(input_file_name)  as input_file:            
        headers = input_file.readline()
        
        for i, line in enumerate(input_file):
            trace(i)
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            mark = bool(int(parts[1]))   
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64).decode('utf-8', errors="ignore")
            page_text = list(html2word(html_data))
            page_text = " ".join(str(x) for x in page_text)
            yield DocItem(url_id, mark, url, page_text)
        trace(i, 1)

In [9]:
%%time

TRAIN_DATA_FILE  = 'kaggle_train_data_tab_new.csv.gz'

train_docs = list(load_csv(TRAIN_DATA_FILE))

02:29:16 INFO:Complete items 00000
02:29:16 INFO:Loading dictionaries from /anaconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
02:29:16 INFO:format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
02:30:00 INFO:Complete items 01000
02:30:23 INFO:Complete items 02000
02:30:49 INFO:Complete items 03000
02:31:12 INFO:Complete items 04000
02:31:34 INFO:Complete items 05000
02:31:54 INFO:Complete items 06000
02:32:19 INFO:Complete items 07000
02:32:20 INFO:Complete items 07043


CPU times: user 2min 51s, sys: 2.46 s, total: 2min 53s
Wall time: 3min 3s


In [65]:
TEST_DATA_FILE  = 'kaggle_test_data_tab_new.csv.gz'

test_docs = list(load_csv(TEST_DATA_FILE))

17:44:55 INFO:Complete items 00000
17:45:21 INFO:Complete items 01000
17:45:40 INFO:Complete items 02000
17:45:59 INFO:Complete items 03000
17:46:15 INFO:Complete items 04000
17:46:31 INFO:Complete items 05000
17:46:47 INFO:Complete items 06000
17:47:01 INFO:Complete items 07000
17:47:14 INFO:Complete items 08000
17:47:26 INFO:Complete items 09000
17:47:44 INFO:Complete items 10000
17:47:56 INFO:Complete items 11000
17:48:08 INFO:Complete items 12000
17:48:19 INFO:Complete items 13000
17:48:32 INFO:Complete items 14000
17:48:45 INFO:Complete items 15000
17:48:59 INFO:Complete items 16000
17:48:59 INFO:Complete items 16038


In [66]:
df_train = pd.DataFrame(train_docs)
X_train = df_train.loc[:, df_train.columns != 'is_spam']
y_train = df_train.is_spam
df_test = pd.DataFrame(test_docs)
X_test = df_test.loc[:, df_test.columns != 'is_spam']
y_test = df_test.is_spam

In [67]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [68]:
def write_to_file(docs, predicted_labels):
    with open('my_submission.csv' , 'w') as fout:
        fout.write("Id,Prediction\n")
        for doc_id, pred in zip(docs['doc_id'].values, predicted_labels):
            res = 1 if pred else 0
            fout.write("%d,%d\n" % (doc_id, res))

In [69]:
def count_f_metric(true_labels, predicted_labels):
    from sklearn.metrics import f1_score
    return f1_score(true_labels, predicted_labels, average='micro')

In [14]:
class BayesClassifier:
    def __init__(self):
        self.vectorizer = CountVectorizer(stop_words = stopwords.words('russian') + stopwords.words('english'),
                                         ngram_range = (1, 2))
        self.clf = MultinomialNB()

    def predict(self, data):
        test_trans = self.vectorizer.transform(data)
        return self.clf.predict(test_trans)
    
    def train(self, train_X, train_y):
        train_trans = self.vectorizer.fit_transform(train_X)
        #train_trans = self.vectorizer.transform(train_X)
        self.clf.fit(train_trans, train_y)  

In [88]:
class XgBoostClassifier:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(ngram_range = (1, 2), stop_words = stopwords.words('russian') + stopwords.words('english'), 
                                                            min_df = 0.02)
        self.bst = None

    def predict(self, data):
        test_trans = self.vectorizer.transform(data)
        dtest = xgb.DMatrix(test_trans)
        return self.bst.predict(dtest)
    
    def train(self, train_X, train_y):
        params = {
            'max_depth': 4, 
            'eta': 0.2, 
            'silent': 1,
            'objective': 'multi:softmax', 
            'num_class': 2
        }
        rounds = 500
        train_trans = self.vectorizer.fit_transform(train_X)
        dtrain = xgb.DMatrix(train_trans, label=train_y)
        self.bst = xgb.train(params, dtrain, rounds) 

In [106]:
class SVMClassifier:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(ngram_range = (1, 3),
                                          stop_words = stopwords.words('russian') + stopwords.words('english'))
        self.clf = SGDClassifier(verbose=True)
    
    def train(self, train_X, train_y):
        train_trans = self.vectorizer.fit_transform(train_X)
        self.clf.fit(train_trans, train_y)
    
    def train_all(self, train_all, train_X, train_y):
        self.vectorizer.fit(train_all)
        train_trans = self.vectorizer.transform(train_X)
        self.clf.fit(train_trans, train_y)
    
    def predict(self, data):
        test_trans = self.vectorizer.transform(data)
        return self.clf.predict(test_trans)

In [99]:
def make_predictions(classifier, train_X, train_y, test_X):
    classifier.train(train_X, train_y)
    preds = classifier.predict(test_X)
    print(count_f_metric(y_test, preds))
    write_to_file(X_test, preds)

In [97]:
make_predictions(SVMClassifier(), X_train['page_text'], y_train.values,  X_test['page_text'])

-- Epoch 1
Norm: 36.44, NNZs: 16602, Bias: 0.103635, T: 7044, Avg. loss: 0.130494
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 31.56, NNZs: 16602, Bias: 0.099918, T: 14088, Avg. loss: 0.054527
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 30.15, NNZs: 16605, Bias: 0.077696, T: 21132, Avg. loss: 0.045364
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 29.25, NNZs: 16605, Bias: 0.044421, T: 28176, Avg. loss: 0.040523
Total training time: 0.06 seconds.
-- Epoch 5
Norm: 28.86, NNZs: 16605, Bias: 0.044470, T: 35220, Avg. loss: 0.038382
Total training time: 0.07 seconds.
0.5046449279880292


In [100]:
make_predictions(SVMClassifier(), X_train['page_text'], y_train.values,  X_test['page_text'])

-- Epoch 1
Norm: 53.67, NNZs: 5332726, Bias: -0.143150, T: 7044, Avg. loss: 0.113190
Total training time: 0.42 seconds.
-- Epoch 2
Norm: 42.97, NNZs: 7621329, Bias: -0.046632, T: 14088, Avg. loss: 0.031933
Total training time: 0.92 seconds.
-- Epoch 3
Norm: 39.81, NNZs: 8647101, Bias: -0.011756, T: 21132, Avg. loss: 0.023905
Total training time: 1.42 seconds.
-- Epoch 4
Norm: 38.08, NNZs: 9172913, Bias: -0.011109, T: 28176, Avg. loss: 0.019014
Total training time: 1.91 seconds.
-- Epoch 5
Norm: 37.38, NNZs: 9452634, Bias: 0.003133, T: 35220, Avg. loss: 0.017070
Total training time: 2.41 seconds.
0.5030862273209052


In [91]:
make_predictions(XgBoostClassifier(), X_train['page_text'], y_train,  X_test['page_text'])

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


0.5135606958039778
