In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import scipy.sparse as sp

In [6]:
df_train = pd.read_pickle('/mnt/sframe/df_train')
df_test = pd.read_pickle('/mnt/sframe/df_test')

In [None]:
from sklearn.cross_validation import train_test_split
train_train, train_cv = train_test_split(df_train, test_size=0.2, random_state=107)
train_train_unseen, train_train_seen = train_test_split(train_train, test_size=0.25, random_state=113)
TRAIN, CV = train_test_split(train_train_seen, test_size=0.5, random_state=117)

In [158]:
T = TRAIN
C = CV

In [159]:
TRAIN = train_train
CV = train_cv

In [160]:
TRAIN['text'].fillna('', inplace=True)
CV['text'].fillna('', inplace=True)
#TRAIN['text'].isnull().any()

In [161]:
CV[CV['sponsored'] == 1]['text'][0:3]

111161    great post veri refresh male blogger talk abou...
35141     were realli happi internet number sourc gif ti...
270699    youv ever seen pictur troll tongu havent just ...
Name: text, dtype: object

In [162]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', min_df=5)
TRAIN_counts = count_vect.fit_transform(TRAIN['text'])
CV_counts = count_vect.transform(CV['text'])
TRAIN_counts.shape, CV_counts.shape

((269619, 277131), (67405, 277131))

In [111]:
count_vect.vocabulary_.get('cart')

8691

In [163]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
TRAIN_tfidf = tfidf_transformer.fit_transform(TRAIN_counts)
CV_tfidf = tfidf_transformer.transform(CV_counts)

TRAIN_tfidf.shape, CV_tfidf.shape

((269619, 277131), (67405, 277131))

In [164]:
bad_cols = [col for col in list(TRAIN.columns) 
 if col.startswith('word2vec') or col in set(['text_words', 'text', 'id', 'sponsored'])]

In [165]:
DF_TRAIN = TRAIN.drop(bad_cols, 1).fillna(0)
DF_CV = CV.drop(bad_cols, 1).fillna(0)

In [166]:
DF_TRAIN.shape

(269619, 38)

In [167]:
DF_DF_TRAIN = sp.hstack( [TRAIN_tfidf,  sp.csr_matrix(np.array(DF_TRAIN)) ] )
DF_DF_CV = sp.hstack( [CV_tfidf,  sp.csr_matrix(np.array(DF_CV)) ] )

In [154]:
from sklearn.ensemble import GradientBoostingClassifier
clf2 = GradientBoostingClassifier(n_estimators=100, 
                             max_features='sqrt', 
                             max_depth=3, 
                             min_samples_split=20, 
                             random_state=0)
clf2.fit(TRAIN[[col for col in TRAIN.columns if col.startswith('word2vec')]], TRAIN['sponsored'])

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features='sqrt', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=0, subsample=1.0, verbose=0, warm_start=False)

In [155]:
predictions = clf2.predict_proba(CV[[col for col in CV.columns if col.startswith('word2vec')]])[:, 1]
roc_auc_score(CV['sponsored'], predictions)

0.75607556139665633

In [170]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, 
                             max_features='sqrt', 
                             criterion='entropy', 
                             max_depth=150, 
                             min_samples_split=20, 
                             class_weight='auto',
                             n_jobs=-1, random_state=0)
clf.fit(DF_DF_TRAIN, TRAIN['sponsored'])

RandomForestClassifier(bootstrap=True, class_weight='auto',
            criterion='entropy', max_depth=150, max_features='sqrt',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [171]:
print("22:29:00")
import datetime
print(datetime.datetime.now())

22:29:00
2015-10-05 06:28:49.539434


In [172]:
predictions = clf.predict_proba(DF_DF_CV)[:, 1]
roc_auc_score(CV['sponsored'], predictions)

0.9298068448796003

In [173]:
print("0.9231")
import datetime
print(datetime.datetime.now())

0.9231
2015-10-05 06:29:03.941503


In [None]:
import xgboost as xgb

dtrain = xgb.DMatrix(DF_DF_TRAIN, label = TRAIN['sponsored'])
dtest = xgb.DMatrix(DF_DF_CV, label = CV['sponsored'])

watchlist  = [(dtest,'eval'), (dtrain,'train')]

In [252]:
param = {'max_depth':5, 
         'eta': 1, 
         'gamma': 1.0,
         'colsample_bytree': 0.3,
         'subsample': 0.9,
         'silent':1, 
         'objective':'binary:logistic', 
         'eval_metric':'auc',
         'early_stopping_rounds':20}
num_round = 300

print(datetime.datetime.now())

2015-10-05 08:22:02.279158


In [253]:
bst = xgb.train(param, dtrain, num_round, watchlist)
print(datetime.datetime.now())

[0]	eval-auc:0.571888	train-auc:0.570578
[1]	eval-auc:0.669116	train-auc:0.671742
[2]	eval-auc:0.711978	train-auc:0.714102
[3]	eval-auc:0.735423	train-auc:0.742350
[4]	eval-auc:0.765897	train-auc:0.775788
[5]	eval-auc:0.775559	train-auc:0.786102
[6]	eval-auc:0.793869	train-auc:0.804149
[7]	eval-auc:0.799272	train-auc:0.809697
[8]	eval-auc:0.805505	train-auc:0.817812
[9]	eval-auc:0.803777	train-auc:0.816820
[10]	eval-auc:0.809923	train-auc:0.823138
[11]	eval-auc:0.812229	train-auc:0.825675
[12]	eval-auc:0.815797	train-auc:0.830258
[13]	eval-auc:0.818641	train-auc:0.834471
[14]	eval-auc:0.821311	train-auc:0.837129
[15]	eval-auc:0.822955	train-auc:0.838018
[16]	eval-auc:0.824562	train-auc:0.839895
[17]	eval-auc:0.824288	train-auc:0.840077
[18]	eval-auc:0.826723	train-auc:0.841895
[19]	eval-auc:0.828429	train-auc:0.844777
[20]	eval-auc:0.831499	train-auc:0.846869
[21]	eval-auc:0.834575	train-auc:0.850612
[22]	eval-auc:0.833662	train-auc:0.850549
[23]	eval-auc:0.832254	train-auc:0.849776
[2

KeyboardInterrupt: 

In [193]:
bst.predict?
#preds = bst.predict(dtest, ntree_limit=bst.best_iteration)


In [257]:
dtrain.

Object `dtrain.buffer` not found.


In [187]:
#xgb.plot_importance(bst)
#dtrain.save_binary('dtrain.buffer')
#dtest.save_binary('dtest.buffer')

In [133]:
RandomForestClassifier?

In [84]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
clf.fit(DF_TRAIN, TRAIN['sponsored'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [85]:
predictions = clf.predict_proba(DF_CV)[:, 1]

In [86]:
roc_auc_score(CV['sponsored'], predictions)

0.85235548381450854

In [90]:
count_vect = CountVectorizer()
TRAIN_counts = count_vect.fit_transform(TRAIN['text'])
CV_counts = count_vect.fit_transform(CV['text'])

tfidf_transformer = TfidfTransformer()
TRAIN_tfidf = tfidf_transformer.fit_transform(TRAIN_counts)
CV_tfidf = tfidf_transformer.fit_transform(CV_counts)

In [96]:
tfidf_transformer.transform?

In [97]:
TfidfTransformer?

In [98]:
CountVectorizer?

In [94]:
TRAIN_tfidf.shape

(33702, 545537)

In [91]:
DF_DF_TRAIN = pd.concat([DF_TRAIN, TRAIN_tfidf], axis=1, ignore_index=True)
DF_DF_CV = pd.concat([DF_CV, CV_tfidf], axis=1, ignore_index=True)

TypeError: cannot concatenate a non-NDFrame object

In [88]:
DF_TRAIN.shape, TRAIN_tfidf.shape, CV_tfidf.shape, DF_DF_TRAIN.shape, DF_DF_CV.shape

Index([u'num_words', u'a_href', u'par', u'title', u'img', u'btn', u'input',
       u'li', u'link', u'meta', u'script_avg', u'script_b_avg', u'script_cnt',
       u'script_b_cnt', u'style_avg', u'style_cnt', u'word_set_size',
       u'word_set_size_ratio', u'word_set_size2', u'word_set_size2_ratio',
       u'shinn.braces', u'shinn.brackets', u'shinn.cheap', u'shinn.col',
       u'shinn.commas', u'shinn.get', u'shinn.length', u'shinn.lines',
       u'shinn.parens', u'shinn.periods', u'shinn.sale', u'shinn.semi',
       u'shinn.shop', u'shinn.spaces', u'shinn.tabs', u'shinn.uppers',
       u'shinn.words', u'shinn.you'],
      dtype='object')

In [None]:
submission = test[['file']].reset_index(drop=True)
submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], 1))[:, 1]

In [42]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(TRAIN_tfidf, TRAIN['sponsored'])

In [None]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
predicted = text_clf.predict(docs_test)

In [None]:
X_new_counts = count_vect.transform(CV['text'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [65]:
predicted = clf.predict_proba(X_new_tfidf)[:, 1]
#np.mean(predicted == CV['sponsored'])       

In [66]:
from sklearn.metrics import roc_auc_score
roc_auc_score(CV['sponsored'], predicted)

0.51081647721840828

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='squared_loss', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
_ = text_clf.fit(TRAIN['text'], TRAIN['sponsored'])

In [58]:
predicted = text_clf.predict_proba? # (CV['text'])[:, 1]
np.mean(predicted == CV['sponsored'])          

SyntaxError: invalid syntax (<ipython-input-58-4b815ce93677>, line 1)

In [61]:
clf.predict_proba?

In [53]:
from sklearn.metrics import roc_auc_score
roc_auc_score(CV['sponsored'], predicted)

0.5

In [49]:
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'shop account support view from captain clark toll free sinc product sold europ union must']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, str(category)))

'God is love' => 0
'OpenGL on the GPU is fast' => 0
'shop account support view from captain clark toll free sinc product sold europ union must' => 0
