In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import re
import graphlab as gl
from graphlab.toolkits.feature_engineering import TFIDF, FeatureHasher, QuadraticFeatures

gl.canvas.set_target('ipynb')

PATH_TO_JSON2 = "/mnt/sframe/docs_prod_02/"
PATH_TO_JSON = "/mnt/sframe/docs_prod_05/"
PATH_TO_JSON6 = "/mnt/sframe/docs_prod_06/"
PATH_TO_TRAIN_LABELS = "input/train.csv"
PATH_TO_TEST_LABELS = "input/sampleSubmission.csv"

### Read processed documents

In [2]:
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 128)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 100*1024*1024*1024) # 100GB
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 100*1024*1024*1024) # 100GB

[INFO] This trial license of GraphLab Create is assigned to renatbek@gmail.com and will expire on October 08, 2015. Please contact trial@dato.com for licensing options or to request a free non-commercial license for personal or academic use.

[INFO] Start server at: ipc:///tmp/graphlab_server-11685 - Server binary: /usr/local/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443941241.log
[INFO] GraphLab Server Version: 1.6.1


In [3]:
def transf(x):
    """Dampen ``x`` by applying ``log1p`` twice and scaling the result by 50.

    Used below on lengths and count columns so that very large values are
    squeezed into a narrow range. ``transf(0)`` is ``0.0``.
    """
    once = np.log1p(x)
    twice = np.log1p(once)
    return 50.0 * twice

In [None]:
def transf(x):
    """Identity variant of ``transf``: returns ``x`` unchanged.

    NOTE(review): this re-defines the log-damping ``transf`` from the cell
    above. On a top-to-bottom run this later definition is the one in effect,
    so the count features are left untransformed. Keep only the definition
    you actually want.
    """
    return x

In [None]:
gl.get_runtime_config()

In [4]:
# documents
# Load the processed documents and expand the single 'X1' column into one
# column per field (no prefix is added to the unpacked column names).
sf = gl.SFrame.read_csv(PATH_TO_JSON, header=False, verbose=False)
sf = sf.unpack('X1',column_name_prefix='')
# Keep only the part of the id before the first underscore, as a string, so it
# matches the ids derived from the label files.
sf['id'] = sf['id'].apply(lambda x: str(x.split('_')[0] ))
# NOTE(review): len(xs) is the length of the 'text' cell -- characters if the
# cell is a string, items if it is a list. Confirm 'num_words' is the right name.
sf['num_words'] = sf['text'].apply(lambda xs: transf(len(xs)))

In [5]:
sf2 = gl.SFrame.read_csv(PATH_TO_JSON2, header=False, verbose=False)
sf2 = sf2.unpack('X1',column_name_prefix='')
sf2['id'] = sf2['id'].apply(lambda x: str(x.split('_')[0] ))

In [6]:
sf6 = gl.SFrame.read_csv(PATH_TO_JSON6, header=False, verbose=False)
sf6 = sf6.unpack('X1',column_name_prefix='')
sf6['id'] = sf6['id'].apply(lambda x: str(x.split('_')[0] ))
sf6['word2vec'] = sf6['word2vec'].apply(lambda xs: np.array(xs ,dtype='float32').tolist())

In [None]:
sf6.head()

In [None]:
sf.column_names

In [7]:
# Per-document count features taken from sf2, each passed through transf().
sf_cnt = gl.SFrame()
sf_cnt['id'] = sf2['id']

# (output column, source column): the feature is the LENGTH of the source cell.
length_features = [
    ('a_href', 'ahref'),
    ('par', 'par'),
    ('title', 'title'),
]
for out_name, src_name in length_features:
    sf_cnt[out_name] = sf2[src_name].apply(lambda x: transf(len(x)))

# (output column, source column): the source cell is used as the value itself.
# Order matters -- it fixes the column order of sf_cnt.
value_features = [
    ('img', 'img_cnt'),
    ('btn', 'misc_button'),
    ('input', 'misc_input'),
    ('li', 'misc_li'),
    ('link', 'misc_link'),
    ('meta', 'misc_meta'),
    ('script_avg', 'script_avg'),
    ('script_b_avg', 'script_b_avg'),
    ('script_cnt', 'script_cnt'),
    ('script_b_cnt', 'script_b_cnt'),
    ('style_avg', 'style_avg'),
    ('style_cnt', 'style_cnt'),
]
for out_name, src_name in value_features:
    sf_cnt[out_name] = sf2[src_name].apply(transf)

### Read train/test labels and merge into documents

In [8]:
# train/test labels
train_labels = gl.SFrame.read_csv(PATH_TO_TRAIN_LABELS, verbose=False)
test_labels = gl.SFrame.read_csv(PATH_TO_TEST_LABELS, verbose=False)
train_labels['id'] = train_labels['file'].apply(lambda x: str(x.split('_')[0] ))
train_labels = train_labels.remove_column('file')
test_labels['id'] = test_labels['file'].apply(lambda x: str(x.split('_')[0] ))
test_labels = test_labels.remove_column('file')

In [9]:
# join
train = train_labels.join(sf, on='id', how='left')
test = test_labels.join(sf, on='id', how='left')

In [None]:
train = train.join(sf_cnt, on='id', how='left')
test = test.join(sf_cnt, on='id', how='left')

In [None]:
train = train.join(sf6, on='id', how='left')
test = test.join(sf6, on='id', how='left')

In [None]:
train.column_names

In [None]:
features = [
            'a_href',
            'par',
            'title',
            'img',
            'btn',
            'input',
            'li',
            'link',
            'meta',
            'script_avg',
            'script_b_avg',
            'script_cnt',
            'script_b_cnt',
            'style_avg',
            'style_cnt',
            'num_words'
           ]

In [None]:
# fill in empty
for f in features:
    train = train.fillna(f, 0.0)     
    test = test.fillna(f, 0.0)

In [None]:
train = train.fillna('shinn', {})     
test = test.fillna('shinn', {})

train['shinn'] = train['shinn'].apply(lambda ws: ws if ws else {})
test['shinn'] = test['shinn'].apply(lambda ws: ws if ws else {})

features = features + ['shinn']

In [None]:
# Give documents without a word2vec vector an all-zero vector of length 300.
# NOTE(review): sf6['word2vec'] is stored via .tolist() earlier, while the fill
# value here is a numpy array -- confirm the column accepts the mix, or use
# a plain list of zeros.
train = train.fillna('word2vec', np.zeros(300))     
test = test.fillna('word2vec', np.zeros(300))

# Second pass: also replace falsy (e.g. empty) vectors that fillna leaves alone.
train['word2vec'] = train['word2vec'].apply(lambda ws: ws if ws else np.zeros(300))
test['word2vec'] = test['word2vec'].apply(lambda ws: ws if ws else np.zeros(300))

In [None]:
train = train.fillna('words', [])     
test = test.fillna('words', [])   

train['words'] = train['words'].apply(lambda ws: ws if ws else [])
test['words'] = test['words'].apply(lambda ws: ws if ws else [])

In [None]:
train['word_set_size'] = train['words'].apply(lambda ws: len(set(ws)))
test['word_set_size'] = test['words'].apply(lambda ws: len(set(ws)))

train['word_set_size_ratio'] = train.apply(lambda r: r['word_set_size'] * 1.0 / len(r['words']) if len(r['words']) > 0 else 0.0)
test['word_set_size_ratio'] = test.apply(lambda r: r['word_set_size'] * 1.0 / len(r['words']) if len(r['words']) > 0 else 0.0)

In [None]:
train['text_words'] = train['words'].apply(lambda ws: ' '.join(ws))
test['text_words'] = test['words'].apply(lambda ws: ' '.join(ws))

In [None]:
features = features + ['word_set_size', 'word_set_size_ratio']

### Generate BOW

In [None]:
bow_trn = gl.text_analytics.count_words(train['text_words'])
bow_trn = bow_trn.dict_trim_by_keys(gl.text_analytics.stopwords())

bow_tst = gl.text_analytics.count_words(test['text_words'])
bow_tst = bow_tst.dict_trim_by_keys(gl.text_analytics.stopwords())

train['bow_words'] = bow_trn
test['bow_words'] = bow_tst

In [None]:
bow_trn = gl.text_analytics.count_words(train['text'])
bow_trn = bow_trn.dict_trim_by_keys(gl.text_analytics.stopwords())

bow_tst = gl.text_analytics.count_words(test['text'])
bow_tst = bow_tst.dict_trim_by_keys(gl.text_analytics.stopwords())

train['bow'] = bow_trn
test['bow'] = bow_tst

In [None]:
train.head()

### Generate TF-IDF

In [None]:
encoder = gl.feature_engineering.create(train, TFIDF('bow_words', output_column_name='tfidf_words', min_document_frequency=5e-5))
train = encoder.transform(train)
test = encoder.transform(test)

In [None]:
# Fit TF-IDF on the training 'bow' column and apply the same encoder to both
# frames; the result lands in a new 'tfidf' column.
encoder = gl.feature_engineering.create(train, TFIDF('bow', output_column_name='tfidf', min_document_frequency=5e-5))
train = encoder.transform(train)
test = encoder.transform(test)

# Replace missing or empty TF-IDF cells with an empty dict. This is done on
# train/test: train_train and train_cv are only created by the random_split
# further down, so referencing them here raised NameError on a fresh kernel.
# The later split inherits the cleaned column.
train['tfidf'] = train['tfidf'].fillna({})
test['tfidf'] = test['tfidf'].fillna({})

train['tfidf'] = train['tfidf'].apply(lambda x: x if x else {})
test['tfidf'] = test['tfidf'].apply(lambda x: x if x else {})

In [None]:
train.head()

### Hash TF-IDF

In [None]:
hash_encoder = gl.feature_engineering.create(train, FeatureHasher(features = ['tfidf'], num_bits=18, 
                                                                  output_column_name='tfidf_hashed_18'))
train['tfidf_hashed_18'] = hash_encoder.transform(train)['tfidf_hashed_18']
test['tfidf_hashed_18'] = hash_encoder.transform(test)['tfidf_hashed_18']

### Split training set for cross-validation

In [None]:
train.head()

In [None]:
train_train, train_cv = train.random_split(0.80, seed=107)

In [None]:
TRAIN, CV = train_cv.random_split(0.50, seed=113)

# Generate Submission Model

Started at 20:36:00

In [None]:
features

In [None]:
gl.classifier.random_forest_classifier.create?

In [None]:
# NOTE(review): `model` is first assigned by a training cell further down in
# this notebook, so on a fresh top-to-bottom run this line raises NameError.
# It only works against a kernel that still holds an earlier `model`.
model200=model

In [None]:
train.head()

In [None]:
feature_imp = model200.get_feature_importance()

In [None]:
gl.canvas.set_target('ipynb')

In [None]:
fi = list(gl.load_sframe('feature_importance.csv')['feature'])

In [None]:
def decode_dict(a):
    """Split a dict-feature name such as ``shinn["hello"]`` into its parts.

    Args:
        a: Feature name of the form ``<column>["<key>"]``.

    Returns:
        A ``(column, key)`` tuple, e.g. ``('shinn', 'hello')``.

    Raises:
        ValueError: If ``a`` has no ``[`` or no double-quoted key.
    """
    # Split on the FIRST '[' only, so a key that itself contains '[' no longer
    # breaks the two-way unpacking.
    dic, bracket, quoted = a.partition('[')
    start = quoted.find('"')
    end = quoted.rfind('"')
    if not bracket or start == -1 or end == start:
        raise ValueError('not a dict feature name: %r' % (a,))
    # Take everything between the outermost quotes so keys containing '"'
    # come through intact.
    return dic, quoted[start + 1:end]

def get_or_else(dic, word, no=0.0):
    """Look up ``word`` in ``dic``, falling back to ``no``.

    Args:
        dic: Mapping to search; ``None`` (or an empty mapping) yields ``no``.
        word: Key to look up.
        no: Value returned when the key is absent or its value is falsy
            (``None``, ``0``, empty). Defaults to ``0.0``.

    Returns:
        ``dic[word]`` when it is present and truthy, otherwise ``no``.
    """
    # ``dict.has_key`` exists only on Python 2; ``.get`` works everywhere.
    # Guarding on ``dic`` also covers a missing (None) dict cell.
    value = dic.get(word) if dic else None
    return value if value else no
    
def value_it(a, data, out):
    """Copy feature ``a`` from ``data`` into ``out`` as a flat column.

    A plain name is copied as-is. A dict-feature name (one containing ``[``,
    e.g. ``shinn["hello"]``) is decoded into column and key, and the per-row
    value for that key is written to ``out['<column>.<key>']``; rows without
    a truthy value for the key get ``get_or_else``'s default.
    """
    if '[' not in a:
        out[a] = data[a]
        return

    column, key = decode_dict(a)

    def pick(cell):
        return get_or_else(cell, key)

    flat_name = column + '.' + key
    out[flat_name] = data[column].apply(pick)

In [None]:
CV_l = gl.SFrame()
for a in fi[0:150]:
    value_it(a, CV, CV_l)
    
TRAIN_l = gl.SFrame()
for a in fi[0:150]:
    value_it(a, TRAIN, TRAIN_l)    
    
CV_l['sponsored'] = CV['sponsored']
TRAIN_l['sponsored'] = TRAIN['sponsored']
new_feats = set(CV_l.column_names()).difference(set(['sponsored']))

In [None]:
'sponsored' in new_feats

In [None]:
new_feats

In [None]:
a = {}
get_or_else(a, 's')

In [None]:
TRAIN_l = TRAIN.unpack('shinn')
CV_l = CV.unpack('shinn')

In [None]:
# NOTE(review): no cell in this notebook creates a 'tfidf5e5' column (the TFIDF
# encoders above write 'tfidf_words' and 'tfidf'). Presumably it comes from a
# previously saved SFrame -- confirm, or switch to 'tfidf'.
CV_l = CV_l.unpack('tfidf5e5')

In [None]:
'sponsored' in set(CV_l.column_names())

In [None]:
CV_l['tfidf5e5.copyright'].show()

In [None]:
3 + 4

In [None]:
CV[CV['sponsored'] == 0]['a_href'].sum()  / CV[CV['sponsored'] == 0].shape[0]

In [None]:
model = gl.classifier.boosted_trees_classifier.create(train, target='sponsored',
                                                      #features=features + ['tfidf_hashed_18'],
                                                      features=features + ['tfidf5e5'],
                                                      max_depth=6,
                                                      step_size=0.2,
                                                      max_iterations=300,
                                                      column_subsample=0.3,
                                                      row_subsample=1.0,
                                                      class_weights='auto')

In [None]:
model = gl.classifier.random_forest_classifier.create(train, target='sponsored',
                                                      features=features + ['tfidf', 'word2vec'],
                                                      num_trees=200,
                                                      max_depth=150,
                                                      validation_set=None,
                                                      column_subsample=0.45,
                                                      row_subsample=1.0,
                                                      class_weights='auto')

In [None]:
model = gl.classifier.boosted_trees_classifier.create(train, target='sponsored',
                                                      #features=features + ['tfidf_hashed_18'],
                                                      features=features + ['tfidf'],
                                                      max_depth=6,
                                                      step_size=0.2,
                                                      max_iterations=300,
                                                      column_subsample=0.3,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=None)

In [None]:
lr_model = gl.logistic_classifier.create(TRAIN_l, target='sponsored', 
                                      features=new_feats,
                                      validation_set=CV_l,
                                      class_weights='auto',
                                      max_iterations=10,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
svm_model = gl.svm_classifier.create(train, target='sponsored', 
                                      features=['tfidf_hashed'],
                                      validation_set=None,                                           
                                      class_weights='auto',
                                      max_iterations=20)

### Output model

In [None]:
import datetime
print(datetime.datetime.now())

In [None]:
ypred = model.predict(test, 'probability')

submission = gl.SFrame()
submission['file'] = test['id'].apply(lambda x: x + '_raw_html.txt')
submission['sponsored'] = ypred 
#submission.save('submission_version_4.csv', format='csv')

submission = submission.to_dataframe()
submission.to_csv('submission_rf_word_set_word2vec.csv', index=False, float_format='%1.8f')

In [None]:
model.save('/mnt/sframe/model_RF_200_150_noword2vec')

In [None]:
test

# Experiment

### Split train into *train_train*/*train_cv*

In [None]:
features

In [None]:
train_train = train_train.dropna()
train_cv = train_cv.dropna()

In [None]:
model = gl.logistic_classifier.create(train_train, target='sponsored', 
                                      features=features + ['tfidf'],
                                      validation_set=train_cv,
                                      class_weights='auto',
                                      max_iterations=30,
                                      feature_rescaling=True,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
results = gl.SFrame()
results['id'] = train_cv['id']
results['actual'] = train_cv['sponsored']
results['predicted'] = model.predict(train_cv)

In [None]:
train_cv.unpack('tfidf')

In [None]:
# Split the mismatches out of `results` before reporting their sizes. FN and FP
# used to be defined only in the NEXT cell, so this cell failed on a fresh run.
# FN: actual label above the prediction; FP: actual label below it.
FN = results[results['actual'] > results['predicted']]
FP = results[results['actual'] < results['predicted']]
FN.shape, FP.shape

In [None]:
FN = results[results['actual'] > results['predicted']]
FP = results[results['actual'] < results['predicted']]

In [None]:
FN[720:730]

In [None]:
FP

In [None]:
model.evaluate(train_cv)

In [None]:
# ROC curve of `model` on train_cv, plus the area under it.
results = model.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
train_cv.remove_column('tfidf_hashed17')

In [None]:
svm_model = gl.svm_classifier.create(train_train, target='sponsored', 
                                      features=features + ['tfidf_hashed'],
                                      validation_set=train_cv,                                           
                                      class_weights='auto',
                                      max_iterations=15)

In [None]:
svm_model

In [None]:
# Hand-built ROC table for the SVM: score train_cv by margin, sort ascending,
# and take cumulative sums so that row i summarises rows 0..i of the ranking.
train_cv['margin'] = svm_model.predict(train_cv, output_type='margin')
preds = train_cv[['sponsored', 'margin']].sort('margin')
# The margin column was only needed to build `preds`; drop it again.
train_cv.remove_column('margin')

pd_preds = preds.to_dataframe()
pd_preds['number'] = 1.0  # one per row, so its cumsum is a running row count

pd_preds_cum = pd_preds.cumsum()

# The last cumulative row holds the grand totals.
total_positives = np.asarray(pd_preds_cum['sponsored'])[-1]
total = np.asarray(pd_preds_cum['number'])[-1]
total_negatives = total - total_positives

# Rows 0..i (lowest margins) count as predicted negative: the sponsored ones
# among them are false negatives, the rest true negatives.
pd_preds_cum['FN'] = pd_preds_cum['sponsored']
pd_preds_cum['TN'] = pd_preds_cum['number'] - pd_preds_cum['sponsored']

# Everything not yet counted is predicted positive.
pd_preds_cum['TP'] = total_positives - pd_preds_cum['FN']
pd_preds_cum['FP'] = total - total_positives - pd_preds_cum['TN']

pd_preds_cum['fpr'] = pd_preds_cum['FP'] / (pd_preds_cum['FP'] + pd_preds_cum['TN'])
pd_preds_cum['tpr'] = pd_preds_cum['TP'] / (pd_preds_cum['TP'] + pd_preds_cum['FN'])



In [None]:
# Plot the hand-built SVM ROC table and integrate the area under it.
a = pd_preds_cum

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
train_cv['margin'] = svm_model.predict(train_cv, output_type='margin')
preds = train_cv[['sponsored', 'margin']]
preds['margin'].show()

In [None]:
(preds[preds['margin'] < 55]['sponsored']).sum()

In [None]:
ts = np.arange(-22, 50, 0.1)
# NOTE(review): the original comprehension had no body (`[for t in ts]`), which
# is a syntax error. It is completed here with the per-threshold count from the
# preceding cell (sponsored rows with margin below t) -- confirm that this is
# the quantity that was meant to be swept.
[(preds[preds['margin'] < t]['sponsored']).sum() for t in ts]

In [None]:
svm_model.evaluate(train_cv)

In [None]:
# ROC curve of `svm_model` on train_cv, plus the area under it.
results = svm_model.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
results

# Save Datasets?

In [None]:
train_train = gl.load_sframe('/mnt/sframe/shinn_split_train_train')
train_cv = gl.load_sframe('/mnt/sframe/shinn_split_train_cv')

In [None]:
train_train.head()

# Junk

In [None]:
hash_encoder = gl.feature_engineering.create(train_train, FeatureHasher(features = ['tfidf'], num_bits=17, 
                                                                  output_column_name='tfidf_hashed_17'))
train_train['tfidf_hashed_17'] = hash_encoder.transform(train_train)['tfidf_hashed_17']
train_cv['tfidf_hashed_17'] = hash_encoder.transform(train_cv)['tfidf_hashed_17']

In [None]:
train_train.head()

In [None]:
model_15 = gl.logistic_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_15'],
                                      validation_set=train_cv,
                                      class_weights='auto',
                                      max_iterations=40,
                                      feature_rescaling=True,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
model_15

In [None]:
model_15.evaluate(train_cv)

In [None]:
# ROC curve of `model_15` on train_cv, plus the area under it.
results = model_15.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model_16 = gl.logistic_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_16'],
                                      validation_set=train_cv,
                                      class_weights='auto',
                                      max_iterations=20,
                                      feature_rescaling=True,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
model_16.evaluate(train_cv)

In [None]:
# ROC curve of `model_16` on train_cv, plus the area under it.
results = model_16.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model_17 = gl.logistic_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_17'],
                                      validation_set=train_cv,
                                      class_weights=None, #'auto',
                                      max_iterations=8,
                                      feature_rescaling=True,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
model_17 = gl.logistic_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_17'],
                                      validation_set=train_cv,
                                      class_weights=None, #'auto',
                                      max_iterations=7,
                                      feature_rescaling=True,
                                      l2_penalty=0.00,
                                      l1_penalty=0.00)

In [None]:
model_17.evaluate(train_cv)

In [None]:
# ROC curve of `model_17` on train_cv, plus the area under it.
results = model_17.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
gl.svm_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_15'],
                                      validation_set=train_cv,                                           
                                      class_weights='auto',
                                      max_iterations=40)

In [None]:
gl.svm_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_16'],
                                      validation_set=train_cv,                                           
                                      class_weights='auto',
                                      max_iterations=40)

In [None]:
gl.svm_classifier.create(train_train, target='sponsored', 
                                      features=['tfidf_hashed_17'],
                                      validation_set=train_cv,                                           
                                      class_weights='auto',
                                      max_iterations=40)

In [None]:
train.head()

# Continue with data

In [None]:
train = gl.load_sframe('/mnt/sframe/counts_and_tfidf_hashed_18_train')
test = gl.load_sframe('/mnt/sframe/counts_and_tfidf_hashed_18_test')

train_train = gl.load_sframe('/mnt/sframe/num_words_counts_and_tfidf_hashed_18_split_train_train')
train_cv = gl.load_sframe('/mnt/sframe/num_words_counts_and_tfidf_hashed_18_split_train_cv')

TRAIN, CV = train_cv.random_split(0.50, seed=113)

In [None]:
train.save('/mnt/sframe/shinn_train')
test.save('/mnt/sframe/shinn_test')

train_train.save('/mnt/sframe/shinn_split_train_train')
train_cv.save('/mnt/sframe/shinn_split_train_cv')
#train_train = gl.load_sframe('/mnt/sframe/tfidf_hashed_16_split_train_train')
#train_cv = gl.load_sframe('/mnt/sframe/tfidf_hashed_16_split_train_cv')

## Try classifiers

In [None]:
# model2 = gl.classifier.random_forest_classifier.create(train_train, target='sponsored',
#                                                       features=features + ['word2vec'],
#                                                       num_trees=10,
#                                                       max_depth=200,
#                                                       column_subsample=0.15,
#                                                       row_subsample=1.0,
#                                                       class_weights='auto',
#                                                       validation_set=train_cv)
model_boosted = gl.classifier.boosted_trees_classifier.create(train_train, target='sponsored',
                                                      features=features + ['word2vec'],
                                                      max_depth=6,
                                                      step_size=1.0,  #0.2
                                                      max_iterations=300,
                                                      column_subsample=0.25,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
# ROC curve of `model_boosted` on train_cv, plus the area under it.
results = model_boosted.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model1_word2vec = gl.classifier.random_forest_classifier.create(train_train, target='sponsored',
                                                      features=features + ['tfidf', 'word2vec'],
                                                      num_trees=90, #100,
                                                      max_depth=150,
                                                      column_subsample=0.45,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
# ROC curve of `model1_word2vec` on train_cv, plus the area under it.
results = model1_word2vec.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model1 = gl.classifier.random_forest_classifier.create(train_train, target='sponsored',
                                                      features=features + ['tfidf'], #, 'word2vec'],
                                                      num_trees=90, #100,
                                                      max_depth=150,
                                                      column_subsample=0.45,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
# ROC curve of `model1` on train_cv, plus the area under it.
results = model1.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model = gl.classifier.random_forest_classifier.create(train_train, target='sponsored',
                                                      features=features + ['tfidf'], #, 'word2vec'],
                                                      num_trees=10,
                                                      max_depth=150,
                                                      column_subsample=0.45,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
# ROC curve of `model` on train_cv, plus the area under it.
results = model.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model.get_feature_importance().print_rows(num_rows=30, num_columns=2) 

In [None]:
train_train['tfidf_nonone'] = train_train['tfidf'].apply(lambda x: x if x else {})
train_cv['tfidf_nonone'] = train_cv['tfidf'].apply(lambda x: x if x else {})

In [None]:
train_train['tfidf_nonone'] = train_train['tfidf_nonone'].fillna({})
train_cv['tfidf_nonone'] = train_cv['tfidf_nonone'].fillna({})

In [None]:
svm_model = gl.svm_classifier.create(train_train, target='sponsored', 
                                      features=features + ['tfidf_nonone'], #features + ['tfidf'],
                                      validation_set=train_cv,                                           
                                      class_weights='auto',
                                      max_iterations=40)

In [None]:
# Hand-built ROC table for the SVM: score train_cv by margin, sort ascending,
# and take cumulative sums so that row i summarises rows 0..i of the ranking.
train_cv['margin'] = svm_model.predict(train_cv, output_type='margin')
preds = train_cv[['sponsored', 'margin']].sort('margin')
# The margin column was only needed to build `preds`; drop it again.
train_cv.remove_column('margin')

pd_preds = preds.to_dataframe()
pd_preds['number'] = 1.0  # one per row, so its cumsum is a running row count

pd_preds_cum = pd_preds.cumsum()

# The last cumulative row holds the grand totals.
total_positives = np.asarray(pd_preds_cum['sponsored'])[-1]
total = np.asarray(pd_preds_cum['number'])[-1]
total_negatives = total - total_positives

# Rows 0..i (lowest margins) count as predicted negative: the sponsored ones
# among them are false negatives, the rest true negatives.
pd_preds_cum['FN'] = pd_preds_cum['sponsored']
pd_preds_cum['TN'] = pd_preds_cum['number'] - pd_preds_cum['sponsored']

# Everything not yet counted is predicted positive.
pd_preds_cum['TP'] = total_positives - pd_preds_cum['FN']
pd_preds_cum['FP'] = total_negatives - pd_preds_cum['TN']

pd_preds_cum['fpr'] = pd_preds_cum['FP'] / (pd_preds_cum['FP'] + pd_preds_cum['TN'])
pd_preds_cum['tpr'] = pd_preds_cum['TP'] / (pd_preds_cum['TP'] + pd_preds_cum['FN'])

#   show
a = pd_preds_cum

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model_boosted = gl.classifier.boosted_trees_classifier.create(train_train, target='sponsored',
                                                      features=features + ['tfidf_hashed_18'],
                                                      max_depth=6,
                                                      step_size=0.2,
                                                      max_iterations=300,
                                                      column_subsample=0.3,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
# ROC curve of `model_boosted` on train_cv, plus the area under it.
results = model_boosted.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
gl.boosted_trees_classifier.get_default_options()

In [None]:
TRAIN, CV = train_cv.random_split(0.50, seed=113)

In [None]:
CV.shape

In [None]:
model_boosted = gl.classifier.boosted_trees_classifier.create(TRAIN, target='sponsored',
                                                      features=features + ['tfidf_hashed_18'],
                                                      max_depth=6,
                                                      step_size=0.2,
                                                      max_iterations=500,
                                                      column_subsample=0.3,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=CV)

In [None]:
# ROC curve of `model_boosted` on CV, plus the area under it.
results = model_boosted.evaluate(CV, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
model_boosted = gl.classifier.boosted_trees_classifier.create(train_train, target='sponsored',
                                                      features=features + ['tfidf_hashed_18'],
                                                      max_depth=6,
                                                      step_size=0.2,
                                                      max_iterations=400,
                                                      column_subsample=0.4,
                                                      row_subsample=1.0,
                                                      class_weights='auto',
                                                      validation_set=train_cv)

In [None]:
train_cv_pred = gl.SFrame()
train_cv_pred['pred'] =model_boosted.predict(train_cv)
train_cv_pred['actual'] = train_cv['sponsored']
train_cv_pred['id'] = train_cv['id']

In [None]:
pred_over = train_cv_pred[train_cv_pred['pred'] > train_cv_pred['actual']]
pred_under = train_cv_pred[train_cv_pred['pred'] < train_cv_pred['actual']]

In [None]:
for z in list(pred_under.sample(0.02)['id'].apply(lambda x: 'aws s3 cp s3://sparkydotsdata/kaggle/native/orig/' + x + '_raw_html.txt ' + x +'raw_html')):
    print(z) 

In [None]:
# ROC curve of `model_boosted` on train_cv, plus the area under it.
results = model_boosted.evaluate(train_cv, metric='roc_curve')
a = results['roc_curve']

fpr = list(a['fpr'])
tpr = list(a['tpr'])
# Pin the first ROC point to (1, 1) before integrating.
fpr[0] = 1.0
tpr[0] = 1.0
fpr = np.array(fpr)
tpr = np.array(tpr)

# Trapezoid rule: strip width times the MEAN of its two end heights. Adding
# half the height difference (the earlier form) over-counts every strip and
# inflates the reported AUC.
AUC = np.sum((fpr[:-1] - fpr[1:]) * (tpr[:-1] + tpr[1:]) / 2.0)
plt.plot(fpr, tpr)
print('AUC = %f'%AUC)

In [None]:
def custom_evaluator(model, train, test):
    """Score ``model`` on ``test`` by the area under its ROC curve.

    Args:
        model: Fitted model exposing ``evaluate(data, metric='roc_curve')``
            whose result has a ``'roc_curve'`` entry with ``'fpr'`` and
            ``'tpr'`` columns.
        train: Unused here; kept so the three-argument signature handed to
            the grid search as ``evaluator`` stays unchanged.
        test: Data set the ROC curve is computed on.

    Returns:
        ``{'AUC': <float>}``.
    """
    roc = model.evaluate(test, metric='roc_curve')['roc_curve']

    fpr = list(roc['fpr'])
    tpr = list(roc['tpr'])
    # Pin the first ROC point to (1, 1) before integrating.
    fpr[0] = 1.0
    tpr[0] = 1.0
    fpr = np.array(fpr)
    tpr = np.array(tpr)

    # Trapezoid rule: strip width times the MEAN of its two end heights.
    # The earlier `tpr[:-1] + (tpr[:-1] - tpr[1:])/2` added the half-difference
    # instead of subtracting it, so a diagonal ROC scored 1.5 rather than 0.5.
    widths = fpr[:-1] - fpr[1:]
    mean_heights = (tpr[:-1] + tpr[1:]) / 2.0
    AUC = np.sum(widths * mean_heights)
    return {'AUC': AUC}

In [None]:
# Keep a handle on the previous grid-search job before the next cell rebinds it.
# NOTE(review): `job` is first assigned in the cell below, so on a fresh
# top-to-bottom run this line raises NameError.
job0 = job

In [None]:
params = dict([
        ('target', 'sponsored'),
        ('features', [features + ['tfidf_hashed_18']]),
        ('max_depth', [6]),
        ('step_size', [0.2]),
        ('max_iterations', [100, 150, 200]),
        ('column_subsample', [0.4]),
        ('validation_set', [None])
    ])

job = gl.grid_search.create((TRAIN, CV), 
                              gl.boosted_trees_classifier.create, 
                              params, 
                              evaluator=custom_evaluator)
job.get_results()

In [None]:
models = job.get_models()

In [None]:
results = job.get_results()

In [None]:
results = results.to_dataframe()

In [None]:
results.boxplot('AUC', by='max_iterations')

In [None]:
results.sort('AUC', ascending=False)

In [None]:
len(models)

In [None]:
job.get_metrics()

In [None]:
aa = scipy.stats.distributions.expon(.1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=0)
#train = df_full[df_full.sponsored.notnull()].fillna(0)
#test = df_full[df_full.sponsored.isnull() & df_full.file.isin(test_files)].fillna(0)
# clf.fit(train.drop(['file', 'sponsored'], 1), train.sponsored)

# print('--- Create predictions and submission')
# submission = test[['file']].reset_index(drop=True)
# submission['sponsored'] = clf.predict_proba(test.drop(['file', 'sponsored'], 1))[:, 1]
# submission.to_csv('native_btb_basic_submission.csv', index=False)

In [None]:
# Convert the unpacked frames to pandas for scikit-learn.
# NOTE(review): shiTRAIN / shiCV are first assigned by the unpack('shinn')
# cell further down, so this cell only works after that one has been run.
shiTRAIN = shiTRAIN.to_dataframe()
shiCV = shiCV.to_dataframe()

In [None]:
# NOTE(review): this cell was left unfinished (`shiTRAIN[col] =` had no
# right-hand side, which is a syntax error). It is completed as a zero-fill of
# missing values, mirroring the `.fillna(0)` in the commented-out sklearn
# recipe above -- confirm that this is the per-column step that was intended.
# shiTRAIN is a pandas DataFrame at this point, hence `.columns`.
for col in shiTRAIN.columns:
    shiTRAIN[col] = shiTRAIN[col].fillna(0)

In [None]:
clf.fit(shiTRAIN, shiTRAIN_label)

In [None]:
shiTRAIN = TRAIN.unpack('shinn')
shiCV = CV.unpack('shinn')

In [None]:
shiTRAIN_label = np.asarray(shiTRAIN['sponsored'])
shiCV_label = np.asarray(shiCV['sponsored'])

In [None]:
shiTRAIN_label = np.asarray(shiTRAIN_label, float)
shiCV_label = np.asarray(shiCV_label, float)

In [None]:
shiTRAIN_tf = shiTRAIN['tfidf5e5']
shiCV_tf = shiCV['tfidf5e5']

In [None]:
shiTRAIN.remove_columns(['tfidf5e5'])
shiCV.remove_columns(['tfidf5e5'])
# shiTRAIN.remove_columns(['text', 'bow', 'sponsored', 'id'])
# shiCV.remove_columns(['text', 'bow', 'sponsored', 'id'])

In [None]:
# Convert both unpacked frames to pandas and drop the raw 'text' column.
# The original first line (`to_dataframe drop(...)`) was a syntax error, and
# shiCV skipped the conversion although it is built the same way as shiTRAIN.
# NOTE(review): assumes both are still SFrames here -- confirm cell order.
shiTRAIN = shiTRAIN.to_dataframe().drop('text', axis=1)
shiCV = shiCV.to_dataframe().drop('text', axis=1)

In [None]:
shiTRAIN.head()