# Vowpal Wabbit
This script uses the [Vowpal Wabbit](http://hunch.net/~vw/) algorithm to predict the classes good or bad. The steps are:
    
    - load the data
    - create a combination of all the groups of columns
    - train and predict the model. Save the best and save the predictions to the Output folder
    - for the best combination of group, check other parameters like loss function and l1 and l2 regularization parameters
    
It was created a function called df_to_vw to save a Pandas DataFrame into VW file format. For the trainning and predicting part, the VW is called externally using Python's `subprocess` module that calls a `Sheel Script`.

In [1]:
import pandas as pd
import mylib.utils as mu
from mylib.utils import print_time
import sklearn.metrics as metrics
from itertools import combinations 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score
import shlex, subprocess

In [2]:
proc_name = 'VW_all'
vw_file_train = './Processed/data.vw.train.txt'
vw_file_val = './Processed/data.vw.val.txt'
vw_file_test = './Processed/data.vw.test.txt'
target = mu.load_target_data()

### Grid search different column groups

In [3]:
# groups to test and their combination
groups = ['balance', 'location', 'nu_info', 'personal', 'raw_scores', 'scores_class']
all_combinations = [ p for i in range(len(groups)) for p in combinations(groups, i)][1:]

In [4]:
# grid search the column groups
best_model = {'group': None, 'score_val': 0}
for comb in all_combinations:
    # load the data groups and split them
    data = mu.load_data(comb, del_key=False)
    X_train, X_val, y_train, y_val = mu.split_data(data, target)
    
    # join the y value and write the file in VW format
    vw_data = pd.concat([X_train, y_train], axis=1)
    vw_val = pd.concat([X_val, y_val], axis=1)
    print_time('Shape of train: {}, shape of test: {}'.format(vw_data.shape, vw_val.shape))
    mu.df_to_vw(vw_data, vw_file_train, 'target', id_col='customer_id')
    mu.df_to_vw(vw_val, vw_file_val, 'target', id_col='customer_id')
    #mu.df_to_vw(vw_test, vw_file_test, 'target', id_col='customer_id')
    
    # run the vw process externally
    print_time(comb) 
    p = subprocess.Popen(["./go_vw.sh", "10", ""], 
                     stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
    print_time(p.communicate()[0])

    # get the results to compare
    pred = pd.read_csv('./val.vw.txt', sep= ' ', header=None, names=['Pred', 'costumer_id'])
    pred.columns = ['Pred', 'customer_id']
    pred['preds'] = pred.Pred.apply(lambda l: 0 if l<0 else 1 )
    joined = pd.merge(pred, vw_val[['customer_id', 'target']],left_on='customer_id', 
                       right_on='customer_id', how='left')
    
    # print the results
    print_time('Validation scores')
    mu.classification_report_matrix(joined.target, joined.preds )
    score = mu.f1_scorer(joined.target, joined.preds)
    
    # save if this is the best model
    if score > best_model['score_val']:
        best_model = {'group': comb, 'score_val': score, 
                      'pred_val': joined.preds, 'model':mu.VowpalWabbit()}

02:52:00 10/08/15 BRT - Shape of train: (570, 11), shape of test: (64, 11)
02:52:00 10/08/15 BRT - File written ./Processed/data.vw.train.txt
02:52:00 10/08/15 BRT - File written ./Processed/data.vw.val.txt
02:52:00 10/08/15 BRT - ('balance',)
02:52:00 10/08/15 BRT - Removing old files
Starting to process
vw -d ./Processed/data.vw.train.txt -f normal.vw -c --passes 10 
RUNNING TRAIN --------------------------
RUNNING VALIDATION ---------------------------

02:52:00 10/08/15 BRT - Validation scores
02:52:00 10/08/15 BRT - 
              precision    recall  f1-score   support

          0       0.48      1.00      0.65        31
          1       0.00      0.00      0.00        33

avg / total       0.23      0.48      0.32        64

02:52:00 10/08/15 BRT - 
 [[31  0]
 [33  0]]
02:52:00 10/08/15 BRT - Shape of train: (570, 248), shape of test: (64, 248)
02:52:01 10/08/15 BRT - File written ./Processed/data.vw.train.txt
02:52:01 10/08/15 BRT - File written ./Processed/data.vw.val.txt
02

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best model: 

In [5]:
# print the best model
print_time('Best model {}:{}'.format(best_model['group'], best_model['score_val']))

02:53:01 10/08/15 BRT - Best model ('location', 'scores_class'):0.733790066063


Save the model for bagging

In [6]:
# print the best model
mu.save_predictions_from_model(best_model, 'vw_{}'.format(best_model['group']))

02:53:01 10/08/15 BRT - Saved ./Output/pred_vw_('location', 'scores_class')_733_VowpalWabbit.csv


Generate an output to the bagging

#### Grid search the other parms for the best columns combination

In [7]:
# tests different VW configs
loss = ['  ', '--loss_function logistic', '--loss_function hinge', '--loss_function quantile']
reg_l1 = ['  ', '--l1 0.001','--l1 0.01', '--l1 0.1','--l1 1']
reg_l2 = ['  ', '--l2 0.001','--l2 0.01', '--l2 0.1','--l2 1']
comb = ('balance', 'location', 'personal', 'raw_scores')
p_comb = [ (l,l1,l2) for l in loss for l1 in reg_l1 for l2 in reg_l2]

In [8]:
# load the data groups and split them
data = mu.load_data(best_model['group'], del_key=False)
X_train, X_val, y_train, y_val = mu.split_data(data, target)
    
# join the y value and write the file in VW format
vw_data = pd.concat([X_train, y_train], axis=1)
vw_val = pd.concat([X_val, y_val], axis=1)
print_time('Shape of train: {}, shape of test: {}'.format(vw_data.shape, vw_val.shape))
mu.df_to_vw(vw_data, vw_file_train, 'target', id_col='customer_id')
mu.df_to_vw(vw_val, vw_file_val, 'target', id_col='customer_id')

# restart best model
best_model_grid = {'group': None, 'score': 0}

for (loss,l1,l2) in p_comb:
    # split the params
    #loss = loss.split(' ')
    #l1 = l1.split(' ')
    #l2 = l2.split(' ')

    # run the vw process externally
    params = shlex.split("./go_vw.sh " + "15" + loss + ' ' + l1 + ' ' + l2)
    print params
    p = subprocess.Popen(params, 
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    print p.communicate()[0]

    # get the results to compare
    pred = pd.read_csv('./val.vw.txt', sep= ' ', header=None, names=['Pred', 'costumer_id'])
    pred.columns = ['Pred', 'customer_id']
    pred['preds'] = pred.Pred.apply(lambda l: 0 if l<0 else 1 )
    joined = pd.merge(pred, vw_val[['customer_id', 'target']],left_on='customer_id', 
                       right_on='customer_id', how='left')
    
    # print the results
    print_time('Validation scores')
    mu.classification_report_matrix(joined.target, joined.preds )
    score = mu.f1_scorer(joined.target, joined.preds)
    
    # save if this is the best model
    if score > best_model_grid['score']:
        best_model_grid = {'group': (loss,l1,l2), 'score': score}

02:53:01 10/08/15 BRT - Loaded info group scores_class, current shape of data (634, 251)
02:53:01 10/08/15 BRT - Shape of train: (570, 252), shape of test: (64, 252)
02:53:02 10/08/15 BRT - File written ./Processed/data.vw.train.txt
02:53:02 10/08/15 BRT - File written ./Processed/data.vw.val.txt
['./go_vw.sh', '15']
Removing old files
Starting to process
vw -d ./Processed/data.vw.train.txt -f normal.vw -c --passes 15
RUNNING TRAIN --------------------------
RUNNING VALIDATION ---------------------------

02:53:02 10/08/15 BRT - Validation scores
02:53:02 10/08/15 BRT - 
              precision    recall  f1-score   support

          0       0.46      0.52      0.48        31
          1       0.48      0.42      0.45        33

avg / total       0.47      0.47      0.47        64

02:53:02 10/08/15 BRT - 
 [[16 15]
 [19 14]]
['./go_vw.sh', '15', '--l2', '0.001']
Removing old files
Starting to process
vw -d ./Processed/data.vw.train.txt -f normal.vw -c --passes 15 --l2 0.001
RUNNING T

Checking if parameters achieved better results.

In [9]:
if best_model_grid['score'] > best_model['score']:
    print_time('Grid search for Vowpal Wabit achieved better results :') 
    print_time('last: {} grid: {}'.format(best_model['score'], best_model_grid['score']))
else:
    print_time('Grid search for Vowpal Wabit did NOT achieved better results') 

KeyError: 'score'

## Confidence interval


To make sure that the technique yelds good results, the best model is run a couple of times and the condifence interval is done

In [None]:
comb = best_model['group']
nr_runs = 30
results = {}
for i in range(nr_runs):
    # load the data groups and split them
    data = mu.load_data(best_model['group'], del_key=False)
    X_train, X_val, y_train, y_val = mu.split_data(data, target, seed=mu.random.randint(1,10000))

    # join the y value and write the file in VW format
    vw_data = pd.concat([X_train, y_train], axis=1)
    vw_val = pd.concat([X_val, y_val], axis=1)
    print_time('Shape of train: {}, shape of test: {}'.format(vw_data.shape, vw_val.shape))
    mu.df_to_vw(vw_data, vw_file_train, 'target', id_col='customer_id')
    mu.df_to_vw(vw_val, vw_file_val, 'target', id_col='customer_id')
    
    # run the vw process externally
    print(comb) 
    p = subprocess.Popen(["./go_vw.sh", "15"], 
                     stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
    print p.communicate()[0]

    # get the results to compare
    pred = pd.read_csv('./val.vw.txt', sep= ' ', header=None, names=['Pred', 'costumer_id'])
    pred.columns = ['Pred', 'customer_id']
    pred['preds'] = pred.Pred.apply(lambda l: 0 if l<0 else 1 )
    joined = pd.merge(pred, vw_val[['customer_id', 'target']],left_on='customer_id', 
                       right_on='customer_id', how='left')
    
    # print the results
    print_time('Validation scores')
    mu.classification_report_matrix(joined.target, joined.preds )
    score = mu.f1_scorer(joined.target, joined.preds)
    results[i] = score
    print 'Score for this run %s' %score
    

In [None]:
print 'Best Score:', mu.np.max([v for v in results.itervalues()])
print 'Worst Score:', mu.np.min([v for v in results.itervalues()])
print 'Mean:', mu.np.mean([v for v in results.itervalues()])
print 'Median:', mu.np.median([v for v in results.itervalues()])
print 'SD: ', mu.np.std([v for v in results.itervalues()])

## Obs:
A brief tutorial can be found here [Basic Tutorial](http://zinkov.com/posts/2013-08-13-vowpal-tutorial/)