# ML Search

To check that the first search was sound, the search is run again. This time, the dataset that previously served as validation is used as the test set, and a new validation set is split off from the training data. The dataset is therefore split as follows:

    - Train_big
        - Train
        - Validation
    - Test

In [4]:
%pylab inline
import sklearn
import pandas as pd
import mylib.utils as mu
from itertools import combinations 
import seaborn as sns
from mylib.utils import print_time

Populating the interactive namespace from numpy and matplotlib


In [5]:
# Names of the feature groups that can be loaded and combined.
groups = ['balance', 'location', 'nu_info', 'personal', 'raw_scores', 'scores_class']
# Every non-empty subset of `groups`, ordered by size (1 .. len(groups)).
# The upper bound must be len(groups) + 1, otherwise the subset that contains
# all groups is never generated; starting at 1 skips the empty subset.
all_combinations = [p for i in range(1, len(groups) + 1) for p in combinations(groups, i)]
# Target values loaded through the project helper; the search cells pass them
# to mu.split_data together with the features of each combination.
target = mu.load_target_data()

### Search over all combinations

In [10]:
# Search every feature-group combination, keeping the run that scores best on
# the held-out test set as well as an index of all runs by test score.
proc_name = 'combination'
best_models = {}
best_run = None
best_score = 0
for comb in all_combinations:
    # Load only the feature groups that belong to this combination.
    print_time('Processing combination {}'.format(comb))
    data = mu.load_data(comb)

    # Set the test split aside before any model is trained.
    X_train, X_test, y_train, y_test = mu.split_data(data, target)
    print_time('Test shape {}'.format(X_test.shape))

    results = mu.train_regression(X_train, mu.np.ravel(y_train), scorer=mu.f1_scorer)

    # The first entry of `results` is used as the selected run
    # (presumably the best validation score -- confirm in mylib.utils).
    run = results[0][1]
    model = run['model']
    score_val = run['score_val']
    print_time('Val score {}'.format(score_val))

    # Evaluate the selected model against the test set and keep everything
    # needed to report on this run later.
    pred_test = model.predict(X_test)
    score_test = mu.f1_scorer(y_test, pred_test)
    run.update(comb=comb, score_test=score_test, pred_test=pred_test, y_test=y_test)

    # Remember the run with the highest test score seen so far.
    if score_test > best_score:
        best_score, best_run = score_test, run

    # Index every run by its test score (no threshold is applied).
    # NOTE(review): runs with an identical test score overwrite each other.
    best_models[score_test] = run

03:24:34 10/08/15 BRT - Processing combination ('balance',)
03:24:34 10/08/15 BRT - Test shape (64, 9)
03:24:34 10/08/15 BRT - Created train and validation
03:24:34 10/08/15 BRT - Size train: (513, 9) test:(57, 9)
03:24:34 10/08/15 BRT - Starting to train models
03:24:34 10/08/15 BRT - Took 0.248217105865 seconds
03:24:34 10/08/15 BRT - Val score 0.533245987553
03:24:34 10/08/15 BRT - Processing combination ('location',)
03:24:34 10/08/15 BRT - Test shape (64, 246)
03:24:34 10/08/15 BRT - Created train and validation
03:24:34 10/08/15 BRT - Size train: (513, 246) test:(57, 246)
03:24:34 10/08/15 BRT - Starting to train models
03:24:35 10/08/15 BRT - Took 0.998915910721 seconds
03:24:35 10/08/15 BRT - Val score 0.541062801932
03:24:35 10/08/15 BRT - Processing combination ('nu_info',)
03:24:35 10/08/15 BRT - Test shape (64, 8)
03:24:35 10/08/15 BRT - Created train and validation
03:24:35 10/08/15 BRT - Size train: (513, 8) test:(57, 8)
03:24:35 10/08/15 BRT - Starting to train models
03

In [7]:
# Summarise the best run: feature groups, fitted model, the three scores and
# the classification report on the test split.
print_time('Best run is:')
print_time('Comb: {}'.format(best_run['comb']))
print_time('Model: {}'.format(best_run['model']))
print_time('Scores: train - {} :: validation - {} :: test - {}'.format(
    *(best_run[key] for key in ('score_train', 'score_val', 'score_test'))))
mu.classification_report_matrix(best_run['y_test'], best_run['pred_test'])

03:16:54 10/08/15 BRT - Best run is:
03:16:54 10/08/15 BRT - Comb: ('balance', 'nu_info', 'raw_scores')
03:16:54 10/08/15 BRT - Model: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=56909, verbose=0,
           warm_start=False)
03:16:54 10/08/15 BRT - Scores: train - 1.0 :: validation - 0.591970121382 :: test - 0.615615615616


KeyError: 'y_test'

In [9]:
# Inspect which fields the best run carries.
list(best_run)

['X_val',
 'score_val',
 'X_train',
 'score_test',
 'score_train',
 'pred_test',
 'pred_train',
 'pred_val',
 'y_val',
 'y_tr',
 'comb',
 'model']

In [None]:
# Report `num_best` runs from the score-indexed `best_models`, in ascending
# order of test score.
num_best = 3
# NOTE(review): the slice skips the two highest test scores and shows the
# `num_best` runs right below them -- confirm that skipping two (rather than
# only the single best run) is intended.
for score in sorted(best_models.keys())[-num_best-2:-2]:
    top_run = best_models[score]
    print_time('Comb: {}'.format(top_run['comb']))
    print_time('Model: {}'.format(top_run['model']))
    # Report this run's own test score and predictions; the previous version
    # read them from `best_run`, so every iteration showed the same results.
    print_time('Scores: train - {} :: validation - {} :: test - {}'.
               format(top_run['score_train'], top_run['score_val'], top_run['score_test']))
    mu.classification_report_matrix(top_run['y_test'], top_run['pred_test'])

### Conclusion
SVC is still the best algorithm, but the best column configuration changed. The next three best algorithms also changed.

In [None]:
import shlex, subprocess

# VW-format input files, rewritten for every combination.
vw_file_train = './Processed/data.vw.train.txt'
vw_file_val = './Processed/data.vw.val.txt'
vw_file_test = './Processed/data.vw.test.txt'

# Prediction files read back after each external script has run
# (presumably written by ./go_vw.sh and ./predict_vw.sh -- confirm).
vw_pred_val = './val.vw.txt'
vw_pred_test = './pred.vw.txt'


def _run_script(cmd):
    """Run an external command, log its stdout and return its exit code.

    A non-zero exit code is logged together with the captured stderr so that
    a failed run (and the stale prediction file it would leave behind) does
    not go unnoticed. The failure is logged rather than raised so the search
    keeps going.
    """
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    print_time(out)
    if p.returncode != 0:
        print_time('Command {} exited with code {}: {}'.format(cmd, p.returncode, err))
    return p.returncode


def _load_vw_predictions(pred_file, truth):
    """Read a prediction file and attach the true target of each customer.

    pred_file -- space-separated file with two columns: raw prediction, customer id.
    truth     -- DataFrame holding at least the 'customer_id' and 'target' columns.

    Returns the predictions left-joined with `truth` on 'customer_id', with an
    extra 'preds' column holding the predicted class.
    """
    pred = pd.read_csv(pred_file, sep=' ', header=None, names=['Pred', 'customer_id'])
    # Negative raw predictions map to class 0, everything else to class 1.
    pred['preds'] = pred.Pred.apply(lambda l: 0 if l < 0 else 1)
    return pd.merge(pred, truth[['customer_id', 'target']], on='customer_id', how='left')


# Grid search over the column groups. The placeholder starts with no group so
# the cell does not depend on a `comb` left over from a previously run cell.
best_model = {'group': None, 'score_test': 0, 'pred_test': None,
              'score_val': 0, 'pred_val': None}
for comb in all_combinations:
    # Load the data groups, keeping the key column needed to join predictions.
    data = mu.load_data(comb, del_key=False)

    # Create the test set.
    X_train, X_test, y_train, y_test = mu.split_data(data, target)
    print_time('Test shape {}'.format(X_test.shape))

    # Create the validation set from the remaining training data.
    X_train, X_val, y_train, y_val = mu.split_data(X_train, y_train)

    # Join features and target, then write each split in VW format.
    vw_data = pd.concat([X_train, y_train], axis=1)
    vw_val = pd.concat([X_val, y_val], axis=1)
    vw_test = pd.concat([X_test, y_test], axis=1)

    print_time('Shape of train: {}, shape of validation: {}'.format(vw_data.shape, vw_val.shape))
    mu.df_to_vw(vw_data, vw_file_train, 'target', id_col='customer_id')
    mu.df_to_vw(vw_val, vw_file_val, 'target', id_col='customer_id')
    mu.df_to_vw(vw_test, vw_file_test, 'target', id_col='customer_id')

    # Run the VW process externally and score the validation predictions.
    print_time(comb)
    _run_script(["./go_vw.sh", "10", ""])
    joined = _load_vw_predictions(vw_pred_val, vw_val)

    print_time('Validation scores')
    mu.classification_report_matrix(joined.target, joined.preds)
    score_val = mu.f1_scorer(joined.target, joined.preds)

    # Run the prediction script and score the test predictions.
    print_time(comb)
    _run_script(["./predict_vw.sh"])
    joined_test = _load_vw_predictions(vw_pred_test, vw_test)

    print_time('Test scores')
    mu.classification_report_matrix(joined_test.target, joined_test.preds)
    score_test = mu.f1_scorer(joined_test.target, joined_test.preds)
    print_time('Score of test {}'.format(score_test))

    # Keep this combination if it has the best test score so far.
    if score_test > best_model['score_test']:
        best_model = {'group': comb, 'score_test': score_test, 'pred_test': joined_test.preds,
                      'score_val': score_val, 'pred_val': joined.preds}

In [None]:
# Show the best test score found by the VW search and the groups that produced it.
print(best_model['score_test'])
print(best_model['group'])