# Modeling

## Load libraries and utility functions.

In [1]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

# Suppress UserWarning messages raised from the lightgbm module.
warnings.filterwarnings(action='ignore', category=UserWarning, module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy, modeling time, and model size. The table below gives an idea of how the number of estimators relates to these metrics.

| Estimators | Run time (s) | Size (MB) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|-----------|------------|------------|------------|
|        100 |           40 |  2 | 25.02% | 38.72% | 47.83% |
|       1000 |          177 |  4 | 46.79% | 60.80% | 69.11% |
|       2000 |          359 |  7 | 51.38% | 65.93% | 73.09% |
|       4000 |          628 | 12 | 53.39% | 67.40% | 74.74% |

In [2]:
# Input files, resolved relative to args_inputs.
args_inputs = '.'
args_data = 'balanced_pairs_train.tsv'
args_test = 'balanced_pairs_test.tsv'

# Output files, resolved relative to args_outputs.
args_outputs = '.'
args_save = True               # write the fitted model to args_model
args_model = 'model.pkl'
args_instances = 'inst.txt'
args_labels = 'labels.txt'

# Training options.
args_estimators = 4000         # n_estimators for the LightGBM classifier
args_min_child_samples = 20    # min_child_samples for the LightGBM classifier
args_ngrams = 1                # upper bound of the TF-IDF n-gram range
args_unweighted = False        # True gives every label the same weight
args_match = 20                # keep training rows with n < args_match
args_verbose = -1              # verbose setting for the LightGBM classifier

# Evaluation options.
args_rank = 3                  # report Accuracy @1 .. @args_rank

## Define the paths to the input and output data

In [3]:
# Root folders for reading and writing.
inputs_path, outputs_path = args_inputs, args_outputs

# Input files live under the inputs folder.
data_path, test_path = (
    os.path.join(inputs_path, file_name)
    for file_name in (args_data, args_test))

# Output files live under the outputs folder.
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, file_name)
    for file_name in (args_model, args_instances, args_labels))

# Make sure the outputs folder exists; an existing folder is not an error.
os.makedirs(outputs_path, exist_ok=True)

## Load and set up the training data

In [4]:
# Load the tab-separated training pairs (decoded as latin1).
print('Reading {}'.format(data_path))
train = pd.read_csv(data_path, sep='\t', encoding='latin1')

# Preview only the first rows instead of displaying the whole frame.
train.head(10)

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",1646698,what is the 'new' keyword in javascript?. the ...,3658673,0,1
2,114525,336868,"the difference between the two functions? (""fu...",8021436,turning live() into on() in jquery. my applica...,8021462,0,2
3,114525,336868,"the difference between the two functions? (""fu...",13840429,what is the difference between client-side and...,13840431,0,3
4,114525,336868,"the difference between the two functions? (""fu...",20035101,no 'access-control-allow-origin' header is pre...,20035319,0,4
5,114525,336868,"the difference between the two functions? (""fu...",2844565,is there a jquery dom change listener?. essent...,2844704,0,5
6,114525,336868,"the difference between the two functions? (""fu...",784929,what is the !! (not not) operator in javascrip...,784946,0,6
7,114525,336868,"the difference between the two functions? (""fu...",37684,how to replace plain urls with links?. i am us...,21925491,0,7
8,114525,336868,"the difference between the two functions? (""fu...",364952,jquery/javascript: accessing contents of an if...,364997,0,8
9,114525,336868,"the difference between the two functions? (""fu...",2100758,javascript or (||) variable assignment explana...,2100767,0,9


In [5]:
# Limit the number of training duplicate matches.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding columns to it later does not raise SettingWithCopyWarning.
train = train[train.n < args_match].copy()

# The input data columns.
feature_columns = ['Text_x', 'Text_y']
label_column = 'Label'
group_column = 'Id_x'
answerid_column = 'AnswerId_y'
name_columns = ['Id_x', 'Id_y']

# Report on the dataset: row count and the mean of the label column.
print('train: {:,} rows with {:.2%} matches'.format(
    train.shape[0], train[label_column].mean()))

train: 132,500 rows with 5.00% matches


In [6]:
# Compute instance weights: one weight per distinct label value.
weight_column = 'Weight'
if args_unweighted:
    # A scalar broadcasts over the index. A one-element list would raise
    # ValueError as soon as more than one distinct label is present.
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Balanced weights: n_rows / (n_labels * rows_with_label), so every
    # label contributes the same total weight.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0] / (label_counts.shape[0] * label_counts)

# Look up each row's weight by its label. assign() returns a new frame, so
# this never writes into a slice of a previously filtered frame.
train = train.assign(**{weight_column: train[label_column].map(weight)})

# Collect the ordered AnswerId.
labels = sorted(train[answerid_column].unique())
label_order = pd.DataFrame({'label': labels})

## Define the model.

In [7]:
# Split the training frame into the pieces used for fitting:
# the text features and the identifying columns ...
train_X, names = train[feature_columns], train[name_columns]
# ... and the target with its per-row weights.
train_y, sample_weight = train[label_column], train[weight_column]

In [8]:
# Select the training hyperparameters.
n_estimators = args_estimators
min_child_samples = args_min_child_samples
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args_verbose)

# The TF-IDF n-gram range is (1, args_ngrams); reject non-positive values
# explicitly instead of relying on an assert (asserts are skipped under -O).
if not args_ngrams > 0:
    raise ValueError(
        'args_ngrams must be a positive integer, got {!r}'.format(args_ngrams))
ngram_range = (1, args_ngrams)

# The featurization pipeline(s) for each text column: select the column,
# then vectorize it with TF-IDF. The per-column results are concatenated.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns]
features = FeatureUnion(featurization)

# The model pipeline. Use the configured estimator so that
# min_child_samples and verbose actually take effect; building a second
# classifier here would silently fall back to the library defaults.
model = Pipeline([
    ('features', features),
    ('model', estimator)
])

## Fit the model.

In [9]:
%%time
# Fit the featurization and the classifier. The 'model__' prefix routes
# sample_weight to the pipeline step named 'model'.
model.fit(train_X, train_y, model__sample_weight=sample_weight)

CPU times: user 32min 46s, sys: 4.9 s, total: 32min 51s
Wall time: 3min 7s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))])

Write the model to file.

In [10]:
# Persist the fitted pipeline only when saving is enabled, then report the
# size of the written file in MB.
if args_save:
    joblib.dump(model, model_path)
    size_mb = os.path.getsize(model_path) / 2**20
    print('{} size: {:.0f} MB'.format(model_path, size_mb))

./model.pkl size: 12 MB


## Test the model

In [11]:
# Read the tab-separated test pairs (decoded as latin1).
print('Reading {}'.format(test_path))
test = pd.read_csv(test_path, sep='\t', encoding='latin1')

# Report the row count and the mean of the label column.
n_test_rows = test.shape[0]
test_match_rate = test[label_column].mean()
print('test {:,} rows with {:.2%} matches'.format(n_test_rows, test_match_rate))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


In [12]:
# Preview the first rows of the test pairs.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,290214,14220323,in ajax how to retrive variable from inside of...,14220321,how do i return the response from an asynchron...,14220323,1,0
1,290214,14220323,in ajax how to retrive variable from inside of...,236073,why split the tag when writing it with docume...,236106,0,1
2,290214,14220323,in ajax how to retrive variable from inside of...,728360,most elegant way to clone a javascript object....,728694,0,2
3,290214,14220323,in ajax how to retrive variable from inside of...,1634268,explain javascript's encapsulated anonymous fu...,1634321,0,3
4,290214,14220323,in ajax how to retrive variable from inside of...,1458633,how to deal with floating point number precisi...,3439981,0,4


Collect the model predictions.

In [13]:
%%time
# Score every test pair and keep column 1 of predict_proba as the match
# probability (assumes column 1 corresponds to Label == 1 — TODO confirm).
test_X = test[feature_columns]
test['probabilities'] = model.predict_proba(test_X)[:, 1]

CPU times: user 2min 19s, sys: 664 ms, total: 2min 19s
Wall time: 43.8 s


In [14]:
# Preview the test pairs again, now with the 'probabilities' column.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,290214,14220323,in ajax how to retrive variable from inside of...,14220321,how do i return the response from an asynchron...,14220323,1,0,0.995786
1,290214,14220323,in ajax how to retrive variable from inside of...,236073,why split the tag when writing it with docume...,236106,0,1,2e-05
2,290214,14220323,in ajax how to retrive variable from inside of...,728360,most elegant way to clone a javascript object....,728694,0,2,0.000518
3,290214,14220323,in ajax how to retrive variable from inside of...,1634268,explain javascript's encapsulated anonymous fu...,1634321,0,3,0.000195
4,290214,14220323,in ajax how to retrive variable from inside of...,1458633,how to deal with floating point number precisi...,3439981,0,4,6e-05


Create a data frame with one row per duplicate question, and make it contain the model's predictions.

In [15]:
# Sort the rows by duplicate Id and then by candidate AnswerId, so each
# group's probabilities come out in AnswerId order.
test.sort_values(by=[group_column, answerid_column], inplace=True)

# Collect each group's probabilities, in the sorted row order, into a tuple.
grouped_probabilities = test['probabilities'].groupby(
    test[group_column], sort=False)
probabilities = grouped_probabilities.apply(
    lambda scores: tuple(scores.values))

In [16]:
# Get the individual records.
output_columns_x = ['Id_x', 'AnswerId_x', 'Text_x']
test_score = (test[output_columns_x]
              .drop_duplicates()
              .set_index(group_column))
test_score['probabilities'] = probabilities
test_score.reset_index(inplace=True)
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

In [17]:
# Preview only the first rows instead of displaying the whole frame.
test_score.head(10)

Unnamed: 0,Id,AnswerId,Text,probabilities
0,290214,14220323,in ajax how to retrive variable from inside of...,"(2.7841376733774188e-05, 1.558218944944856e-05..."
1,604860,1520853,interesting test of javascript regexp. i wrote...,"(2.0722676181687424e-05, 8.126328895045692e-05..."
2,643542,750506,doesn't javascript support closures with local...,"(0.000756403006026312, 0.00014112062139126527,..."
3,1262020,20279485,how can i maintain control of the this keyword...,"(7.91996843662999e-05, 6.0503548169761006e-06,..."
4,1297308,69984,weird javascript/jquery behavior. possible du...,"(7.229503986303964e-05, 0.0001358647862419197,..."
5,1300130,2067584,explanation and usage of jsonp. possible dupl...,"(4.545107627389494e-05, 0.00012704114084050974..."
6,1370376,30070207,prevent loss of variables when browser reload ...,"(0.0003976322130830671, 0.0018949811524387147,..."
7,1396307,1433217,javascript replace function won't remove the s...,"(2.7476787247593674e-06, 1.6167792146482174e-0..."
8,1552941,750506,how does a function in a loop (which returns a...,"(4.9642839369284144e-05, 2.125135815367454e-06..."
9,1691085,194399,obfuscating your jquery code. possible duplic...,"(7.52115372819383e-06, 5.1403788717857646e-06,..."


## Evaluate the predictions

In [18]:
def _correct_answer_rank(row):
    """Apply label_rank to one scored row.

    Passes the row's AnswerId, its probability tuple and the ordered labels
    to label_rank (imported helper; presumably returns the rank of the
    correct answer among the candidates — confirm against label_rank).
    """
    return label_rank(row.AnswerId, row.probabilities, label_order.label)


# Rank the correct answers, one rank per duplicate question.
test_score['Ranks'] = test_score.apply(_correct_answer_rank, axis=1)

# Report the share of questions whose rank is within the top k, for
# k = 1 .. args_rank, followed by the mean rank.
ranks = test_score['Ranks']
for k in range(1, args_rank + 1):
    within_top_k = (ranks <= k).mean()
    print('Accuracy @{} = {:.2%}'.format(k, within_top_k))
mean_rank = ranks.mean()
print('Mean Rank {:.4f}'.format(mean_rank))

# Write the scored instances and the label ordering as tab-separated files.
test_score.to_csv(instances_path, sep='\t', index=False,
                  encoding='latin1')
label_order.to_csv(labels_path, sep='\t', index=False)

Accuracy @1 = 52.66%
Accuracy @2 = 66.91%
Accuracy @3 = 74.43%
Mean Rank 5.0037
