# Modeling

## Load libraries and utility functions.

In [11]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

# Silence UserWarning messages raised from the lightgbm module so they do
# not clutter the cell outputs.
warnings.filterwarnings(action='ignore', category=UserWarning,
                        module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy, modeling time, and model size. The table below gives an idea of the relationship between the number of estimators and these metrics.

| Estimators | Run time (s) | Size (MB) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|-----------|------------|------------|------------|
|        100 |           40 |  2 | 25.02% | 38.72% | 47.83% |
|       1000 |          177 |  4 | 46.79% | 60.80% | 69.11% |
|       2000 |          359 |  7 | 51.38% | 65.93% | 73.09% |
|       4000 |          628 | 12 | 53.39% | 67.40% | 74.74% |
|       8000 |          904 | 22 | 54.62% | 67.77% | 75.35% |

In [12]:
# Folders that hold the input data and receive the output files.
args_inputs = '.'
args_outputs = '.'

# Training and testing data files, read from the inputs folder.
args_data = 'balanced_pairs_train.tsv'
args_test = 'balanced_pairs_test.tsv'

# Files written to the outputs folder.
args_model = 'model.pkl'
args_instances = 'inst.txt'
args_labels = 'labels.txt'
args_save = True  # whether to write the fitted model to args_model

# Training data selection and weighting.
args_match = 20          # keep training rows whose `n` is below this value
args_unweighted = False  # False: weight rows inversely to label frequency

# Featurization and estimator hyperparameters.
args_ngrams = 1          # longest n-gram used by the TF-IDF features
args_estimators = 8000   # number of boosting estimators (see table above)
args_min_child_samples = 20
args_verbose = -1

# Evaluation: report Accuracy@1 through Accuracy@args_rank.
args_rank = 3

## Define the paths to the input and output data

In [13]:
# Resolve the training and testing files inside the inputs folder.
inputs_path = args_inputs
data_path, test_path = (
    os.path.join(inputs_path, file_name)
    for file_name in (args_data, args_test))

# Make sure the outputs folder exists, then resolve the files written to it.
outputs_path = args_outputs
os.makedirs(outputs_path, exist_ok=True)
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, file_name)
    for file_name in (args_model, args_instances, args_labels))

## Load and set up the training data

In [14]:
# Load the training data: tab-separated question pairs.
print('Reading {}'.format(data_path))
train = pd.read_csv(data_path, sep='\t', encoding='latin1')

# Preview only the first rows rather than rendering the whole frame.
train.head(10)

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,177867,122704,how do i copy the data of an element with jque...,122102,what is the most efficient way to clone an obj...,122704,1,0
1,177867,122704,how do i copy the data of an element with jque...,3665115,"create a file in memory for user to download, ...",3665147,0,1
2,177867,122704,how do i copy the data of an element with jque...,950087,include a javascript file in another javascrip...,950146,0,2
3,177867,122704,how do i copy the data of an element with jque...,2241875,how to create an object property from a variab...,2241883,0,3
4,177867,122704,how do i copy the data of an element with jque...,572897,how does javascript .prototype work?. i'm not ...,572996,0,4
5,177867,122704,how do i copy the data of an element with jque...,171251,how can i merge properties of two javascript o...,171256,0,5
6,177867,122704,how do i copy the data of an element with jque...,2846283,what are the rules for javascript's automatic ...,2846298,0,6
7,177867,122704,how do i copy the data of an element with jque...,14220321,how do i return the response from an asynchron...,14220323,0,7
8,177867,122704,how do i copy the data of an element with jque...,1885557,simplest code for array intersection in javasc...,1885660,0,8
9,177867,122704,how do i copy the data of an element with jque...,684672,loop through javascript object. i have a javas...,684692,0,9


In [15]:
# Limit the number of training duplicate matches. Take an explicit copy so
# that columns added to `train` afterwards are written to an independent
# frame instead of a slice of the loaded data (which would trigger pandas'
# SettingWithCopyWarning).
train = train[train.n < args_match].copy()

# The input data columns.
feature_columns = ['Text_x', 'Text_y']   # the two texts of each pair
label_column = 'Label'                   # 1 for a matching pair, else 0
group_column = 'Id_x'                    # Id of the duplicate question
answerid_column = 'AnswerId_y'           # AnswerId of the paired question
name_columns = ['Id_x', 'Id_y']

# Report on the dataset.
print('train: {:,} rows with {:.2%} matches'.format(
    train.shape[0], train[label_column].mean()))

train: 132,500 rows with 5.00% matches


In [16]:
# Compute instance weights: one weight per distinct label value.
weight_column = 'Weight'
if args_unweighted:
    # Every label gets weight 1.0. A scalar is broadcast over the index;
    # a one-element list would raise a length-mismatch ValueError as soon
    # as there is more than one distinct label.
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Balanced weights: n_rows / (n_labels * n_rows_with_that_label), so
    # each label contributes the same total weight.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0]/(label_counts.shape[0]*label_counts)
# Look up each row's weight by its label value.
train[weight_column] = train[label_column].map(weight)

# Collect the ordered AnswerId.
labels = sorted(train[answerid_column].unique())
label_order = pd.DataFrame({'label': labels})

## Define the model.

In [17]:
# Select and format the training data.
train_X = train[feature_columns]
train_y = train[label_column]
sample_weight = train[weight_column]
names = train[name_columns]

In [18]:
# Select the training hyperparameters and build the configured classifier.
n_estimators = args_estimators
min_child_samples = args_min_child_samples
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args_verbose)

# The TF-IDF features use n-grams of length 1..args_ngrams, so at least
# unigrams are required.
assert args_ngrams > 0, 'args_ngrams must be a positive integer'
ngram_range = (1, args_ngrams)

# The featurization pipeline(s) for each text column: select the column,
# then vectorize it with TF-IDF. The per-column features are concatenated.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns]
features = FeatureUnion(featurization)

# The model pipeline. Use the classifier configured above so that
# min_child_samples and verbose actually take effect; previously a second,
# unconfigured classifier was created here and `estimator` was ignored.
model = Pipeline([
    ('features', features),
    ('model', estimator)
])

## Fit the model.

In [19]:
%%time
# Fit the featurizers and the classifier. The `model__` prefix routes
# sample_weight to the fit method of the pipeline step named 'model'.
model.fit(train_X, train_y, model__sample_weight=sample_weight)

CPU times: user 40min 36s, sys: 3.31 s, total: 40min 39s
Wall time: 7min 21s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))])

Write the model to file.

In [20]:
# Persist the fitted pipeline and report its size on disk in MB.
if args_save:
    joblib.dump(model, model_path)
    model_size_mb = os.path.getsize(model_path) / 2**20
    print('{} size: {:.2f} MB'.format(model_path, model_size_mb))

./model.pkl size: 22.46 MB


## Test the model

In [21]:
# Read the test data and report its size and its share of matching pairs.
print('Reading {}'.format(test_path))
test = pd.read_csv(test_path, sep='\t', encoding='latin1')
n_test_rows = test.shape[0]
test_match_rate = test[label_column].mean()
print('test {:,} rows with {:.2%} matches'.format(
    n_test_rows, test_match_rate))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


In [22]:
# Preview the first rows of the test pairs.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",1129216,sort array of objects by string property value...,1129270,0,1
2,114525,336868,"the difference between the two functions? (""fu...",5187530,variable variables in javascript?. i know it's...,5187652,0,2
3,114525,336868,"the difference between the two functions? (""fu...",2320069,jquery ajax file upload. can i use the followi...,2320097,0,3
4,114525,336868,"the difference between the two functions? (""fu...",28250680,how do i access previous promise results in a ...,28250697,0,4


Collect the model predictions.

In [23]:
%%time
# Score every test pair. Column 1 of predict_proba is the probability of
# the second class in the classifier's class order, presumably Label == 1
# (a match); confirm against model.classes_.
test_X = test[feature_columns]
test['probabilities'] = model.predict_proba(test_X)[:, 1]

CPU times: user 3min 37s, sys: 605 ms, total: 3min 38s
Wall time: 1min 8s


In [24]:
# Preview the test pairs together with the new 'probabilities' column.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0,0.9999756
1,114525,336868,"the difference between the two functions? (""fu...",1129216,sort array of objects by string property value...,1129270,0,1,7.319658e-10
2,114525,336868,"the difference between the two functions? (""fu...",5187530,variable variables in javascript?. i know it's...,5187652,0,2,9.388601e-09
3,114525,336868,"the difference between the two functions? (""fu...",2320069,jquery ajax file upload. can i use the followi...,2320097,0,3,6.330419e-08
4,114525,336868,"the difference between the two functions? (""fu...",28250680,how do i access previous promise results in a ...,28250697,0,4,4.066033e-09


Create a data frame with one row per duplicate question, and make it contain the model's predictions.

In [None]:
# Order the testing data by dupe Id and question AnswerId so that, within
# each dupe, the probabilities follow ascending AnswerId order.
test.sort_values(by=[group_column, answerid_column], inplace=True)

# Collect each dupe's ordered probabilities into a single tuple, keeping
# the groups in the order they appear in the sorted frame.
probabilities = (
    test.groupby(group_column, sort=False)['probabilities']
    .apply(lambda scores: tuple(scores.values)))

In [26]:
# Build one record per dupe question: keep its distinct Id/AnswerId/Text,
# index by the dupe Id so the per-dupe probability tuples align on it,
# then restore the Id as a regular column.
output_columns_x = ['Id_x', 'AnswerId_x', 'Text_x']
test_score = (
    test[output_columns_x]
    .drop_duplicates()
    .set_index(group_column)
    .assign(probabilities=probabilities)
    .reset_index())

# Give the columns their output names.
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

In [27]:
# Preview the per-dupe records with their probability tuples.
test_score.head()

Unnamed: 0,Id,AnswerId,Text,probabilities
0,114525,336868,"the difference between the two functions? (""fu...","(9.382683066962518e-08, 4.478198950696976e-08,..."
1,209732,1520853,why am i seeing inconsistent javascript logic ...,"(3.113476542151973e-06, 9.428270499788823e-07,..."
2,562412,14220323,return value from function with an ajax call. ...,"(4.1066638336300696e-09, 7.95495801749694e-09,..."
3,565430,122704,(deep) copying an array using jquery. possibl...,"(4.807857343878882e-07, 5.3399109827360376e-11..."
4,832257,17606289,javascript multiple replace. how do you replac...,"(7.500307961024528e-05, 1.0218972073303199e-05..."


## Evaluate the predictions

In [28]:
# Rank the correct answer of every dupe among the ordered labels.
# NOTE(review): label_rank is a project helper; presumably it returns the
# 1-based rank of AnswerId when the labels are ordered by probability.
# Confirm in label_rank.py.
test_score['Ranks'] = test_score.apply(
    lambda row: label_rank(row.AnswerId,
                           row.probabilities,
                           label_order.label),
    axis=1)

# Report the share of dupes whose correct answer is ranked within the
# top 1..args_rank, and the mean rank.
ranks = test_score['Ranks']
for top_k in range(1, args_rank+1):
    accuracy_at_k = (ranks <= top_k).mean()
    print('Accuracy @{} = {:.2%}'.format(top_k, accuracy_at_k))
mean_rank = ranks.mean()
print('Mean Rank {:.4f}'.format(mean_rank))

# Write the scored instances and the label order as tab-separated files.
test_score.to_csv(instances_path, sep='\t', index=False,
                  encoding='latin1')
label_order.to_csv(labels_path, sep='\t', index=False)

Accuracy @1 = 54.62%
Accuracy @2 = 67.77%
Accuracy @3 = 75.35%
Mean Rank 4.5242
