# Modeling

## Load libraries and utility functions.

In [1]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

warnings.filterwarnings(action='ignore', category=UserWarning,
                        module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy, modeling time, and model size. The table below should give you an idea of the relationship between the number of estimators and these metrics.

| Estimators | Run time (s) | Size (MB) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|-----------|------------|------------|------------|
|        100 |           40 |  2 | 25.02% | 38.72% | 47.83% |
|       1000 |          177 |  4 | 46.79% | 60.80% | 69.11% |
|       2000 |          359 |  7 | 51.38% | 65.93% | 73.09% |
|       4000 |          628 | 12 | 53.39% | 67.40% | 74.74% |
|       8000 |          904 | 22 | 54.62% | 67.77% | 75.35% |

In [2]:
# --- Data files -------------------------------------------------------------
# The file of training data.
args_data = 'balanced_pairs_train.tsv'
# The file of testing data.
args_test = 'balanced_pairs_test.tsv'
# The maximum number of original questions per duplicate to use in the data.
args_match = 20

# --- Model hyperparameters --------------------------------------------------
# The number of estimators fit by LightGBM.
args_estimators = 8000
# The minimum number of samples in a leaf created by LightGBM.
args_min_child_samples = 20
# The progress report messages from LightGBM; "-1" means none.
args_verbose = -1
# The maximum size of ngrams created by TfidfVectorizer.
args_ngrams = 1
# Whether to ignore instance weights used to correct imbalance in training.
args_unweighted = False

# --- Folders and output files -----------------------------------------------
# The folder where this notebook deposits its outputs.
args_outputs = '.'
# The folder where this notebook picks up its inputs.
args_inputs = '.'
# Whether to save the model created by the notebook.
args_save = True
# The file containing the saved model.
args_model = 'model.pkl'
# The file containing the scored test data.
args_instances = 'inst.txt'
# The file containing the ordered unique ids of the original questions.
args_labels = 'labels.txt'

# --- Evaluation -------------------------------------------------------------
# The maximum position at which to report test set accuracy.
args_rank = 3

## Define paths to the notebook's input and output files

The training and testing datasets.

In [3]:
# Resolve both dataset files against the configured inputs folder.
inputs_path = args_inputs
data_path, test_path = (
    os.path.join(inputs_path, file_name)
    for file_name in (args_data, args_test))

The saved model file and the scored test data.

In [4]:
outputs_path = args_outputs
# Create the outputs folder; an already existing folder is not an error.
os.makedirs(outputs_path, exist_ok=True)
# Resolve every output file against the outputs folder.
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, file_name)
    for file_name in (args_model, args_instances, args_labels))

## Load and set up the training data

Load the training data, and display a sample of its contents.

In [5]:
# Announce the file being read, then load the tab-separated training data.
reading_message = 'Reading {}'.format(data_path)
print(reading_message)
train = pd.read_csv(filepath_or_buffer=data_path, sep='\t', encoding='latin1')
# The last expression is displayed as the cell's output: the first rows.
train.head()

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,177867,122704,how do i copy the data of an element with jque...,122102,what is the most efficient way to clone an obj...,122704,1,0
1,177867,122704,how do i copy the data of an element with jque...,3665115,"create a file in memory for user to download, ...",3665147,0,1
2,177867,122704,how do i copy the data of an element with jque...,950087,include a javascript file in another javascrip...,950146,0,2
3,177867,122704,how do i copy the data of an element with jque...,2241875,how to create an object property from a variab...,2241883,0,3
4,177867,122704,how do i copy the data of an element with jque...,572897,how does javascript .prototype work?. i'm not ...,572996,0,4


Limit the number of duplicate-original question matches.

In [6]:
# Keep only the rows whose match index `n` is below the configured limit.
# Take an explicit copy: a later cell adds a weight column to `train`, and
# assigning a column to a frame obtained by filtering another frame makes
# pandas emit SettingWithCopyWarning. Copying makes `train` an independent
# frame, so that assignment is unambiguous.
train = train.loc[train['n'] < args_match].copy()

Define the roles of the columns in the training data.

In [7]:
# Identifier columns: the duplicate question's id and the id of the original
# question's answer.
duplicates_id_column = 'Id_x'
answer_id_column = 'AnswerId_y'
# Model inputs (the two text columns) and the target column.
feature_columns = ['Text_x', 'Text_y']
label_column = 'Label'

Report on the training dataset: the number of rows and the proportion of true matches.

In [8]:
# Row count and the mean of the label column (the share of true matches).
train_row_count = train.shape[0]
train_match_rate = train[label_column].mean()
print('train: {:,} rows with {:.2%} matches'.format(
    train_row_count, train_match_rate))

train: 132,500 rows with 5.00% matches


Compute the instance weights used to correct for class imbalance in training.

In [9]:
# Build a per-label weight lookup (a Series indexed by label value) and attach
# the matching weight to every training row.
weight_column = 'Weight'
if args_unweighted:
    # Every label gets the same weight. The scalar 1.0 is broadcast over the
    # index of distinct labels; a one-element list would be rejected by pandas
    # as soon as more than one distinct label is present, because its length
    # would not match the index.
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Weight each label by n_rows / (n_labels * label_count), so that labels
    # with fewer rows receive proportionally larger weights.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0] / (label_counts.shape[0] * label_counts)
# Look up each row's weight by its label value.
train[weight_column] = train[label_column].map(weight)

Collect the unique ids that identify each original question's answer.

In [10]:
# Distinct answer ids found in the training data, in ascending order.
unique_answer_ids = train[answer_id_column].unique()
labels = sorted(unique_answer_ids)
# One-column frame holding that ordering; it is written to disk at the end.
label_order = pd.DataFrame(data={'label': labels})

## Define the model.

Collect the parts of the training data by role.

In [11]:
# Split the training frame by role: features, target, and per-row weights.
train_X, train_y, sample_weight = (
    train[feature_columns],
    train[label_column],
    train[weight_column],
)

Use the inputs to define the hyperparameters used in training.

In [12]:
n_estimators, min_child_samples = args_estimators, args_min_child_samples
# A positive ngram setting becomes the (min, max) range starting at unigrams;
# anything else yields None, which the validation cell that follows rejects.
ngram_range = (1, args_ngrams) if args_ngrams > 0 else None

Verify that the hyperparameter values are valid.

In [13]:
# At least one estimator, and more than one sample per leaf.
assert 0 < n_estimators
assert 1 < min_child_samples
# The ngram range must be a 2-tuple (min, max) ...
assert type(ngram_range) is tuple and len(ngram_range) == 2
# ... with a positive lower bound that does not exceed the upper bound.
assert 0 < ngram_range[0] <= ngram_range[1]

Define the pipeline that featurizes the text columns.

In [14]:
# One (name, pipeline) entry per text column: select the column, then turn it
# into TF-IDF features. FeatureUnion combines the outputs of all entries.
featurization = [
    (column_name,
     make_pipeline(
         ItemSelector(column_name),
         text.TfidfVectorizer(ngram_range=ngram_range),
     ))
    for column_name in feature_columns
]
features = FeatureUnion(transformer_list=featurization)

Define the estimator that learns how to classify duplicate-original question pairs.

In [15]:
# Gather the LightGBM settings in one place before building the classifier.
estimator_params = {
    'n_estimators': n_estimators,
    'min_child_samples': min_child_samples,
    'verbose': args_verbose,
}
estimator = lgb.LGBMClassifier(**estimator_params)

Define the model pipeline as feeding the features into the estimator.

In [16]:
# Two named steps: text featurization first, then the classifier.
pipeline_steps = [('features', features), ('model', estimator)]
model = Pipeline(steps=pipeline_steps)

## Fit the model.
This step should take about seven and a half minutes on a Standard NC6 DLVM.

In [17]:
%%time
# The 'model__' prefix routes sample_weight to the pipeline step named 'model'.
fit_params = {'model__sample_weight': sample_weight}
model.fit(train_X, train_y, **fit_params)

CPU times: user 42min 3s, sys: 4.13 s, total: 42min 7s
Wall time: 7min 38s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...a=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0, verbose=-1))])

Save the model to a file, and report on its size.

In [18]:
if args_save:
    # Persist the fitted pipeline, then report the file size in MB (2**20 bytes).
    joblib.dump(model, model_path)
    model_size_mb = os.path.getsize(model_path) / (2**20)
    print('{} size: {:.2f} MB'.format(model_path, model_size_mb))

./model.pkl size: 22.46 MB


## Test the model

Read in the test data set, and report on the number of its rows and the proportion of true matches.

In [19]:
# Announce the file being read, then load the tab-separated test data.
reading_message = 'Reading {}'.format(test_path)
print(reading_message)
test = pd.read_csv(filepath_or_buffer=test_path, sep='\t', encoding='latin1')
# Row count and the mean of the label column (the share of true matches).
test_row_count = test.shape[0]
test_match_rate = test[label_column].mean()
print('test {:,} rows with {:.2%} matches'.format(
    test_row_count, test_match_rate))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


Display a sample of its contents.

In [20]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",1129216,sort array of objects by string property value...,1129270,0,1
2,114525,336868,"the difference between the two functions? (""fu...",5187530,variable variables in javascript?. i know it's...,5187652,0,2
3,114525,336868,"the difference between the two functions? (""fu...",2320069,jquery ajax file upload. can i use the followi...,2320097,0,3
4,114525,336868,"the difference between the two functions? (""fu...",28250680,how do i access previous promise results in a ...,28250697,0,4


Collect the model predictions. This step should take about 1 minute on a Standard NC6 DLVM.

In [21]:
%%time
test_X = test[feature_columns]
# predict_proba returns one column per class; keep column 1, which for the
# 0/1 labels used here should be the probability of a match (label 1).
class_probabilities = model.predict_proba(test_X)
test['probabilities'] = class_probabilities[:, 1]

CPU times: user 3min 49s, sys: 529 ms, total: 3min 49s
Wall time: 1min 11s


Display the sample with the added probabilities column.

In [22]:
# Preview the first five rows again, now including the probabilities column.
test.head(n=5)

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0,0.9999756
1,114525,336868,"the difference between the two functions? (""fu...",1129216,sort array of objects by string property value...,1129270,0,1,7.319658e-10
2,114525,336868,"the difference between the two functions? (""fu...",5187530,variable variables in javascript?. i know it's...,5187652,0,2,9.388601e-09
3,114525,336868,"the difference between the two functions? (""fu...",2320069,jquery ajax file upload. can i use the followi...,2320097,0,3,6.330419e-08
4,114525,336868,"the difference between the two functions? (""fu...",28250680,how do i access previous promise results in a ...,28250697,0,4,4.066033e-09


Collect the probabilities for each duplicate question, ordered by the original question ids. 

In [23]:
# Sort the test rows in place by duplicate question id, then by the original
# question's answer id, so each group's probabilities follow answer-id order.
test.sort_values(by=[duplicates_id_column, answer_id_column], inplace=True)

# Group the probabilities by duplicate question (keeping the sorted order) and
# pack each group's values into a tuple.
grouped_probabilities = test['probabilities'].groupby(
    test[duplicates_id_column], sort=False)
probabilities = grouped_probabilities.apply(
    lambda group: tuple(group.values))

Create a data frame with one row per duplicate question, and make it contain the model's predictions for each duplicate.

In [24]:
# One row per distinct (id, answer id, text) of the duplicate questions.
duplicate_columns = ['Id_x', 'AnswerId_x', 'Text_x']
test_score = test[duplicate_columns].drop_duplicates()
# Index by duplicate id so the per-duplicate probability tuples align on it.
test_score = test_score.set_index(duplicates_id_column)
test_score['probabilities'] = probabilities
# Restore the id as a regular column and drop the '_x' suffixes.
test_score = test_score.reset_index()
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

Display a sample of its contents.

In [25]:
# Preview the first five scored duplicate questions.
test_score.head(n=5)

Unnamed: 0,Id,AnswerId,Text,probabilities
0,114525,336868,"the difference between the two functions? (""fu...","(9.382683054306137e-08, 4.478198947859503e-08,..."
1,209732,1520853,why am i seeing inconsistent javascript logic ...,"(3.11347654850818e-06, 9.428270494992024e-07, ..."
2,562412,14220323,return value from function with an ajax call. ...,"(4.106663843235513e-09, 7.954958014896923e-09,..."
3,565430,122704,(deep) copying an array using jquery. possibl...,"(4.807857345346937e-07, 5.339910996938693e-11,..."
4,832257,17606289,javascript multiple replace. how do you replac...,"(7.500307973518815e-05, 1.0218972114103849e-05..."


## Evaluate the predictions

For each duplicate question, find the rank of its correct original question.

In [26]:
# Apply the project helper label_rank row by row, passing the row's correct
# answer id, its probability tuple, and the ordered labels.
# NOTE(review): label_rank is defined outside this notebook; presumably it
# returns the rank of the correct answer among the scored labels. Confirm
# against its definition.
test_score['Ranks'] = test_score.apply(
    lambda row: label_rank(row.AnswerId,
                           row.probabilities,
                           label_order.label),
    axis=1)

Compute the fraction of correct original questions by minimum rank. Also print the average rank of the correct original questions.

In [27]:
ranks = test_score['Ranks']
# Accuracy @k is the share of duplicates whose rank is at most k.
for cutoff in range(1, args_rank + 1):
    accuracy_at_cutoff = (ranks <= cutoff).mean()
    print('Accuracy @{} = {:.2%}'.format(cutoff, accuracy_at_cutoff))
mean_rank = ranks.mean()
print('Mean Rank {:.4f}'.format(mean_rank))

Accuracy @1 = 54.62%
Accuracy @2 = 67.77%
Accuracy @3 = 75.35%
Mean Rank 4.5235


Write the scored instances to a file, along with the ordered original questions' answer ids.

In [28]:
# Both files are tab-separated and written without the index column.
tsv_options = {'sep': '\t', 'index': False}
test_score.to_csv(instances_path, encoding='latin1', **tsv_options)
label_order.to_csv(labels_path, **tsv_options)