# Modeling

## Load libraries and utility functions.

In [1]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
# scikit-learn 0.23 removed the vendored sklearn.externals.joblib, so prefer
# the standalone joblib package and fall back to the vendored copy only on
# older installations where joblib is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

# Silence UserWarning messages that originate from the lightgbm module.
warnings.filterwarnings(action='ignore', category=UserWarning,
                        module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy, modeling time, and model size. The table below gives an idea of the relationship between the number of estimators and these metrics.

| Estimators | Run time (s) | Size (MB) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|-----------|------------|------------|------------|
|        100 |           40 |  2 | 25.02% | 38.72% | 47.83% |
|       1000 |          177 |  4 | 46.79% | 60.80% | 69.11% |
|       2000 |          359 |  7 | 51.38% | 65.93% | 73.09% |
|       4000 |          628 | 12 | 53.39% | 67.40% | 74.74% |

In [2]:
# --- Input files (resolved relative to args_inputs) ---
args_inputs = '.'
args_data = 'balanced_pairs_train.tsv'
args_test = 'balanced_pairs_test.tsv'

# --- Output files (resolved relative to args_outputs) ---
args_outputs = '.'
args_save = True               # write the fitted model to args_model
args_model = 'model.pkl'
args_instances = 'inst.txt'
args_labels = 'labels.txt'

# --- Training set-up ---
args_match = 20                # keep training rows whose `n` is below this
args_unweighted = False        # True gives every label a weight of 1
args_ngrams = 1                # upper bound of the TF-IDF n-gram range

# --- LightGBM settings ---
args_estimators = 4000
args_min_child_samples = 20
args_verbose = -1

# --- Evaluation ---
args_rank = 3                  # report Accuracy@1 .. Accuracy@args_rank

## Define the paths to the input and output data

In [3]:
# Root folders for everything read and written by this notebook.
inputs_path, outputs_path = args_inputs, args_outputs

# Make sure the outputs folder exists before anything is written to it.
os.makedirs(outputs_path, exist_ok=True)

# Files that are read: the training pairs and the test pairs.
data_path, test_path = (
    os.path.join(inputs_path, file_name)
    for file_name in (args_data, args_test))

# Files that are written: the model, the scored instances, and the labels.
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, file_name)
    for file_name in (args_model, args_instances, args_labels))

## Load and set up the training data

In [4]:
# Load the training data (tab-separated, latin1-encoded).
print('Reading {}'.format(data_path))
train = pd.read_csv(data_path, sep='\t', encoding='latin1')

# Show only the first rows instead of dumping the whole frame into the
# notebook output.
train.head(10)

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",1566595,can i use multiple versions of jquery on the s...,1566644,0,1
2,114525,336868,"the difference between the two functions? (""fu...",32584850,facebook js sdk's fb.api('/me') method doesn't...,32585470,0,2
3,114525,336868,"the difference between the two functions? (""fu...",1359018,"in jquery, how to attach events to dynamic htm...",9331127,0,3
4,114525,336868,"the difference between the two functions? (""fu...",2655925,how to apply !important using .css()?. i am ha...,8894528,0,4
5,114525,336868,"the difference between the two functions? (""fu...",20279484,how to access the correct `this` / context ins...,20279485,0,5
6,114525,336868,"the difference between the two functions? (""fu...",1144783,replacing all occurrences of a string in javas...,17606289,0,6
7,114525,336868,"the difference between the two functions? (""fu...",1398582,prevent execution of parent event handler. i h...,1398608,0,7
8,114525,336868,"the difference between the two functions? (""fu...",391979,get client ip using just javascript?. i need t...,810461,0,8
9,114525,336868,"the difference between the two functions? (""fu...",5223,"length of a javascript object (that is, associ...",6700,0,9


In [5]:
# Limit the number of training duplicate matches: keep rows whose `n` is
# below args_match. The explicit copy makes `train` an independent frame, so
# the column added to it later does not operate on a slice of the loaded
# data (which would raise pandas' SettingWithCopyWarning).
train = train[train.n < args_match].copy()

# The input data columns.
feature_columns = ['Text_x', 'Text_y']
label_column = 'Label'
group_column = 'Id_x'
answerid_column = 'AnswerId_y'
name_columns = ['Id_x', 'Id_y']

# Report on the dataset: row count and the mean of the label column
# (the share of matches, given that the label is 0/1).
print('train: {:,} rows with {:.2%} matches'.format(
    train.shape[0], train[label_column].mean()))

train: 132,500 rows with 5.00% matches


In [6]:
# Compute instance weights: a Series indexed by label value.
weight_column = 'Weight'
if args_unweighted:
    # A scalar is broadcast over the index, so every label gets weight 1.
    # (A one-element list would fail with a length mismatch as soon as there
    # is more than one distinct label.)
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Weight each label by n_rows / (n_labels * label_count), so that every
    # label contributes the same total weight regardless of its frequency.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0]/(label_counts.shape[0]*label_counts)
# Look up each row's weight by its label.
train[weight_column] = train[label_column].map(weight)

# Collect the ordered AnswerId.
labels = sorted(train[answerid_column].unique())
label_order = pd.DataFrame({'label': labels})

## Define the model.

In [7]:
# Pull the model inputs out of the training frame: the two text columns as
# features and the label column as the target ...
train_X, train_y = train[feature_columns], train[label_column]
# ... plus the per-row weights and the identifying Id columns.
sample_weight, names = train[weight_column], train[name_columns]

In [8]:
# Select the training hyperparameters.
n_estimators = args_estimators
min_child_samples = args_min_child_samples
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args_verbose)

# The vectorizer needs a (min_n, max_n) n-gram range, so reject a
# non-positive setting up front with a descriptive error.
if args_ngrams < 1:
    raise ValueError(
        'args_ngrams must be a positive integer, got {!r}'.format(args_ngrams))
ngram_range = (1, args_ngrams)

# The featurization pipeline(s) for each text column: select the column,
# then turn it into TF-IDF features. The per-column results are
# concatenated by the FeatureUnion.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns]
features = FeatureUnion(featurization)

# The model pipeline. Use the estimator configured above so that the
# min_child_samples and verbose settings actually take effect.
model = Pipeline([
    ('features', features),
    ('model', estimator)
])

## Fit the model.

In [9]:
%%time
# Fit the featurization steps and the classifier in one call. The `model__`
# prefix routes sample_weight to the fit method of the pipeline step named
# 'model'.
model.fit(train_X, train_y, model__sample_weight=sample_weight)

CPU times: user 21min 30s, sys: 2.21 s, total: 21min 33s
Wall time: 3min 58s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))])

Write the model to file.

In [10]:
# Persist the fitted pipeline (only when requested) and report its size on
# disk in mebibytes.
if args_save:
    joblib.dump(model, model_path)
    model_size_mb = os.path.getsize(model_path) / (2**20)
    print('{} size: {:.0f} MB'.format(model_path, model_size_mb))

./model.pkl size: 12 MB


## Test the model

In [11]:
# Load the test pairs from the tab-separated, latin1-encoded file.
print('Reading {}'.format(test_path))
test = pd.read_csv(test_path, sep='\t', encoding='latin1')

# Summarize the test set: number of rows and mean of the label column.
test_rows = test.shape[0]
test_match_rate = test[label_column].mean()
print('test {:,} rows with {:.2%} matches'.format(test_rows, test_match_rate))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


In [12]:
# Preview the first five rows of the test data.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,440494,169035,share constants between php and javascript. p...,168214,pass a php string to a javascript variable (an...,169035,1,0
1,440494,169035,share constants between php and javascript. p...,1129216,sort array of objects by string property value...,1129270,0,1
2,440494,169035,share constants between php and javascript. p...,5041494,selecting and manipulating css pseudo-elements...,5734583,0,2
3,440494,169035,share constants between php and javascript. p...,3163407,javascript and operator within assignment. i k...,3163422,0,3
4,440494,169035,share constants between php and javascript. p...,69913,why don't self-closing script tags work?. what...,69984,0,4


Collect the model predictions.

In [13]:
%%time
# Score every test pair using the same feature columns as in training.
test_X = test[feature_columns]
# Keep column 1 of predict_proba as the match score -- presumably the
# probability of Label == 1 (classes ordered [0, 1]); verify against the
# fitted classifier's classes_.
test['probabilities'] = model.predict_proba(test_X)[:, 1]

CPU times: user 2min 22s, sys: 615 ms, total: 2min 22s
Wall time: 44.8 s


In [14]:
# Preview the test data again, now including the 'probabilities' column.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,440494,169035,share constants between php and javascript. p...,168214,pass a php string to a javascript variable (an...,169035,1,0,0.971116
1,440494,169035,share constants between php and javascript. p...,1129216,sort array of objects by string property value...,1129270,0,1,0.000641
2,440494,169035,share constants between php and javascript. p...,5041494,selecting and manipulating css pseudo-elements...,5734583,0,2,0.000273
3,440494,169035,share constants between php and javascript. p...,3163407,javascript and operator within assignment. i k...,3163422,0,3,0.001303
4,440494,169035,share constants between php and javascript. p...,69913,why don't self-closing script tags work?. what...,69984,0,4,0.000346


Create a data frame with one row per duplicate question, and make it contain the model's predictions.

In [15]:
# Sort by duplicate Id and then by candidate AnswerId, so the scores inside
# each group follow AnswerId order.
test = test.sort_values([group_column, answerid_column])

# Collapse the scores of each duplicate question into one tuple, keeping
# the sorted order within the group and the groups' order of appearance.
probabilities = (
    test['probabilities']
    .groupby(test[group_column], sort=False)
    .apply(lambda scores: tuple(scores.values)))

In [16]:
# Build one record per duplicate question: its Id, its true AnswerId and its
# text, with the tuple of candidate scores attached by aligning on the
# duplicate Id index.
output_columns_x = ['Id_x', 'AnswerId_x', 'Text_x']
test_score = (
    test[output_columns_x]
    .drop_duplicates()
    .set_index(group_column)
    .assign(probabilities=probabilities)
    .reset_index())

# Drop the `_x` suffixes now that only the duplicate's own fields remain.
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

In [17]:
# Show only the first rows instead of dumping the whole frame into the
# notebook output.
test_score.head(10)

Unnamed: 0,Id,AnswerId,Text,probabilities
0,440494,169035,share constants between php and javascript. p...,"(0.00014768915602073943, 0.0033569558256017376..."
1,559752,242833,single quotes versus double quotes in js. pos...,"(2.7610195585357627e-05, 0.0006315978415349053..."
2,604860,1520853,interesting test of javascript regexp. i wrote...,"(7.072950994103214e-06, 1.4285941605650736e-05..."
3,643542,750506,doesn't javascript support closures with local...,"(0.0018551580861028074, 0.0017612695797268336,..."
4,951267,242833,"javascript formatting opinion: ' vs "". in look...","(1.6442363696513182e-05, 0.0001391814937541013..."
5,1688337,1771824,javascript if alternative. what does this bit ...,"(0.021356611762446152, 0.0015444223303615243, ..."
6,2009963,23740549,pass entire $_post variable to popup. i have a...,"(2.540176980694238e-05, 0.00013399239997736613..."
7,2039117,17606289,remove semicolon in string by javascript. does...,"(2.428053715090766e-05, 0.00044291818165422046..."
8,2296971,1085810,get selected item from the list with js. i hav...,"(0.0008084507919272385, 0.0006013838635226974,..."
9,2462800,695053,how to create a dynamic key to be added to a j...,"(0.0002856138461600747, 9.100848531635329e-05,..."


## Evaluate the predictions

In [18]:
# Score each duplicate question with the project helper label_rank, passing
# the true AnswerId, the tuple of candidate scores, and the ordered labels.
# NOTE(review): presumably the result is the 1-based rank of the true answer
# among the candidates -- confirm against label_rank.
test_score['Ranks'] = test_score.apply(
    lambda row: label_rank(row.AnswerId,
                           row.probabilities,
                           label_order.label),
    axis=1)

# Report the share of questions whose rank is at most i, for i = 1..args_rank,
# followed by the mean rank over all questions.
for i in range(1, args_rank+1):
    accuracy_at_i = (test_score['Ranks'] <= i).mean()
    print('Accuracy @{} = {:.2%}'.format(i, accuracy_at_i))
mean_rank = test_score['Ranks'].mean()
print('Mean Rank {:.4f}'.format(mean_rank))

# Write the scored instances and the label ordering as tab-separated files.
test_score.to_csv(instances_path, sep='\t', index=False,
                  encoding='latin1')
label_order.to_csv(labels_path, sep='\t', index=False)

Accuracy @1 = 53.64%
Accuracy @2 = 66.85%
Accuracy @3 = 74.07%
Mean Rank 5.2697
