# Modeling

## Load libraries and utility functions.

In [1]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

warnings.filterwarnings(action='ignore', category=UserWarning,
                        module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy against modeling time. The table below gives an idea of the relationship between the number of estimators, the run time, and the accuracy metrics.

| estimators | run time (s) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|----------|--------|--------|
|        100 |           40 |   25.02% | 38.72% | 47.83% |
|       1000 |          177 |   46.79% | 60.80% | 69.11% |
|       2000 |          359 |   51.38% | 65.93% | 73.09% |
|       4000 |          628 |   53.39% | 67.40% | 74.74% |

In [2]:
# --- Input locations -------------------------------------------------------
# Folder that holds the input files, and the train/test file names inside it.
args_inputs = '.'
args_data = 'balanced_pairs_train.tsv'
args_test = 'balanced_pairs_test.tsv'

# --- Output locations ------------------------------------------------------
# Folder that receives the outputs, and the file names written into it.
args_outputs = '.'
args_save = True                # write the fitted model to disk when True
args_model = 'model.pkl'        # serialized model pipeline
args_instances = 'inst.txt'     # scored test instances
args_labels = 'labels.txt'      # ordered AnswerId labels

# --- Training data ---------------------------------------------------------
args_match = 20                 # keep only training rows with n < args_match
args_unweighted = False         # True: uniform weights; False: per-label weights

# --- Model -----------------------------------------------------------------
args_estimators = 4000          # n_estimators passed to the LightGBM classifier
args_min_child_samples = 20     # min_child_samples for the LightGBM classifier
args_ngrams = 1                 # upper bound of the TF-IDF n-gram range (1, n)
args_verbose = -1               # verbose setting for the LightGBM classifier

# --- Evaluation ------------------------------------------------------------
args_rank = 3                   # report Accuracy@k for k = 1..args_rank

## Define the paths to the input and output data

In [3]:
# Resolve the input files relative to the inputs folder.
inputs_path = args_inputs
data_path, test_path = (
    os.path.join(inputs_path, filename)
    for filename in (args_data, args_test))

# Resolve the output files relative to the outputs folder.
outputs_path = args_outputs
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, filename)
    for filename in (args_model, args_instances, args_labels))

# Make sure the outputs folder exists before anything is written to it.
os.makedirs(outputs_path, exist_ok=True)

## Load and set up the training data

In [4]:
# Load the training data (tab-separated, decoded as latin1).
print('Reading {}'.format(data_path))
train = pd.read_csv(data_path, sep='\t', encoding='latin1')
# Preview only the first rows instead of rendering the whole frame.
train.head(10)

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",1873983,what does the leading semicolon in javascript ...,1873999,0,1
2,114525,336868,"the difference between the two functions? (""fu...",1026069,capitalize the first letter of string in javas...,1026087,0,2
3,114525,336868,"the difference between the two functions? (""fu...",3665115,"create a file in memory for user to download, ...",3665147,0,3
4,114525,336868,"the difference between the two functions? (""fu...",122102,what is the most efficient way to clone an obj...,122704,0,4
5,114525,336868,"the difference between the two functions? (""fu...",1144783,replacing all occurrences of a string in javas...,17606289,0,5
6,114525,336868,"the difference between the two functions? (""fu...",126100,how to efficiently count the number of keys/pr...,4889658,0,6
7,114525,336868,"the difference between the two functions? (""fu...",2194992,jquery - $ is not defined. i have a simple jqu...,2195167,0,7
8,114525,336868,"the difference between the two functions? (""fu...",750486,javascript closure inside loops â simple pra...,750506,0,8
9,114525,336868,"the difference between the two functions? (""fu...",1584370,how to merge two arrays in javascript and de-d...,1584377,0,9


In [5]:
# Limit the number of training duplicate matches: keep rows whose `n` value
# is below args_match.  NOTE(review): `n` looks like the position of the
# candidate within its Id_x group -- confirm against the data preparation.
# Take an explicit copy so that columns added to `train` afterwards are
# written to an independent frame rather than to a slice of the loaded data
# (which would trigger pandas' SettingWithCopyWarning).
train = train[train.n < args_match].copy()

# The input data columns.
feature_columns = ['Text_x', 'Text_y']
label_column = 'Label'
group_column = 'Id_x'
answerid_column = 'AnswerId_y'
name_columns = ['Id_x', 'Id_y']

# Report on the dataset: row count and the fraction of rows with Label == 1.
print('train: {:,} rows with {:.2%} matches'.format(
    train.shape[0], train[label_column].mean()))

train: 132,500 rows with 5.00% matches


In [7]:
# Compute instance weights.
weight_column = 'Weight'
if args_unweighted:
    # Uniform weighting: every label value gets weight 1.0.  A scalar is
    # broadcast over the index; passing the one-element list [1.0] with a
    # longer index raises a ValueError because the lengths do not match.
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Per-label weighting: n_rows / (n_labels * rows_with_that_label), so
    # each label value contributes the same total weight.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0]/(label_counts.shape[0]*label_counts)
# `weight` is indexed by label value, so mapping the label column through it
# looks up the weight of each row's label.
train[weight_column] = train[label_column].map(weight)

# Collect the ordered AnswerId.
labels = sorted(train[answerid_column].unique())
label_order = pd.DataFrame({'label': labels})

## Define the model.

In [8]:
# Split the training frame into the pieces used for fitting:
# the two text feature columns and the pair identifiers ...
train_X, names = train[feature_columns], train[name_columns]
# ... and the target labels with their per-row weights.
train_y, sample_weight = train[label_column], train[weight_column]

In [None]:
# Select the training hyperparameters and build the classifier once, so the
# same configured instance is the one that ends up in the pipeline.
n_estimators = args_estimators
min_child_samples = args_min_child_samples
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args_verbose)

# The TF-IDF vectorizers need a valid (min_n, max_n) range.  Fail with an
# explicit error instead of an `assert`, which is stripped under `python -O`.
if not args_ngrams > 0:
    raise ValueError(
        'args_ngrams must be a positive integer, got {!r}'.format(
            args_ngrams))
ngram_range = (1, args_ngrams)

# The featurization pipeline(s) for each text column: select the column,
# then turn it into TF-IDF features.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns]
features = FeatureUnion(featurization)

# The model pipeline.  Use the classifier configured above; constructing a
# fresh LGBMClassifier here would silently drop min_child_samples and verbose.
model = Pipeline([
    ('features', features),
    ('model', estimator)
])

## Fit the model.

In [9]:
%%time
# Fit the whole pipeline.  The `model__` prefix routes sample_weight to the
# fit() call of the pipeline step named 'model'.
model.fit(train_X, train_y, model__sample_weight=sample_weight)

CPU times: user 39min 47s, sys: 6.76 s, total: 39min 54s
Wall time: 3min 46s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...reg_lambda=0, seed=0, silent=True, subsample=1,
        subsample_for_bin=50000, subsample_freq=1))])

Write the model to file.

In [10]:
# Persist the fitted pipeline to model_path, but only when saving is enabled.
if args_save:
    joblib.dump(model, model_path)

## Test the model

In [15]:
# Load the test pairs (same tab-separated, latin1 format as the training set).
print('Reading {}'.format(test_path))
test = pd.read_csv(test_path, encoding='latin1', sep='\t')

# Summarize the test set: number of rows and share of rows labelled as a match.
print('test {:,} rows with {:.2%} matches'
      .format(len(test), test[label_column].mean()))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


In [17]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,247479,21925491,jquery text to link script?. does anyone know ...,37684,how to replace plain urls with links?. i am us...,21925491,1,0
1,247479,21925491,jquery text to link script?. does anyone know ...,307179,what is javascript's highest integer value tha...,307200,0,1
2,247479,21925491,jquery text to link script?. does anyone know ...,4851595,how to resolve the c:\fakepath?. this is my u...,4851614,0,2
3,247479,21925491,jquery text to link script?. does anyone know ...,5187530,variable variables in javascript?. i know it's...,5187652,0,3
4,247479,21925491,jquery text to link script?. does anyone know ...,28250680,how do i access previous promise results in a ...,28250697,0,4


Collect the model predictions.

In [18]:
%%time
# Score every test pair with the fitted pipeline.
test_X = test[feature_columns]
# Keep only column 1 of predict_proba -- presumably the probability of
# Label == 1 (a match); confirm against the classifier's classes_ order.
test['probabilities'] = model.predict_proba(test_X)[:, 1]

CPU times: user 2min 23s, sys: 684 ms, total: 2min 24s
Wall time: 45.3 s


In [19]:
# Preview the test data again, now including the 'probabilities' column.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,247479,21925491,jquery text to link script?. does anyone know ...,37684,how to replace plain urls with links?. i am us...,21925491,1,0,0.468283
1,247479,21925491,jquery text to link script?. does anyone know ...,307179,what is javascript's highest integer value tha...,307200,0,1,0.000189
2,247479,21925491,jquery text to link script?. does anyone know ...,4851595,how to resolve the c:\fakepath?. this is my u...,4851614,0,2,0.001017
3,247479,21925491,jquery text to link script?. does anyone know ...,5187530,variable variables in javascript?. i know it's...,5187652,0,3,0.000446
4,247479,21925491,jquery text to link script?. does anyone know ...,28250680,how do i access previous promise results in a ...,28250697,0,4,0.003316


Create a data frame with one row per duplicate question, and make it contain the model's predictions.

In [21]:
# Order the testing data by dupe Id and question AnswerId.
test.sort_values([group_column, answerid_column], inplace=True)

# Extract the ordered probabilities.
probabilities = (
    test.probabilities
    .groupby(test[group_column], sort=False)
    .apply(lambda x: tuple(x.values)))

In [22]:
# Get the individual records.
output_columns_x = ['Id_x', 'AnswerId_x', 'Text_x']
test_score = (test[output_columns_x]
              .drop_duplicates()
              .set_index(group_column))
test_score['probabilities'] = probabilities
test_score.reset_index(inplace=True)
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

In [23]:
# Preview only the first rows instead of rendering the whole frame.
test_score.head(10)

Unnamed: 0,Id,AnswerId,Text,probabilities
0,247479,21925491,jquery text to link script?. does anyone know ...,"(0.00016380278778036438, 0.002772127595526845,..."
1,338463,493018,how do i do a date comparison in javascript?. ...,"(0.00013938171375277149, 0.001323839281037983,..."
2,392470,23740549,passing a value from php to javascript. i have...,"(8.338713242995355e-07, 4.9814578975691474e-05..."
3,393479,23740549,best way to transfer an array between php and ...,"(0.00017402821021508124, 0.0001360680117759247..."
4,440494,169035,share constants between php and javascript. p...,"(3.215949785277046e-05, 0.003020275856288079, ..."
5,1283504,1267338,how can i create a zero-padded string represen...,"(0.11432843683370593, 0.0007863132635050472, 0..."
6,1413916,750506,javascript closure immediate evaluation. consi...,"(3.960694724097149e-06, 4.016946750950482e-05,..."
7,1579978,750506,javascript variable scope. i'm having a proble...,"(1.5228922607515007e-05, 1.6848806980754364e-0..."
8,1589234,805113,what's the cleanest way to write a multiline s...,"(0.0004446809683608513, 0.007257244001218062, ..."
9,1691085,194399,obfuscating your jquery code. possible duplic...,"(2.842100969380482e-05, 0.00020624472211889925..."


## Evaluate the predictions

In [24]:
# Rank the correct answers.
test_score['Ranks'] = test_score.apply(lambda x:
                                       label_rank(x.AnswerId,
                                                  x.probabilities,
                                                  label_order.label),
                                       axis=1)

# Compute the number of correctly ranked answers
for i in range(1, args_rank+1):
    print('Accuracy @{} = {:.2%}'.format(
        i, (test_score['Ranks'] <= i).mean()))
mean_rank = test_score['Ranks'].mean()
print('Mean Rank {:.4f}'.format(mean_rank))

# Write the scored instances.
test_score.to_csv(instances_path, sep='\t', index=False,
                  encoding='latin1')
label_order.to_csv(labels_path, sep='\t', index=False)

Accuracy @1 = 55.84%
Accuracy @2 = 70.40%
Accuracy @3 = 76.33%
Mean Rank 4.2391
