# Modeling

## Load libraries and utility functions.

In [1]:
from __future__ import print_function
import os
import warnings
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.externals import joblib
from ItemSelector import ItemSelector
from label_rank import label_rank

# Ignore UserWarning messages attributed to the lightgbm module so they do not
# clutter the cell outputs.
warnings.filterwarnings(action='ignore', category=UserWarning, module='lightgbm')

## Define the input parameters.
One of the most important parameters is the number of estimators, which lets you trade off accuracy, modeling time, and model size. The table below gives an idea of the relationship between the number of estimators and these metrics.

| Estimators | Run time (s) | Size (MB) | Accuracy@1 | Accuracy@2 | Accuracy@3 |
|------------|--------------|-----------|------------|------------|------------|
|        100 |           40 |  2 | 25.02% | 38.72% | 47.83% |
|       1000 |          177 |  4 | 46.79% | 60.80% | 69.11% |
|       2000 |          359 |  7 | 51.38% | 65.93% | 73.09% |
|       4000 |          628 | 12 | 53.39% | 67.40% | 74.74% |

In [2]:
# File names of the training and test data, resolved against args_inputs.
args_data = 'balanced_pairs_train.tsv'
args_test = 'balanced_pairs_test.tsv'
# Number of boosting estimators passed to lgb.LGBMClassifier (see the table above).
args_estimators = 4000
# Upper bound of the TF-IDF n-gram range (1, args_ngrams); must be positive.
args_ngrams = 1
# When True every label gets weight 1.0; otherwise labels are weighted by
# inverse frequency.
args_unweighted = False
# Passed as min_child_samples to lgb.LGBMClassifier.
args_min_child_samples = 20
# Only training rows whose column n is below this value are kept.
args_match = 20
# Folders for the output files and the input files.
args_outputs = '.'
args_inputs = '.'
# Whether to write the fitted model to args_model.
args_save = True
# Output file names, resolved against args_outputs.
args_model = 'model.pkl'
args_instances = 'inst.txt'
args_labels = 'labels.txt'
# Accuracy@k is reported for k = 1 .. args_rank.
args_rank = 3
# Passed as verbose to lgb.LGBMClassifier.
args_verbose = -1

## Define the paths to the input and output data

In [3]:
# Folders the notebook reads from and writes to.
inputs_path, outputs_path = args_inputs, args_outputs

# Make sure the outputs folder exists before anything is written into it.
os.makedirs(outputs_path, exist_ok=True)

# Input files: the training pairs and the test pairs.
data_path, test_path = (
    os.path.join(inputs_path, file_name)
    for file_name in (args_data, args_test))

# Output files: the fitted model, the scored instances and the ordered labels.
model_path, instances_path, labels_path = (
    os.path.join(outputs_path, file_name)
    for file_name in (args_model, args_instances, args_labels))

## Load and set up the training data

In [4]:
# Load the training data: a tab-separated file decoded as latin1.
print('Reading {}'.format(data_path))
train = pd.read_csv(data_path, sep='\t', encoding='latin1')

# Show a bounded preview rather than the whole frame, so the rendered output
# does not depend on the pandas display options.
train.head(10)

Reading ./balanced_pairs_train.tsv


Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,177867,122704,how do i copy the data of an element with jque...,122102,what is the most efficient way to clone an obj...,122704,1,0
1,177867,122704,how do i copy the data of an element with jque...,205853,why would a javascript variable start with a d...,553734,0,1
2,177867,122704,how do i copy the data of an element with jque...,440739,what do parentheses surrounding a javascript o...,440772,0,2
3,177867,122704,how do i copy the data of an element with jque...,3034941,new myobject(); vs new myobject;. in some java...,3034952,0,3
4,177867,122704,how do i copy the data of an element with jque...,6259982,how to use the ?: (ternary) operator in javasc...,6260001,0,4
5,177867,122704,how do i copy the data of an element with jque...,850341,how do i work around javascript's parseint oct...,850346,0,5
6,177867,122704,how do i copy the data of an element with jque...,2844565,is there a jquery dom change listener?. essent...,2844704,0,6
7,177867,122704,how do i copy the data of an element with jque...,5627284,pass in an array of deferreds to $.when(). her...,5627301,0,7
8,177867,122704,how do i copy the data of an element with jque...,23392111,console.log() async or sync?. i am currently r...,23392650,0,8
9,177867,122704,how do i copy the data of an element with jque...,3595515,xmlhttprequest error: origin null is not allow...,3744697,0,9


In [5]:
# Limit the number of training duplicate matches.
# Take an explicit copy: the next cell adds a weight column to this frame, and
# writing into a boolean-mask slice can raise pandas' SettingWithCopyWarning.
train = train[train.n < args_match].copy()

# The input data columns.
feature_columns = ['Text_x', 'Text_y']   # text columns that get featurized
label_column = 'Label'                   # target column
group_column = 'Id_x'                    # column used to group rows per question
answerid_column = 'AnswerId_y'           # answer id of the candidate question
name_columns = ['Id_x', 'Id_y']          # id pair naming each row

# Report on the dataset.
print('train: {:,} rows with {:.2%} matches'.format(
    train.shape[0], train[label_column].mean()))

train: 132,500 rows with 5.00% matches


In [6]:
# Compute instance weights: one weight per distinct label value.
weight_column = 'Weight'
if args_unweighted:
    # Broadcast a scalar over the label values. A one-element list would raise
    # a length-mismatch ValueError as soon as there is more than one label.
    weight = pd.Series(1.0, index=train[label_column].unique())
else:
    # Inverse-frequency weights: n_rows / (n_labels * rows_with_label), so
    # every label contributes the same total weight.
    label_counts = train[label_column].value_counts()
    weight = train.shape[0]/(label_counts.shape[0]*label_counts)
# Look up each row's weight by its label value.
train[weight_column] = train[label_column].map(weight)

# Collect the ordered AnswerId.
labels = sorted(train[answerid_column].unique())
label_order = pd.DataFrame({'label': labels})

## Define the model.

In [7]:
# Pull the model inputs out of the training frame: the text features and the
# target, then the per-row weights and the id pair that names each row.
train_X, train_y = train[feature_columns], train[label_column]
sample_weight, names = train[weight_column], train[name_columns]

In [8]:
# Select the training hyperparameters.
n_estimators = args_estimators
min_child_samples = args_min_child_samples
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args_verbose)

# The TF-IDF vectorizers need a concrete (1, max_n) n-gram range, so a
# non-positive args_ngrams is rejected here.
if args_ngrams > 0:
    ngram_range = (1, args_ngrams)
else:
    ngram_range = None
assert ngram_range is not None, 'args_ngrams must be a positive integer'

# The featurization pipeline(s) for each text column: select the column, then
# turn it into TF-IDF features. The per-column outputs are concatenated.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns]
features = FeatureUnion(featurization)

# The model pipeline. Use the estimator configured above so min_child_samples
# and verbose take effect instead of being silently dropped. The step name
# 'model' is what the model__sample_weight fit parameter refers to.
model = Pipeline([
    ('features', features),
    ('model', estimator)
])

## Fit the model.

In [9]:
%%time
# Fit the featurization and the classifier. The model__ prefix routes
# sample_weight to the pipeline step named 'model'.
model.fit(train_X, train_y, model__sample_weight=sample_weight)

CPU times: user 29min 11s, sys: 3.64 s, total: 29min 15s
Wall time: 2min 46s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Text_x', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(keys='Text_x')), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))])

Write the model to file.

In [10]:
# Persist the fitted pipeline and report its size on disk (only when saving
# is enabled).
if args_save:
    joblib.dump(model, model_path)
    model_megabytes = os.path.getsize(model_path)/(2**20)
    print('{} size: {:.0f} MB'.format(model_path, model_megabytes))

./model.pkl size: 12 MB


## Test the model

In [11]:
# Load the test pairs from the tab-separated, latin1-encoded file.
print('Reading {}'.format(test_path))
test = pd.read_csv(test_path, sep='\t', encoding='latin1')

# Summarize the size of the test set and its share of matching pairs.
test_rows = test.shape[0]
test_match_rate = test[label_column].mean()
print('test {:,} rows with {:.2%} matches'.format(test_rows, test_match_rate))

Reading ./balanced_pairs_test.tsv
test 297,570 rows with 0.55% matches


In [12]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0
1,114525,336868,"the difference between the two functions? (""fu...",32584850,facebook js sdk's fb.api('/me') method doesn't...,32585470,0,1
2,114525,336868,"the difference between the two functions? (""fu...",2846283,what are the rules for javascript's automatic ...,2846298,0,2
3,114525,336868,"the difference between the two functions? (""fu...",3224834,get difference between 2 dates in javascript?....,3224854,0,3
4,114525,336868,"the difference between the two functions? (""fu...",610406,javascript equivalent to printf/string.format....,610415,0,4


Collect the model predictions.

In [13]:
%%time
# Score every test pair with the fitted pipeline.
test_X = test[feature_columns]
# Keep column 1 of the predict_proba output: presumably the probability of
# Label == 1 (a match). TODO confirm against the classifier's classes_.
test['probabilities'] = model.predict_proba(test_X)[:, 1]

CPU times: user 2min 18s, sys: 556 ms, total: 2min 19s
Wall time: 43.5 s


In [14]:
# Preview the test data again, now with the probabilities column.
test.head()

Unnamed: 0,Id_x,AnswerId_x,Text_x,Id_y,Text_y,AnswerId_y,Label,n,probabilities
0,114525,336868,"the difference between the two functions? (""fu...",336859,var functionname = function() {} vs function f...,336868,1,0,0.998699
1,114525,336868,"the difference between the two functions? (""fu...",32584850,facebook js sdk's fb.api('/me') method doesn't...,32585470,0,1,1.2e-05
2,114525,336868,"the difference between the two functions? (""fu...",2846283,what are the rules for javascript's automatic ...,2846298,0,2,0.000125
3,114525,336868,"the difference between the two functions? (""fu...",3224834,get difference between 2 dates in javascript?....,3224854,0,3,0.000121
4,114525,336868,"the difference between the two functions? (""fu...",610406,javascript equivalent to printf/string.format....,610415,0,4,5e-06


Create a data frame with one row per duplicate question, and make it contain the model's predictions.

In [15]:
# Sort the test rows by dupe Id, then by question AnswerId, so the
# probabilities inside each group follow the AnswerId order.
sort_keys = [group_column, answerid_column]
test.sort_values(sort_keys, inplace=True)


def _as_tuple(group_scores):
    """Pack one group's probabilities, in row order, into a tuple."""
    return tuple(group_scores.values)


# One tuple of ordered probabilities per dupe Id, keeping the sorted order.
grouped_scores = test.probabilities.groupby(test[group_column], sort=False)
probabilities = grouped_scores.apply(_as_tuple)

In [16]:
# Build one record per distinct (Id_x, AnswerId_x, Text_x) combination.
output_columns_x = ['Id_x', 'AnswerId_x', 'Text_x']
unique_questions = test[output_columns_x].drop_duplicates()

# Index by the group column so the per-group probability tuples align on it.
test_score = unique_questions.set_index(group_column)
test_score['probabilities'] = probabilities

# Back to a flat frame, with the "_x" suffixes dropped from the column names.
test_score = test_score.reset_index()
test_score.columns = ['Id', 'AnswerId', 'Text', 'probabilities']

In [17]:
# Show a bounded preview rather than the whole frame, so the rendered output
# does not depend on the pandas display options.
test_score.head(10)

Unnamed: 0,Id,AnswerId,Text,probabilities
0,114525,336868,"the difference between the two functions? (""fu...","(5.847837899421206e-06, 7.153227455363355e-05,..."
1,338463,493018,how do i do a date comparison in javascript?. ...,"(7.776176591148092e-05, 0.00011977398592736444..."
2,1242481,6055620,copy to clipboard using javascript. possible ...,"(1.2217124127371903e-05, 1.745021712056076e-06..."
3,1283504,1267338,how can i create a zero-padded string represen...,"(0.13034549002125007, 0.0030458619422580693, 0..."
4,1582634,750506,passing values to onclick. if i create a whole...,"(0.00015453635569203993, 9.497227670146427e-05..."
5,1589234,805113,what's the cleanest way to write a multiline s...,"(2.5109434841630697e-06, 6.298295034395161e-06..."
6,1871874,138233,"alternatives for using ""#"" in href attribute. ...","(8.57704948591212e-05, 0.00037922751624717965,..."
7,1916584,553734,jquery variable syntax. i'm learning jquery by...,"(0.00030920760553571476, 0.0030959165127625735..."
8,1955248,14220323,how to return variable from the function calle...,"(1.1074242681915092e-06, 2.8733587151238397e-0..."
9,2039117,17606289,remove semicolon in string by javascript. does...,"(0.0007006778622830255, 0.00029926215672953306..."


## Evaluate the predictions

In [18]:
def _rank_correct_answer(row):
    """Call label_rank with the row's AnswerId, its probabilities and the ordered labels."""
    return label_rank(row.AnswerId, row.probabilities, label_order.label)


# Rank the correct answers, one row at a time.
test_score['Ranks'] = test_score.apply(_rank_correct_answer, axis=1)
ranks = test_score['Ranks']

# Share of questions whose correct answer is ranked within the top k.
for k in range(1, args_rank+1):
    top_k_accuracy = (ranks <= k).mean()
    print('Accuracy @{} = {:.2%}'.format(k, top_k_accuracy))

# Average rank of the correct answer.
mean_rank = ranks.mean()
print('Mean Rank {:.4f}'.format(mean_rank))

# Write the scored instances, then the ordered labels, as tab-separated files.
test_score.to_csv(instances_path, sep='\t', index=False,
                  encoding='latin1')
label_order.to_csv(labels_path, sep='\t', index=False)

Accuracy @1 = 53.27%
Accuracy @2 = 68.13%
Accuracy @3 = 75.78%
Mean Rank 4.8312


Next, we will [develop the model API](02_Develop_Model_Driver.ipynb) to call our model.