In [15]:
import os
import time 
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import pickle

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MaxAbsScaler

from scipy import sparse 

Populating the interactive namespace from numpy and matplotlib


In [16]:
DATA_PATH = 'data/'

# Load the preprocessed training table (semicolon-separated) and replace every
# missing value with the placeholder string 'xxx', so downstream vectorizers
# always receive a string.
train_csv_path = os.path.join(DATA_PATH, 'train_preprocessed.csv')
data = pd.read_csv(train_csv_path, sep=';').fillna('xxx')

In [17]:
%%time
from sklearn.metrics import log_loss
import lightgbm as lgb

def get_lgb_holdout_score(data_sparse, Y, train_size=0.75, random_state=0,
                          num_boost_round=10000, early_stopping_rounds=10):
    """
    Train a LightGBM binary classifier on a train split and score it on a holdout split.

    input:
        data_sparse: csr sparse feature matrix, one row per sample
        Y: labels in a list/array/pd.Series format, aligned with the rows of data_sparse
        train_size: fraction of rows used for training (default 0.75, the rest is holdout)
        random_state: seed passed to train_test_split so the split is reproducible
        num_boost_round: upper bound on the number of boosting iterations
        early_stopping_rounds: stop when the holdout metric has not improved for this many rounds
    output: float, log loss on the holdout split, rounded to 4 decimal places

    The matrix is split into train and holdout parts (75% / 25% by default). A LightGBM
    model with otherwise default parameters is trained on the train part, with early
    stopping monitored on the holdout part. Because the same holdout is used both to pick
    the best iteration and to report the score, the returned value is optimistically
    biased; that is acceptable here, since the score is only used to compare feature
    representations against each other.
    """
    train, valid, ytrain, yvalid = train_test_split(
        data_sparse, Y, train_size=train_size, random_state=random_state)
    dtrain = lgb.Dataset(train, ytrain)
    # reference=dtrain makes the holdout set reuse the bin boundaries built on the train set.
    deval = lgb.Dataset(valid, yvalid, reference=dtrain)

    params = {'task': 'train','boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss'}

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from lgb.train
    # in LightGBM 4.0; on newer versions the equivalent is
    # callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False)].
    # Confirm against the installed LightGBM version before changing the call.
    gbm = lgb.train(params,
                    dtrain,
                    num_boost_round=num_boost_round,
                    valid_sets=[dtrain, deval],
                    verbose_eval=False,
                    early_stopping_rounds=early_stopping_rounds)
    # Predict with the iteration chosen by early stopping, not the last one trained.
    lgb_pred = gbm.predict(valid, num_iteration=gbm.best_iteration)
    return round(log_loss(yvalid, lgb_pred), 4)

CPU times: user 21 µs, sys: 48 µs, total: 69 µs
Wall time: 73 µs


I can think of four different sparse representations for this task:

1) The most obvious way: horizontally concatenate two sparse matrices (TfidfVectorizer output), each representing one question. We'll have **doubled dimensionality**, and the decision tree will have to make **two splits** in order to figure out whether the two questions have (or don't have) a particular word in common. That will likely work... ok.

2) We can also horizontally concatenate two sparse matrices (TfidfVectorizer output), one representing the intersection of the two questions, and the second their difference. **Dimensionality will be doubled**, just as in the previous case, but it'll take just **one split** for the decision tree to check whether the two questions have a certain word in common or not. This will likely work better than the first option.

3) Finally, we can take two sparse matrices, each one representing one question (CountVectorizer), and take their sum. If we do this with a binary CountVectorizer, we'll have a matrix with **single dimensionality** (which is good, especially for decision-tree based algorithms). Each element in the matrix will be one of three values:

    a) 0 - column word is in neither question.
    b) 1 - column word is in one question.
    c) 2 - column word is in both questions.
   
This way our decision tree will be able to check any word in **one split**, although the information about word counts will be lost.

4) The same as number 3, but before adding the two sparse matrices, we'll multiply the second one by 2. This way each element in the matrix will be one of four values:

    a) 0 - column word is in neither question.
    b) 1 - column word is in first question.
    c) 2 - column word is in second question.
    d) 3 - column word is in both questions.

----


Most likely, the best results will be yielded by the second or fourth option. Let's test them on the unprocessed (`_src`) data.

In [18]:
%%time
# Compare the four sparse representations on the same text column, using the
# holdout log loss of a default LightGBM model as the yardstick.
p_text = '_src'
corpus = data['q1'+p_text].tolist() + data['q2'+p_text].tolist()

# The TF-IDF vocabulary depends only on `corpus`, so it is fitted once and
# shared by the first and second options.
tfidf = TfidfVectorizer(max_df=0.8, min_df=3).fit(corpus)

# First option: [tfidf(q1) | tfidf(q2)]
data_sparse = sparse.hstack([tfidf.transform(data['q1'+p_text])
                            ,tfidf.transform(data['q2'+p_text])
                            ], format = 'csr')
print ('First option number of columns:', data_sparse.shape[1])
print ('First option log loss:', get_lgb_holdout_score(data_sparse, data.target), '\n')

# Second option: [tfidf(intersection) | tfidf(difference)]
data_sparse = sparse.hstack([tfidf.transform(data['inter'+p_text])
                            ,tfidf.transform(data['extra'+p_text])
                            ], format = 'csr')
print ('Second option number of columns:', data_sparse.shape[1])
print ('Second option log loss:', get_lgb_holdout_score(data_sparse, data.target), '\n')

# The binary bag-of-words vocabulary is likewise fitted once, and each question
# is transformed once; the third and fourth options only combine the two
# matrices differently.
CV = CountVectorizer(max_df=0.8, min_df=3, binary=True).fit(corpus)
q1_counts = CV.transform(data['q1'+p_text])
q2_counts = CV.transform(data['q2'+p_text])

# Third option: q1 + q2 -> values in {0, 1, 2}
data_sparse = (q1_counts + q2_counts).astype(float)
print ('Third option number of columns:', data_sparse.shape[1])
print ('Third option log loss:', get_lgb_holdout_score(data_sparse, data.target), '\n')

# Fourth option: q1 + 2*q2 -> values in {0, 1, 2, 3}
data_sparse = (q1_counts + q2_counts*2).astype(float)
print ('Fourth option number of columns:', data_sparse.shape[1])
print ('Fourth option log loss:', get_lgb_holdout_score(data_sparse, data.target), '\n')

First option number of columns: 79676
First option log loss: 0.3788 

Second option number of columns: 79676
Second option log loss: 0.3544 

Third option number of columns: 39838
Third option log loss: 0.3666 

Fourth option number of columns: 39838
Fourth option log loss: 0.3662 

CPU times: user 2h 40min 59s, sys: 15min 39s, total: 2h 56min 38s
Wall time: 37min 58s


The first option is the worst, and the second option yields the best results. Let's try different preprocessings with it.

In [5]:
# Evaluate the second representation ([tfidf(intersection) | tfidf(difference)])
# on each preprocessing variant. Every entry is (column suffix, label used in
# the printed report).
# The order matters: the loop must end on '_stem', because the following cell
# stacks the tag features onto whatever `data_sparse` is left behind here.
preprocessing_variants = [
    ('_nostops', 'no stopwords'),
    ('_src', 'unprocessed'),
    ('_stem', 'stemmed'),
]

for p_text, variant_label in preprocessing_variants:
    # The vocabulary is fitted on both questions of the current variant.
    corpus = data['q1'+p_text].tolist() + data['q2'+p_text].tolist()
    tfidf = TfidfVectorizer(max_df=0.8, min_df=3).fit(corpus)
    data_sparse = sparse.hstack([tfidf.transform(data['inter'+p_text])
                                ,tfidf.transform(data['extra'+p_text])
                                ], format = 'csr')
    print ('Second option ' + variant_label + ' log loss:', get_lgb_holdout_score(data_sparse, data.target), '\n')

Second option no stopwords log loss: 0.3775 

Second option unprocessed log loss: 0.3544 

Second option stemmed log loss: 0.3482 



Stemmed data shows the best results. Probably stopwords contain a significant amount of information, while raw questions, on the contrary, have too much noise. Let's try to concatenate tfidf on part-of-speech tags and rerun the model. Since data_sparse is already built on stemmed data, we don't have to refit tfidf on stemmed data again. Let's just concatenate the tags.

In [9]:
# Fit a separate TF-IDF vocabulary on the '_tags' columns of both questions.
p_text = '_tags'
corpus = data['q1' + p_text].tolist() + data['q2' + p_text].tolist()
tfidf = TfidfVectorizer(max_df=0.8, min_df=3)
tfidf.fit(corpus)

# `data_sparse` is carried over from the previous cell and is extended on the
# right with the tag features. Re-running this cell without re-running the
# previous one would append the tag columns a second time.
inter_tags = tfidf.transform(data['inter' + p_text])
extra_tags = tfidf.transform(data['extra' + p_text])
data_sparse = sparse.hstack([data_sparse, inter_tags, extra_tags], format='csr')

tags_score = get_lgb_holdout_score(data_sparse, data.target)
print ('Second option stemmed with tags log loss:', tags_score, '\n')

Second option stemmed with tags log loss: 0.3404 

