In [1]:
import pickle
import random

#import matplotlib
#import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

random.seed(42)
#% matplotlib inline

# Load data

In [2]:
train_df = pd.read_csv('../data/input/train.csv', index_col='id')

train_df['question1'].fillna('', inplace=True)
train_df['question2'].fillna('', inplace=True)

# filter out question shorter than 10 characters
train_df['q1_len'] = train_df['question1'].str.len()
train_df['q2_len'] = train_df['question2'].str.len()

train_df = train_df.loc[lambda df: (df['q1_len'] > 10) & (df['q2_len'] > 10)]

# subset data
#train_df = train_df.loc[0:199999]

train_df['is_duplicate'].value_counts()

0    254901
1    149257
Name: is_duplicate, dtype: int64

# Feature Creation

In [3]:
train_df['q1_len'] = train_df['question1'].str.len()
train_df['q2_len'] = train_df['question2'].str.len()

train_df['len_diff'] = abs(train_df['q1_len'] - train_df['q2_len'])

train_df['q1_n_words'] = train_df['question1'].apply(lambda row: len(row.split(" ")))
train_df['q2_n_words'] = train_df['question2'].apply(lambda row: len(row.split(" ")))

train_df['n_word_diff'] = abs(train_df['q1_n_words'] - train_df['q2_n_words'])

train_df['q1_mean_word_len'] = train_df['question1'].apply(lambda row: np.mean([len(i) for i in row.split(" ")]))
train_df['q2_mean_word_len'] = train_df['question2'].apply(lambda row: np.mean([len(i) for i in row.split(" ")]))

train_df['mean_word_len_diff'] = abs(train_df['q1_mean_word_len'] - train_df['q2_mean_word_len'])

# Logistic regression

On length, word count, and words shared

In [4]:
def normalized_word_share(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return 1.0 * len(w1 & w2)/(len(w1 | w2))

train_df['word_share'] = train_df.apply(normalized_word_share, axis=1)

In [5]:
X_train, X_val = train_test_split(train_df, 
                                  test_size=0.2, random_state=42, 
                                  stratify=train_df['is_duplicate'].values)

print(X_train.shape)
print(X_val.shape)

(323326, 15)
(80832, 15)


In [6]:
clf = LogisticRegression()

clf.fit(X_train.loc[:, ['len_diff', 'n_word_diff', 'word_share']].values, X_train['is_duplicate'].values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
pred = clf.predict_proba(X_val.loc[:, ['len_diff', 'n_word_diff', 'word_share']].values)

print(log_loss(X_val['is_duplicate'].values, pred[:, 0]))
print(log_loss(X_val['is_duplicate'].values, [0.999999 if i >= 0.5 else 0.000001 for i in pred[:, 0]]))

1.02459190113
8.92781599603


## Save model

In [8]:
pickle.dump(clf, open('../models/logistic_regression.pkl', 'wb'))

In [24]:
clf = pickle.load(open('../models/logistic_regression.pkl', 'rb'))