Importing data 

In [7]:
import numpy as np
import pandas as pd
## edit the link to be shorter and accessible from root folder
train = pd.read_csv("data/train.csv",header=None,names=['id','qid1','qid2','question1','question2','is_duplicate'])
test = pd.read_csv("data/test.csv",header=None,names=['test_id','qid1','qid2','question1','question2','is_duplicate'])

Remarque : Les données fournies ont l'air complètes et il y a pas l'air d'y avoir des fautes d'orthographe.

Implementing a first method : Random forest with basic features

In [8]:
#Defining the metric that they want us to use : weighted_log_loss
def weighted_log_loss(y_true, y_pred):
    a = 0.165/0.37
    b = (1-0.165)/(1-0.37)
    score = a*y_true*np.log(y_pred+0.00001) + b*(1.0 - y_true)*np.log(1.0 - y_pred+0.00001)
    return -np.mean(score)

In [9]:
#Getting the raw features that we will be working on :
train_X, train_y = train.iloc[:,:-1], train.iloc[:,-1]
test_X = test.iloc[:,:-1]

In [10]:
#Getting non-nlp features
from collections import defaultdict
import numpy as np
import pandas as pd
import networkx as nx

NB_CORES = 10
FREQ_UPPER_BOUND = 100
NEIGHBOR_UPPER_BOUND = 5


def create_question_hash(train_df, test_df):
    train_qs = np.dstack([train_df["question1"], train_df["question2"]]).flatten()
    test_qs = np.dstack([test_df["question1"], test_df["question2"]]).flatten()
    all_qs = np.append(train_qs, test_qs)
    all_qs = pd.DataFrame(all_qs)[0].drop_duplicates()
    all_qs.reset_index(inplace=True, drop=True)
    question_dict = pd.Series(all_qs.index.values, index=all_qs.values).to_dict()
    return question_dict


def get_hash(df, hash_dict):
    df["qid1"] = df["question1"].map(hash_dict)
    df["qid2"] = df["question2"].map(hash_dict)
    return df.drop(["question1", "question2"], axis=1)


def get_kcore_dict(df):
    g = nx.Graph()
    g.add_nodes_from(df.qid1)
    edges = list(df[["qid1", "qid2"]].to_records(index=False))
    g.add_edges_from(edges)
    g.remove_edges_from(g.selfloop_edges())

    df_output = pd.DataFrame(data=g.nodes(), columns=["qid"])
    df_output["kcore"] = 0
    for k in range(2, NB_CORES + 1):
        ck = nx.k_core(g, k=k).nodes()
        print("kcore", k)
        df_output.ix[df_output.qid.isin(ck), "kcore"] = k

    return df_output.to_dict()["kcore"]


def get_kcore_features(df, kcore_dict):
    df["kcore1"] = df["qid1"].apply(lambda x: kcore_dict[x])
    df["kcore2"] = df["qid2"].apply(lambda x: kcore_dict[x])
    return df


def convert_to_minmax(df, col):
    sorted_features = np.sort(np.vstack([df[col + "1"], df[col + "2"]]).T)
    df["min_" + col] = sorted_features[:, 0]
    df["max_" + col] = sorted_features[:, 1]
    return df.drop([col + "1", col + "2"], axis=1)


def get_neighbors(train_df, test_df):
    neighbors = defaultdict(set)
    for df in [train_df, test_df]:
        for q1, q2 in zip(df["qid1"], df["qid2"]):
            neighbors[q1].add(q2)
            neighbors[q2].add(q1)
    return neighbors


def get_neighbor_features(df, neighbors):
    common_nc = df.apply(lambda x: len(neighbors[x.qid1].intersection(neighbors[x.qid2])), axis=1)
    min_nc = df.apply(lambda x: min(len(neighbors[x.qid1]), len(neighbors[x.qid2])), axis=1)
    df["common_neighbor_ratio"] = common_nc / min_nc
    df["common_neighbor_count"] = common_nc.apply(lambda x: min(x, NEIGHBOR_UPPER_BOUND))
    return df


def get_freq_features(df, frequency_map):
    df["freq1"] = df["qid1"].map(lambda x: min(frequency_map[x], FREQ_UPPER_BOUND))
    df["freq2"] = df["qid2"].map(lambda x: min(frequency_map[x], FREQ_UPPER_BOUND))
    return df


print("Hashing the questions...")
question_dict = create_question_hash(train, test)
train = get_hash(train, question_dict)
test = get_hash(test, question_dict)
print("Number of unique questions:", len(question_dict))


#print("Calculating kcore features...")
all_df = pd.concat([train, test])
#kcore_dict = get_kcore_dict(all_df)
#train = get_kcore_features(train, kcore_dict)
#test = get_kcore_features(test, kcore_dict)
#train = convert_to_minmax(train, "kcore")
#test = convert_to_minmax(test, "kcore")


print("Calculating common neighbor features...")
neighbors = get_neighbors(train, test)
train = get_neighbor_features(train, neighbors)
test = get_neighbor_features(test, neighbors)

print("Calculating frequency features...")
frequency_map = dict(zip(*np.unique(np.vstack((all_df["qid1"], all_df["qid2"])), return_counts=True)))
train = get_freq_features(train, frequency_map)
test = get_freq_features(test, frequency_map)
train = convert_to_minmax(train, "freq")
test = convert_to_minmax(test, "freq")

Hashing the questions...
Number of unique questions: 58869
Calculating common neighbor features...
Calculating frequency features...


In [20]:
train_non_nlp_features = train.drop(["id","qid1","qid2","is_duplicate"],axis=1)
test_non_nlp_features = test.drop(["test_id","qid1","qid2","is_duplicate"],axis=1)

In [23]:
test_non_nlp_features.head()

Unnamed: 0,common_neighbor_ratio,common_neighbor_count,min_freq,max_freq
0,0.833333,5,6,6
1,0.947368,5,19,21
2,0.947368,5,19,19
3,0.875,5,8,8
4,0.785714,5,14,15


In [24]:
train_non_nlp_features.head()

Unnamed: 0,common_neighbor_ratio,common_neighbor_count,min_freq,max_freq
0,0.0,0,2,5
1,0.0,0,1,5
2,0.0,0,1,4
3,0.75,3,4,4
4,0.0,0,1,10


In [15]:
#non-nlp features extraction
import re
import pandas as pd
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
import distance

SAFE_DIV = 0.0001
STOP_WORDS = stopwords.words("english")

#Basic preprocessing :
def preprocess(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    return x

#Getting token features
def get_token_features(q1, q2):
    token_features = [0.0]*10

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    token_features[9] = (len(q1_tokens) + len(q2_tokens))/2
    return token_features


def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)

#LA méthode qui extrait les features depuis la dataframe
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df

print("Extracting features for train:")
train_df = extract_features(train_X)
train_df.drop(["id", "qid1", "qid2", "question1", "question2"], axis=1, inplace=True)
#train_df.to_csv("data/nlp_features_train.csv", index=False)

print("Extracting features for test:")
test_df = extract_features(test_X)
test_df.drop(["test_id","qid1","qid2", "question1", "question2"], axis=1, inplace=True)
#test_df.to_csv("data/nlp_features_test.csv", index=False)

Extracting features for train:


KeyError: 'question1'

In [25]:
train_df.head()

Unnamed: 0,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0.0,0.0,0.99998,0.624992,0.624992,0.35714,0,1,6,11.0,92,68,60,84,0.368421
1,0.249994,0.166664,0.399992,0.399992,0.33333,0.249998,0,1,3,10.5,67,54,56,57,0.208333
2,0.799984,0.399996,0.249994,0.199996,0.499995,0.312498,1,0,6,13.0,69,58,61,61,0.269841
3,0.999967,0.749981,0.99995,0.666644,0.833319,0.833319,1,0,0,6.0,92,88,81,76,0.5
4,0.199996,0.199996,0.333328,0.333328,0.272725,0.249998,0,1,1,11.5,58,53,52,53,0.196721


In [26]:
test_df.head()

Unnamed: 0,test_id,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0.499988,0.399992,0.666644,0.399992,0.57142,0.399996,0,1,3,8.5,71,71,62,62,0.25
1,1,0.99995,0.99995,0.999967,0.749981,0.99998,0.833319,1,1,1,5.5,100,90,90,79,0.517241
2,2,0.999975,0.999975,0.749981,0.599988,0.874989,0.777769,1,0,1,8.5,91,88,88,87,0.7
3,3,0.599988,0.499992,0.999975,0.999975,0.777769,0.699993,1,1,1,9.5,70,61,62,57,0.296296
4,4,0.499988,0.399992,0.0,0.0,0.22222,0.199998,1,0,1,9.5,56,51,62,63,0.380952


In [27]:
train_all_features = pd.concat([train_df,train_non_nlp_features],axis=1)

In [28]:
train_all_features.head()

Unnamed: 0,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,common_neighbor_ratio,common_neighbor_count,min_freq,max_freq
0,0.0,0.0,0.99998,0.624992,0.624992,0.35714,0,1,6,11.0,92,68,60,84,0.368421,0.0,0,2,5
1,0.249994,0.166664,0.399992,0.399992,0.33333,0.249998,0,1,3,10.5,67,54,56,57,0.208333,0.0,0,1,5
2,0.799984,0.399996,0.249994,0.199996,0.499995,0.312498,1,0,6,13.0,69,58,61,61,0.269841,0.0,0,1,4
3,0.999967,0.749981,0.99995,0.666644,0.833319,0.833319,1,0,0,6.0,92,88,81,76,0.5,0.75,3,4,4
4,0.199996,0.199996,0.333328,0.333328,0.272725,0.249998,0,1,1,11.5,58,53,52,53,0.196721,0.0,0,1,10


In [29]:
test_all_features = pd.concat([test_df,test_non_nlp_features],axis=1)

In [30]:
test_all_features.head()

Unnamed: 0,test_id,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,common_neighbor_ratio,common_neighbor_count,min_freq,max_freq
0,0,0.499988,0.399992,0.666644,0.399992,0.57142,0.399996,0,1,3,8.5,71,71,62,62,0.25,0.833333,5,6,6
1,1,0.99995,0.99995,0.999967,0.749981,0.99998,0.833319,1,1,1,5.5,100,90,90,79,0.517241,0.947368,5,19,21
2,2,0.999975,0.999975,0.749981,0.599988,0.874989,0.777769,1,0,1,8.5,91,88,88,87,0.7,0.947368,5,19,19
3,3,0.599988,0.499992,0.999975,0.999975,0.777769,0.699993,1,1,1,9.5,70,61,62,57,0.296296,0.875,5,8,8
4,4,0.499988,0.399992,0.0,0.0,0.22222,0.199998,1,0,1,9.5,56,51,62,63,0.380952,0.785714,5,14,15


In [31]:
#Training the Random Forest
import itertools as itertools
import sklearn as skl
import nltk

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, confusion_matrix

# Cross validation with Grid Search to optimize hyper parameters
cv_sets = KFold(n_splits=10, random_state=0)
scorer = make_scorer(weighted_log_loss, greater_is_better=False)

# varying class_weight to penalize False Positives more 
grid = GridSearchCV(RandomForestClassifier(200, random_state=0),
                        scoring=scorer,
                        cv = cv_sets,
                        param_grid={"class_weight": [{0:100, 1:1}, {0:10, 1:1}, {0:1, 1:1}]})

# Training Random Forest Classifier with full training dataset
grid.fit(train_all_features, train_y)
print("Random Forests Classifier log loss error: {}".format(grid.best_score_))

Random Forests Classifier log loss error: -0.6611652049134563


In [33]:
# generate predictions and submit on kaggle
#test_all_features = test_all_features.drop('test_id',axis=1)
prob_y = grid.predict_proba(test_all_features)

submission = pd.DataFrame()
submission['Id'] = test_X['test_id']
submission['Score'] = prob_y[:,1]
submission.to_csv("submissions/submission_rf_AllFeatures.csv", index=False)