Importing data 

In [11]:
import numpy as np
import pandas as pd
## edit the link to be shorter and accessible from root folder
train = pd.read_csv("data/train.csv",header=None,names=['PairNb','Q1id','Q2id','Q1RT','Q2RT','label'])
test = pd.read_csv("data/test.csv",header=None,names=['PairNb','Q1id','Q2id','Q1RT','Q2RT','label'])

Remarque : Les données fournies ont l'air complètes et il y a pas l'air d'y avoir des fautes d'orthographe.

Implementing a first method : Random forest with basic features

In [5]:
#Defining the metric that they want us to use : weighted_log_loss
def weighted_log_loss(y_true, y_pred):
    a = 0.165/0.37
    b = (1-0.165)/(1-0.37)
    score = a*y_true*np.log(y_pred+0.00001) + b*(1.0 - y_true)*np.log(1.0 - y_pred+0.00001)
    return -np.mean(score)

In [6]:
#Getting the raw features that we will be working on :
train_X, train_y = train.iloc[:,:-1], train.iloc[:,-1]
test_X = test.iloc[:,:-1]

In [7]:
#Defining functions that will be used to compute the "basic" features
#Premier essai de feature engineering:
def words(qn):
    """
    Returns the number of words in a question
    """
    return len(qn.split())

def avg_word_length(qn):
    """
    Tabulates the average word length in a question
    """
    words = qn.split()
    return sum([len(word) for word in words])/len(words) if len(words) != 0 else 0

def char_count(qn):
    """
    Counts the total number of letters in a question
    """
    return sum([len(word) for word in qn.split()])

def caps_count(qn):
    """
    Counts the number of capital letters in a question, 
    only checking the first word of each sentence
    """
    words = qn.split()
    return sum([1 for word in words if word[0].isupper()])

# Jaccard Similiarity Coefficient
# Obtain the Jaccard Similiarity Coeefficient between 2 questions
# (X intersect Y) / (X union Y)
# Bag Of Words, which is the list of unique words in the document, with no frequency count.
def jaccard_index(row):
    """
    Obtain the Jaccard Similarity Coefficient which essentially is represented
    by: (X intersect Y) / (X union Y). Done using the Bag Of Words, 
    which is the list of unique words in the document, with no frequency count involved. 
    
    Input
    ------
    row: the row with both questions 1 and 2.
    
    Returns
    -------
    index: the Jaccard index (AKA Similarity Coefficient)
    """
    q1 = set(row['Q1RT'].split())
    q2 = set(row['Q2RT'].split())
    index = 1.0
    index = (float(len(q1.intersection(q2))) 
             / len(q1.union(q2)))
    return index
    

def levenshtein(dataframe):
    """
    Obtain the Levensthein distance between the two questions.
    Levensthein distance is another similarity index like Jaccard. 
    """
    return leven.distance(dataframe['Q1RT'], dataframe['Q2RT'])

In [8]:
#Defining the function used to calculating the classic features and doing so :
def feature_engineering_classic(df, which):
    if which == "train":
        qns_set = df.iloc[:,3:5] 
        q1 = qns_set.iloc[:,0]
        q2 = qns_set.iloc[:,1]
    elif which == "test":
        qns_set = df.iloc[:,3:5]
        q1 = qns_set.iloc[:,0]
        q2 = qns_set.iloc[:,1]    

    # Creating new features using Feature Engineering
    word_len_diff = abs(q1.apply(words) - q2.apply(words))
    avg_word_len_diff = abs(q1.apply(avg_word_length) - q2.apply(avg_word_length))
    char_diff = abs(q1.apply(char_count) - q2.apply(char_count))
    caps_diff = abs(q1.apply(caps_count) - q2.apply(caps_count))
    jaccard = qns_set.apply(jaccard_index, axis=1)
    #leven_dist = qns_set.apply(levenshtein, axis=1)

    # Creating a new dataframe with values of new feature
    classic_feat = pd.DataFrame({'word_len_diff': word_len_diff, 'avg_word_len_diff': avg_word_len_diff, 
                                 'char_diff': char_diff, 'caps_diff': caps_diff, 'jaccard': jaccard}) 
                                 #'leven_dist': leven_dist})
    return classic_feat

# classic features for Random Forests classifier
classic_train_X = feature_engineering_classic(train_X, "train")
classic_test_X = feature_engineering_classic(test_X, "test")

In [12]:
#Training the Random Forest
import itertools as itertools
import sklearn as skl
import nltk

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, confusion_matrix

# Cross validation with Grid Search to optimize hyper parameters
cv_sets = KFold(n_splits=10, random_state=0)
scorer = make_scorer(weighted_log_loss, greater_is_better=False)

# varying class_weight to penalize False Positives more 
grid = GridSearchCV(RandomForestClassifier(200, random_state=0),
                        scoring=scorer,
                        cv = cv_sets,
                        param_grid={"class_weight": [{0:100, 1:1}, {0:10, 1:1}, {0:1, 1:1}]})

# Training Random Forest Classifier with full training dataset
grid.fit(classic_train_X, train_y)
print("Random Forests Classifier log loss error: {}".format(grid.best_score_))

Random Forests Classifier log loss error: -3.4220341932483875


In [None]:
# generate predictions and submit on kaggle
prob_y = grid.predict_proba(classic_test_X)

submission = pd.DataFrame()
submission['Id'] = test_X['PairNb']
submission['Score'] = prob_y[:,1]
submission.to_csv("submissions/submission_rf.csv", index=False)
#C'est de la merde, ça fait un score de 0,8 alors que la baseline est à 0.6...