<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It has been shown that, at work, employees frequently forget to include one or more recipients before sending a message. Conversely, it is common for some recipients of a given message not to have been intended to receive it. To increase productivity and prevent information leakage, the need for effective <b>email recipient recommendation</b> systems is thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [2]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
# do not display warnings
import warnings
warnings.filterwarnings("ignore")

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')
from accuracy_measure import *

In [3]:
from load_data import *

# Load the raw training / test tables and the training labels.
# The data files are read from the directory given by path_to_data.
path_to_data = '../data/'
training, training_info, test, test_info, y_df = load_data(path_to_data)

# Build the per-sender address book from the training mails and their labels.
# /!\ can take 1-2 min
address_books = create_address_books(training, y_df)

# Join the "info" tables with the sender tables, for the training set (X_df)
# and for the submission set (X_sub_df).
X_df = join_data(training_info, training)
X_sub_df = join_data(test_info, test)

<h2> TF-IDF </h2>

In [47]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import time

# English stop-word list from the NLTK corpus, loaded once and reused when cleaning mail bodies.
cachedStopWords = stopwords.words("english")

def stem_tokens(tokens, stemmer):
    """Return the list of stems of ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem, processed in order.
    stemmer : object
        Any object exposing a ``stem(token)`` method.

    Returns
    -------
    list
        ``stemmer.stem(token)`` for every token, in the input order.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into word tokens with NLTK and Porter-stem each token.

    Returns the list of stems in token order.
    """
    porter = PorterStemmer()
    return stem_tokens(nltk.word_tokenize(text), porter)

class TFIDF():
    """Clean mail bodies and vectorise them with a ``TfidfVectorizer``.

    Every method expects an object ``X`` exposing ``X.body.values`` (the raw
    mail bodies as strings) and ``X.shape[0]`` (the number of mails), e.g. a
    pandas DataFrame with a ``body`` column.

    Cleaning, applied identically in ``fit``, ``fit_transform`` and
    ``transform``: lower-case the text, delete every punctuation character
    except ``@`` and ``+``, collapse whitespace, and drop the words listed in
    ``cachedStopWords``.

    Attributes
    ----------
    token_dict : dict
        Maps the row position to the cleaned body of the corpus used by the
        most recent ``fit`` / ``fit_transform`` call.
    tfidf : TfidfVectorizer
        The underlying vectoriser.
    """
    def __init__(self):
        self.token_dict = {}
        self.tfidf=TfidfVectorizer(tokenizer=None, stop_words='english')

    def _clean_bodies(self, X):
        """Return the cleaned body of every row of ``X`` as a list of strings."""
        # Built once per call instead of once per document.
        removed_chars = string.punctuation.replace('@', '').replace('+', '')
        table = str.maketrans('', '', removed_chars)
        stop_words = set(cachedStopWords)  # set membership instead of list scan

        bodies = X.body.values
        cleaned = []
        for i in range(X.shape[0]):
            words = bodies[i].lower().translate(table).split()
            cleaned.append(' '.join(word for word in words if word not in stop_words))
        return cleaned

    def _store_corpus(self, X):
        """Replace ``token_dict`` with the cleaned bodies of ``X``.

        The dictionary is rebuilt from scratch so that documents from an
        earlier, larger corpus cannot leak into a later fit.
        """
        self.token_dict = dict(enumerate(self._clean_bodies(X)))

    def fit(self, X):
        """Learn the vocabulary and IDF weights from the bodies of ``X``."""
        self._store_corpus(X)
        self.tfidf.fit(self.token_dict.values())

    def fit_transform(self, X):
        """Fit on the bodies of ``X`` and return their TF-IDF matrix.

        Prints the elapsed time in seconds.
        """
        start_time = time.time()
        self._store_corpus(X)
        X_tfidf = self.tfidf.fit_transform(self.token_dict.values())

        print('performed Tf-Idf in %2i seconds.' % (time.time() - start_time))
        return X_tfidf

    def transform(self, Y):
        """Return the TF-IDF matrix of the bodies of ``Y`` using the fitted vectoriser.

        Does not modify ``token_dict``. Prints the elapsed time in seconds.
        """
        start_time = time.time()
        Y_tf_idf = self.tfidf.transform(self._clean_bodies(Y))

        print('performed Tf-Idf in %2i seconds.' % (time.time() - start_time))
        return Y_tf_idf

<h3> Useful functions </h3>

In [150]:
from sklearn.metrics.pairwise import cosine_similarity

#Score vector creation
def score_vector(KNN_indices,sender_index,sender_AB,y,cos_dist_mat):
    """Accumulate, per mail and per known recipient, the similarity of the neighbour mails.

    Parameters
    ----------
    KNN_indices : sequence of index arrays
        ``KNN_indices[i]`` holds the neighbour indices of mail ``i``.
    sender_index : boolean sequence
        Its number of true entries gives the number of mails (output rows).
    sender_AB : dict
        Maps a recipient address to its column in the output.
    y : pandas.Series
        Recipient lists. ``y.values`` is indexed with the neighbour indices,
        so it must be positionally aligned with the columns of
        ``cos_dist_mat``.
    cos_dist_mat : 2-D array
        ``cos_dist_mat[i, n]`` is the weight added for neighbour ``n`` of mail ``i``.

    Returns
    -------
    numpy.ndarray of shape ``(sum(sender_index), len(sender_AB) + 1)``
        Entry ``[i, j]`` is the sum of ``cos_dist_mat[i, n]`` over the
        neighbours ``n`` whose recipient list contains the address mapped to
        ``j``. The last column is never written here and stays at zero.
    """
    n_rows = sum(sender_index)
    recipient_scores = np.zeros((n_rows, len(sender_AB) + 1))
    for row in range(n_rows):
        neighbours = np.array(KNN_indices[row])
        neighbour_recipients = y.values[neighbours]
        for neighbour, recipients in zip(neighbours, neighbour_recipients):
            for rec in recipients:
                if rec in sender_AB:
                    recipient_scores[row, sender_AB[rec]] += cos_dist_mat[row, neighbour]
    return recipient_scores

#Label creation (from recipient addresses to 0/1 vector)
def create_labels(sender_train_is,sender_AB,y_train):
    """Encode the recipient lists of the selected mails as a 0/1 matrix.

    Parameters
    ----------
    sender_train_is : boolean array
        Mask selecting the mails of interest in ``y_train``.
    sender_AB : dict
        Maps a recipient address to its column in the output.
    y_train : pandas.Series
        Recipient list of every mail.

    Returns
    -------
    numpy.ndarray of shape ``(sum(sender_train_is), len(sender_AB))``
        Entry ``[i, j]`` is 1 when the i-th selected mail was sent to the
        address mapped to column ``j``, else 0. Addresses missing from
        ``sender_AB`` are ignored.
    """
    n_selected = sum(sender_train_is)
    recipient_labels = np.zeros((n_selected, len(sender_AB)))
    for row, rec_list in enumerate(y_train[sender_train_is]):
        for rec in rec_list:
            if rec in sender_AB:
                recipient_labels[row, sender_AB[rec]] = 1
    return recipient_labels

#Complete prediction when <10
def complete_prediction(k, sender, address_books, res_temp, K=10):
    """Return ``k`` extra recipients for ``sender`` to complete a short prediction.

    Parameters
    ----------
    k : int
        Number of recipients to return.
    sender : hashable
        Key of the sender in ``address_books``.
    address_books : dict
        Maps a sender to a sequence of entries whose first item is a
        recipient address.
    res_temp : container
        Addresses already predicted; they are not proposed again.
    K : int, default 10
        Only the first ``K`` address-book entries are considered.

    Returns
    -------
    list
        Exactly ``k`` items: addresses taken in address-book order, padded
        with ``0`` when the sender does not have enough remaining contacts.
    """
    # Compare the address (elt[0]), not the whole address-book entry, with the
    # already-predicted addresses; otherwise nothing is ever filtered out.
    k_most = [elt[0] for elt in address_books[sender][:K] if elt[0] not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # the sender does not have enough contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most

#Computes the KNN on the distance matrix
def KNN(distance,k=30):
    """Return, for every row of ``distance``, the indices of its ``k`` largest values.

    Rows are sorted by decreasing value, so the first index of each output
    row points at the largest entry. Rows shorter than ``k`` return all of
    their indices.
    """
    return np.array([(-row).argsort()[:k] for row in distance])

<h2> Fitting </h2>

In [133]:
%%time 
#import TFIDF_mod
#from TFIDF_mod import TFIDF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import numpy as np
from sklearn.model_selection import ShuffleSplit

# splitting data for cross validation
skf = ShuffleSplit(n_splits=1, test_size=0.25)

print('--------Cross-Validation Module--------')
for train_is, test_is in skf.split(y_df):
    print('\n Beginning of extraction \n ------------')
    ############Extraction + TF-IDF############
    X_train=X_df.ix[train_is]
    y_train = y_df.recipients.loc[train_is].copy()
    X_test=X_df.ix[test_is]
    y_test = y_df.recipients.loc[test_is].copy()
    y_pred=y_test.copy()
    
    tf_idf = TFIDF()
    X_train_TFIDF=tf_idf.fit_transform(X_train)
    X_test_TFIDF=tf_idf.transform(X_test)
    
    print('Extraction done \n ------------')
    print('\n Beginning of prediction \n ------------')
    ############Prediction############
    sender_test = X_test.sender.unique().tolist()
    clf={}
    count=0
    L=len(sender_test)
    y_pred=y_test.copy()
    
    tot_rec_mails={}
    for sender in sender_test:
        for x in address_books[sender]:
            if x[0] in tot_rec_mails:
                tot_rec_mails[x[0]]=tot_rec_mails[x[0]]+x[1]
            else:
                tot_rec_mails[x[0]]=x[1]
    for sender in sender_test:
        #Isolation of sender's mails
        sender_train_is = np.array(X_train.sender == sender)
        sender_test_is = np.array(X_test.sender == sender)

        ############Feature extraction############
        
        #Finding the nearest neighbours of sender's mails
        cos_dist_mat=cosine_similarity(X_train_TFIDF[sender_train_is])-np.identity(sum(sender_train_is))
        cos_dist_mat_test=cosine_similarity(X_test_TFIDF[sender_test_is],X_train_TFIDF[sender_train_is])
        #cos_dist_mat=cosine_similarity(X_TFIDF[sender_train_is],X_TFIDF) #to try later
        
        #KNN
        KNN_indices=KNN(cos_dist_mat,k=50)
        KNN_indices_test=KNN(cos_dist_mat_test,k=50)

        #Sender number in the address book
        sender_AB={}
        id_to_sender={}
        sent_frequency={}
        rec_frequency={}
        n_mails=float(sum(sender_train_is))
        z=0
        for x in address_books[sender]:
            sender_AB[x[0]]=z
            id_to_sender[z]=x[0]
            sent_frequency[x[0]]=x[1]/n_mails
            rec_frequency[x[0]]=float(x[1])/tot_rec_mails[x[0]]
            z=z+1

        #Creation of the score vector
        recipient_scores=score_vector(KNN_indices,sender_train_is,sender_AB,y_train,cos_dist_mat) 

        
        ############Train############

        #Creation of the labels for the classifier
        recipient_labels=create_labels(sender_train_is,sender_AB,y_train)

        #One classifier per recipient
        for rec in sender_AB:
            #Adding frequency feature
            #recipient_scores.T[len(sender_AB)]=sent_frequency[rec]
            x_fit=np.array([[x, sent_frequency[rec], rec_frequency[rec]] for x in recipient_scores.T[sender_AB[rec]]])
            key=sender+','+rec
            #clf[key]=SVC()
            clf[key]=xgb.XGBClassifier(n_estimators=10)
            clf[key].fit(x_fit,recipient_labels.T[sender_AB[rec]])


        ############Test############

        #Creation of the test score vector
        recipient_scores=score_vector(KNN_indices_test,sender_test_is,sender_AB,y_train,cos_dist_mat_test) 
        recipient_labels=np.zeros((sum(sender_test_is),len(sender_AB))).T

        #Prediction
        pred=0
        for rec in sender_AB:
            #Adding frequency feature
            recipient_scores.T[len(sender_AB)]=sent_frequency[rec]
            x_fit=np.array([[x, sent_frequency[rec], rec_frequency[rec]] for x in recipient_scores.T[sender_AB[rec]]])
            #Predict
            key=sender+','+rec
            recipient_labels[sender_AB[rec]]=(clf[key].predict_proba(x_fit)).T[1].T
        recipient_labels=recipient_labels.T
        #Storage
        y_test_pred=[]
        for y in recipient_labels:
            y_tmp=[]
            max_rec=(-y).argsort()[:10]
            for rec_id in max_rec:
                y_tmp.append(id_to_sender[rec_id])
            if len(y_tmp) < 10:
                y_tmp.extend(complete_prediction(10-len(y_tmp),sender, address_books, y_tmp))
            y_test_pred.append(y_tmp)
        y_pred.ix[sender_test_is]=y_test_pred
        
        if int((count*10)/L)>int(((count-1)*10)/L):
            print(round(float(count*100)/L))
        count=count+1
    print('End of prediction')
    print('------------')

for train_is, test_is in skf.split(y_df):
    
    
    i=0
    accuracy = {}
    accuracy_freq = {}
    accuracy_TOT = 0
    for sender in sender_test:
        print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
        sender_train_is = np.array(X_train.sender == sender)
        sender_test_is = np.array(X_test.sender == sender)
        accuracy[sender] = mapk(y_test[sender_test_is], y_pred[sender_test_is])
        accuracy_TOT += accuracy[sender]
        print(round(accuracy[sender],2))
print(accuracy_TOT/len(accuracy))

--------Cross-Validation Module--------

 Beginning of extraction 
 ------------
performed Tf-Idf in 16 seconds.
performed Tf-Idf in  5 seconds.
Extraction done 
 ------------

 Beginning of prediction 
 ------------
10
20
30
40
50
60
70
80
90
End of prediction
------------
         0 |                   beth.cherry@enform.com | 0.89
         1 |                    susan.scott@enron.com | 0.12
         2 |                      jean.mrha@enron.com | 0.61
         3 |                 stanley.horton@enron.com | 0.16
         4 |                sara.shackleton@enron.com | 0.14
         5 |                     lynn.blair@enron.com | 0.27
         6 |                  chris.dorland@enron.com | 0.17
         7 |                    mark.palmer@enron.com | 0.52
         8 |                     tim.belden@enron.com | 0.17
         9 |                    marie.heard@enron.com | 0.17
        10 |               michael.tribolet@enron.com | 0.52
        11 |                 phillip.m.love@enron.com 

<h2> Submission </h2>

In [156]:
%%time 
#import TFIDF_mod
#from TFIDF_mod import TFIDF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import numpy as np
from sklearn.model_selection import ShuffleSplit

# splitting data for cross validation
skf = ShuffleSplit(n_splits=1, test_size=0.25)

print('--------Cross-Validation Module--------')
for train_is, test_is in skf.split(y_df):
    print('\n Beginning of extraction \n ------------')
    ############Extraction + TF-IDF############
    X_train=X_df.copy()
    y_train = y_df.recipients.copy()
    X_test=X_sub_df
    #y_test = y_df.recipients.loc[test_is].copy()
    y_pred=pd.Series([[''] for x in X_sub_df.sender])
    
    tf_idf = TFIDF()
    X_train_TFIDF=tf_idf.fit_transform(X_train)
    X_test_TFIDF=tf_idf.transform(X_test)
    
    print('Extraction done \n ------------')
    print('\n Beginning of prediction \n ------------')
    ############Prediction############
    sender_test = X_test.sender.unique().tolist()
    clf={}
    count=0
    L=len(sender_test)
    
    tot_rec_mails={}
    for sender in sender_test:
        for x in address_books[sender]:
            if x[0] in tot_rec_mails:
                tot_rec_mails[x[0]]=tot_rec_mails[x[0]]+x[1]
            else:
                tot_rec_mails[x[0]]=x[1]
    for sender in sender_test:
        #Isolation of sender's mails
        sender_train_is = np.array(X_train.sender == sender)
        sender_test_is = np.array(X_test.sender == sender)

        ############Feature extraction############
        
        #Finding the nearest neighbours of sender's mails
        cos_dist_mat=cosine_similarity(X_train_TFIDF[sender_train_is])-np.identity(sum(sender_train_is))
        cos_dist_mat_test=cosine_similarity(X_test_TFIDF[sender_test_is],X_train_TFIDF[sender_train_is])
        #cos_dist_mat=cosine_similarity(X_TFIDF[sender_train_is],X_TFIDF) #to try later
        
        #KNN
        KNN_indices=KNN(cos_dist_mat,k=50)
        KNN_indices_test=KNN(cos_dist_mat_test,k=50)

        #Sender number in the address book
        sender_AB={}
        id_to_sender={}
        sent_frequency={}
        rec_frequency={}
        n_mails=float(sum(sender_train_is))
        z=0
        for x in address_books[sender]:
            sender_AB[x[0]]=z
            id_to_sender[z]=x[0]
            sent_frequency[x[0]]=x[1]/n_mails
            rec_frequency[x[0]]=float(x[1])/tot_rec_mails[x[0]]
            z=z+1

        #Creation of the score vector
        recipient_scores=score_vector(KNN_indices,sender_train_is,sender_AB,y_train,cos_dist_mat) 

        
        ############Train############

        #Creation of the labels for the classifier
        recipient_labels=create_labels(sender_train_is,sender_AB,y_train)

        #One classifier per recipient
        for rec in sender_AB:
            #Adding frequency feature
            #recipient_scores.T[len(sender_AB)]=sent_frequency[rec]
            x_fit=np.array([[x, sent_frequency[rec], rec_frequency[rec]] for x in recipient_scores.T[sender_AB[rec]]])
            key=sender+','+rec
            #clf[key]=SVC()
            clf[key]=xgb.XGBClassifier(n_estimators=10)
            clf[key].fit(x_fit,recipient_labels.T[sender_AB[rec]])


        ############Test############

        #Creation of the test score vector
        recipient_scores=score_vector(KNN_indices_test,sender_test_is,sender_AB,y_train,cos_dist_mat_test) 
        recipient_labels=np.zeros((sum(sender_test_is),len(sender_AB))).T

        #Prediction
        pred=0
        for rec in sender_AB:
            #Adding frequency feature
            recipient_scores.T[len(sender_AB)]=sent_frequency[rec]
            x_fit=np.array([[x, sent_frequency[rec], rec_frequency[rec]] for x in recipient_scores.T[sender_AB[rec]]])
            #Predict
            key=sender+','+rec
            recipient_labels[sender_AB[rec]]=(clf[key].predict_proba(x_fit)).T[1].T
        recipient_labels=recipient_labels.T
        #Storage
        y_test_pred=[]
        for y in recipient_labels:
            y_tmp=[]
            max_rec=(-y).argsort()[:10]
            for rec_id in max_rec:
                y_tmp.append(id_to_sender[rec_id])
            if len(y_tmp) < 10:
                y_tmp.extend(complete_prediction(10-len(y_tmp),sender, address_books, y_tmp))
            y_test_pred.append(y_tmp)
        y_pred.ix[sender_test_is]=y_test_pred
        
        if int((count*10)/L)>int(((count-1)*10)/L):
            print(round(float(count*100)/L),'%')
        count=count+1
    print('End of prediction')
    print('------------')

--------Cross-Validation Module--------

 Beginning of extraction 
 ------------
performed Tf-Idf in 25 seconds.
performed Tf-Idf in  1 seconds.
Extraction done 
 ------------

 Beginning of prediction 
 ------------
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %
90 %
End of prediction
------------
CPU times: user 7min 11s, sys: 7.5 s, total: 7min 19s
Wall time: 2min 34s


In [160]:
def create_submission(y_pred,X_test_df,path='./pred_KNN.txt'):
    """Write the predicted recipients to a submission file.

    Parameters
    ----------
    y_pred : sequence
        ``y_pred[i]`` is the iterable of recipients predicted for the i-th
        row of ``X_test_df``.
    X_test_df : pandas.DataFrame
        Test mails; the first column of ``X_test_df.values`` is used as the
        message id.
    path : str, default './pred_KNN.txt'
        Destination file; it is overwritten if it already exists.

    The file starts with the header ``mid,recipients``, followed by one line
    per distinct message id of the form ``<mid>, <rec1> <rec2> ...``. When a
    message id appears several times, the last prediction wins.
    """
    x_test = X_test_df.values
    predictions_towrite = {x_test[i][0]: y_pred[i] for i in range(len(y_pred))}

    with open(path, 'w') as my_file:
        my_file.write('mid,recipients' + '\n')
        for mid, preds in predictions_towrite.items():
            # Every recipient is preceded by a space, including the first one.
            row = str(mid) + "," + "".join(" " + str(rec) for rec in preds)
            my_file.write(row + '\n')

In [161]:
# Write the predictions held in y_pred for the mails of X_sub_df to the submission file.
create_submission(y_pred,X_sub_df)