<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It was shown that at work, employees frequently forget to include one or more recipient(s) before sending a message. Conversely, it is common that some recipients of a given message were actually not intended to receive the message. To increase productivity and prevent information leakage, the needs for effective <b>email recipient recommendation</b> systems are thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [2]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
# do not display warnings
import warnings
warnings.filterwarnings("ignore")

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')
from accuracy_measure import *

In [3]:
from load_data import *

# load files
# Data are saved in "data/" directory
path_to_data = '../data/'
training, training_info, test, test_info, y_df = load_data(path_to_data)

# create adress book
# /!\ can take 1-2 min
address_books = create_address_books(training, y_df)

# join train and test files
X_df = join_data(training_info, training)
X_sub_df = join_data(test_info, test)

In [4]:
def KNN(distance,k=30):
    indexes=[]
    for d in distance:
        indexes.append((-d).argsort()[:k])
    return np.array(indexes)

In [47]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import time

cachedStopWords = stopwords.words("english")

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

class TFIDF():
    def __init__(self):
        self.token_dict = {}
        self.tfidf=TfidfVectorizer(tokenizer=None, stop_words='english')

    def fit(self, X):
        for i in range(X.shape[0]):
            text = X.body.values[i]
            lowers = text.lower()
            s=string.punctuation.replace('@','')
            s=s.replace('+','')
            no_punctuation = lowers.translate(str.maketrans('','',s))
            y = " ".join(no_punctuation.split())
            y = ' '.join([word for word in y.split() if word not in cachedStopWords])
            self.token_dict[i] = y

        self.tfidf.fit(self.token_dict.values())


    def fit_transform(self, X):
        start_time = time.time()
        for i in range(X.shape[0]):
            text = X.body.values[i]
            lowers = text.lower()
            s=string.punctuation.replace('@','')
            s=s.replace('+','')

            no_punctuation = lowers.translate(str.maketrans('','',s))
            y = " ".join(no_punctuation.split())
            y = ' '.join([word for word in y.split() if word not in cachedStopWords])

            self.token_dict[i] = y

        X_tfidf = self.tfidf.fit_transform(self.token_dict.values())

        print('performed Tf-Idf in %2i seconds.' % (time.time() - start_time))
        return X_tfidf

    def transform(self, Y):
        start_time = time.time()
        Y_dict={}
        for i in range(Y.shape[0]):
            text = Y.body.values[i]
            lowers = text.lower()
            s=string.punctuation.replace('@','')
            s=s.replace('+','')

            no_punctuation = lowers.translate(str.maketrans('','',s))
            y = " ".join(no_punctuation.split())
            y = ' '.join([word for word in y.split() if word not in cachedStopWords])

            Y_dict[i] = y
        Y_tf_idf=self.tfidf.transform(Y_dict.values())

        print('performed Tf-Idf in %2i seconds.' % (time.time() - start_time))
        return Y_tf_idf

def complete_prediction(k, sender, address_books, res_temp, K=10):
    # k the number of recipients to predict
    k_most = [elt[0] for elt in address_books[sender][:K] if elt not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # sender n'a pas assez de contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most


<h2> Fitting </h2>

In [92]:
%%time 
#import TFIDF_mod
#from TFIDF_mod import TFIDF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import numpy as np
from sklearn.model_selection import ShuffleSplit

# splitting data for cross validation
skf = ShuffleSplit(n_splits=1, test_size=0.25)

print('--------Cross-Validation Module--------')
for train_is, test_is in skf.split(y_df):
    print('\n Beginning of extraction \n ------------')
    ############Extraction + TF-IDF############
    X_train=X_df.ix[train_is]
    y_train = y_df.recipients.loc[train_is].copy()
    X_test=X_df.ix[test_is]
    y_test = y_df.recipients.loc[test_is].copy()
    y_pred=y_test.copy()
    
    tf_idf = TFIDF()
    X_train_TFIDF=tf_idf.fit_transform(X_train)
    X_test_TFIDF=tf_idf.transform(X_test)
    print('Extraction done \n ------------')
    print('\n Beginning of prediction \n ------------')
    ############Prediction############
    sender_test = X_test.sender.unique().tolist()
    clf={}
    count=0
    L=len(sender_test)
    y_pred=y_test.copy()

    for sender in sender_test:
        #Isolation of sender's mails
        sender_train_is = np.array(X_train.sender == sender)
        sender_test_is = np.array(X_test.sender == sender)

        ############Feature extraction############
        #Finding the nearest neighbours of sender's mails
        cos_dist_mat=cosine_similarity(X_train_TFIDF[sender_train_is])-np.identity(sum(sender_train_is))
        cos_dist_mat_test=cosine_similarity(X_test_TFIDF[sender_test_is],X_train_TFIDF[sender_train_is])
        #cos_dist_mat=cosine_similarity(X_TFIDF[sender_train_is],X_TFIDF) #to try later
        KNN_indices=KNN(cos_dist_mat,k=30)
        KNN_indices_test=KNN(cos_dist_mat_test,k=30)

        #Sender number in the address book
        sender_AB={}
        id_to_sender={}
        z=0
        for x in address_books[sender]:
            sender_AB[x[0]]=z
            id_to_sender[z]=x[0]
            z=z+1

        #Creation of the score vector
        recipient_scores=np.zeros((sum(sender_train_is),len(sender_AB)))
        for i in range(sum(sender_train_is)):
            d=np.array(KNN_indices[i])
            neigh_mails=y_train.values[d]#neighbour mails
            z=0
            for n_mail in neigh_mails:
                for rec in n_mail:
                    if rec in sender_AB:
                        j=sender_AB[rec]#index in the score vector
                        recipient_scores[i,j]+=cos_dist_mat[i,d[z]]
                z=z+1

        ############Train############

        #Creation of the labels for the classifier
        recipient_labels=np.zeros((sum(sender_train_is),len(sender_AB)))
        i=0
        for rec_list in y_train[sender_train_is]:
            for rec in rec_list:
                if rec in sender_AB:
                    j=sender_AB[rec]
                    recipient_labels[i,j]=1 
            i=i+1

        #One classifier per recipient
        for rec in sender_AB:
            key=sender+','+rec
            #clf[key]=SVC()
            clf[key]=xgb.XGBClassifier(n_estimators=50)
            clf[key].fit(recipient_scores,recipient_labels.T[sender_AB[rec]])


        ############Test############

        #Creation of the test score vector
        recipient_scores=np.zeros((sum(sender_test_is),len(sender_AB)))
        for i in range(sum(sender_test_is)):
            d=np.array(KNN_indices[i])
            neigh_mails=y_train.values[d]#neighbour mails
            z=0
            for n_mail in neigh_mails:
                for rec in n_mail:
                    if rec in sender_AB:
                        j=sender_AB[rec]#index in the score vector
                        recipient_scores[i,j]+=cos_dist_mat[i,d[z]]
                z=z+1

        recipient_labels=np.zeros((sum(sender_test_is),len(sender_AB))).T

        #Prediction
        pred=0
        for rec in sender_AB:
            key=sender+','+rec
            recipient_labels[sender_AB[rec]]=(clf[key].predict_proba(recipient_scores)).T[1].T
        recipient_labels=recipient_labels.T
        #Storage
        y_test_pred=[]
        for y in recipient_labels:
            y_tmp=[]
            max_rec=(-y).argsort()[:10]
            for rec_id in max_rec:
                y_tmp.append(id_to_sender[rec_id])
            if len(y_tmp) < 10:
                y_tmp.extend(complete_prediction(10-len(y_tmp),sender, address_books, y_tmp))
            y_test_pred.append(y_tmp)
        y_pred.ix[sender_test_is]=y_test_pred
        print(count,'/',L)
        count=count+1

--------Cross-Validation Module--------

 Beginning of extraction 
 ------------
performed Tf-Idf in 17 seconds.
performed Tf-Idf in  5 seconds.
Extraction done 
 ------------

 Beginning of prediction 
 ------------
0 / 125
1 / 125
2 / 125
3 / 125
4 / 125
5 / 125
6 / 125
7 / 125
8 / 125
9 / 125
10 / 125
11 / 125
12 / 125
13 / 125
14 / 125
15 / 125
16 / 125
17 / 125
18 / 125
19 / 125
20 / 125
21 / 125
22 / 125
23 / 125
24 / 125
25 / 125
26 / 125
27 / 125
28 / 125
29 / 125
30 / 125
31 / 125
32 / 125
33 / 125
34 / 125
35 / 125
36 / 125
37 / 125
38 / 125
39 / 125
40 / 125
41 / 125
42 / 125
43 / 125
44 / 125
45 / 125
46 / 125
47 / 125
48 / 125
49 / 125
50 / 125
51 / 125
52 / 125
53 / 125
54 / 125
55 / 125
56 / 125
57 / 125
58 / 125
59 / 125
60 / 125
61 / 125
62 / 125
63 / 125
64 / 125
65 / 125
66 / 125
67 / 125
68 / 125
69 / 125
70 / 125
71 / 125
72 / 125
73 / 125
74 / 125
75 / 125
76 / 125
77 / 125
78 / 125
79 / 125
80 / 125
81 / 125
82 / 125
83 / 125
84 / 125
85 / 125
86 / 125
87 / 125
8

In [93]:
for train_is, test_is in skf.split(y_df):
    
    
    i=0
    accuracy = {}
    accuracy_freq = {}
    accuracy_TOT = 0
    for sender in sender_test:
        print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
        sender_train_is = np.array(X_train.sender == sender)
        sender_test_is = np.array(X_test.sender == sender)
        
        accuracy[sender] = mapk(y_test[sender_test_is], y_pred[sender_test_is])
        
        accuracy_TOT += accuracy[sender]
        print(round(accuracy[sender],2))
print(accuracy_TOT/len(accuracy))


         0 |                    rahil.jafry@enron.com | 0.48
         1 |                      eric.bass@enron.com | 0.18
         2 |                    susan.scott@enron.com | 0.05
         3 |                  karen.buckley@enron.com | 0.25
         4 |                    karen.denne@enron.com | 0.43
         5 |                  john.lavorato@enron.com | 0.1
         6 |                  chris.germany@enron.com | 0.06
         7 |                  michelle.cash@enron.com | 0.13
         8 |                sara.shackleton@enron.com | 0.08
         9 |               hunter.s.shively@enron.com | 0.16
        10 |                     sally.beck@enron.com | 0.08
        11 |                     lynn.blair@enron.com | 0.23
        12 |                    amr.ibrahim@enron.com | 0.52
        13 |                  chris.dorland@enron.com | 0.07
        14 |                richard.shapiro@enron.com | 0.35
        15 |                       c..giron@enron.com | 0.09
        16 |             

In [103]:
y_test[sender_test_is], y_pred[sender_test_is]

(10636                               [tana.jones@enron.com]
 43339    [kenneth.lay@enron.com, james.derrick@enron.co...
 22600    [richard.shapiro@enron.com, eric.thode@enron.c...
 25527                            [jeff.dasovich@enron.com]
 10931    [kevin.montagne@enron.com, tana.jones@enron.co...
 14158                            [steven.j.kean@enron.com]
 22699    [richard.shapiro@enron.com, steven.j.kean@enro...
 22700    [richard.shapiro@enron.com, steven.j.kean@enro...
 13605                            [steven.j.kean@enron.com]
 14116    [richard.shapiro@enron.com, steven.j.kean@enro...
 14071    [james.d.steffes@enron.com, richard.shapiro@en...
 10928    [stephanie.segura@enron.com, irma.fuentes@enro...
 13709                            [steven.j.kean@enron.com]
 13778    [james.d.steffes@enron.com, richard.shapiro@en...
 2850     [richard.shapiro@enron.com, steven.j.kean@enro...
 14079                            [steven.j.kean@enron.com]
 43332    [paula.rieker@enron.com, kenne

In [82]:
from sklearn.metrics.pairwise import cosine_similarity

def complete_prediction(k, sender, address_books, res_temp, K=10):
    # k the number of recipients to predict
    k_most = [elt[0] for elt in address_books[sender][:K] if elt not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # sender n'a pas assez de contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most

class Predictor_2():
    def __init__(self, X, y, sender, address_books,N=10):
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.N = min(N,10)
        self.address_books = address_books

    def predict_2(self, X):
        res = []
        for i in range(X.shape[0]):
            cos = (-cosine_similarity(X[i],self.train)).argsort()[:,:30][0] # 30 mails les plus proches
            if self.N != 0:
                NN_recpt = {}
                for j in range(30):
                    for k in range(len(self.predict[cos[j]])):
                        if self.predict[cos[j]][k] in NN_recpt:
                            NN_recpt[self.predict[cos[j]][k]]+= cosine_similarity(X[i],self.train[cos[j]])
                        else:
                            NN_recpt[self.predict[cos[j]][k]] = cosine_similarity(X[i],self.train[cos[j]])
                res_temp = list(dict(sorted(NN_recpt.items(), key=operator.itemgetter(1), reverse=True)[:10]))
                #res_temp = [self.predict[cos][0][:self.N]] # add the N first recipients of the closest e-mail
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res

In [113]:
from sklearn.model_selection import ShuffleSplit

# splitting data for cross validation
skf = ShuffleSplit(n_splits=1, test_size=0.2)
print('%10s | %40s | %13s | %13s' %('sender_nb', 'sender', 'accuracy KNN','accuracy freq'))
print('%10s + %40s + %13s + %13s' %(10*'-', 40*'-', 13*'-', 13*'-'))

for train_is, test_is in skf.split(y_df):
    
    X_tfidf_train = X_TFIDF[train_is].copy()
    y_train = y_df.recipients.loc[train_is].copy()
    X_tfidf_test = X_TFIDF[test_is].copy()
    y_test = y_df.recipients.loc[test_is].copy()
    X_test_df = X_df.loc[test_is].copy()
    X_train_df = X_df.loc[train_is].copy()
    
    i=0
    pdt = {}
    pdt_freq = {}
    accuracy = {}
    accuracy_freq = {}
    accuracy_TOT = 0
    sender_test = X_test_df.sender.unique().tolist()
    y_pred = np.empty((X_test_df.shape[0],10),dtype=object)
    y_pred_freq = np.empty((X_test_df.shape[0],10),dtype=object)
    for sender in sender_test:
        print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
        # indices corresponding to the sender
        sender_train_is = np.array(X_train_df.sender == sender)
        sender_test_is = np.array(X_test_df.sender == sender)
        
        pdt[sender] = Predictor_2(X_tfidf_train[sender_train_is], y_train[sender_train_is], sender, address_books)
        pdt_freq[sender] = Predictor_2(X_tfidf_train[sender_train_is], y_train[sender_train_is], sender, address_books, N=0)
        y_pred[sender_test_is] = pdt[sender].predict_2(X_tfidf_test[sender_test_is])
        y_pred_freq[sender_test_is] = pdt_freq[sender].predict_2(X_tfidf_test[sender_test_is])
        
        accuracy[sender] = mapk(y_test[sender_test_is], y_pred[sender_test_is])
        accuracy_freq[sender] = mapk(y_test[sender_test_is], y_pred_freq[sender_test_is])
        
        accuracy_TOT += accuracy[sender]
        print('%13s | %13s' %(round(accuracy[sender],2),round(accuracy_freq[sender],2)))


    print('%30s'%(90*'-'))
    print('error TOT = %.2f' %(accuracy_TOT/len(accuracy)))

 sender_nb |                                   sender |  accuracy KNN | accuracy freq
---------- + ---------------------------------------- + ------------- + -------------
