<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It was shown that at work, employees frequently forget to include one or more recipient(s) before sending a message. Conversely, it is common that some recipients of a given message were actually not intended to receive the message. To increase productivity and prevent information leakage, the needs for effective <b>email recipient recommendation</b> systems are thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [1]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
# do not display warnings
import warnings
warnings.filterwarnings("ignore")

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')
from accuracy_measure import *

In [2]:
from load_data import *

# load files
# Data are saved in "data/" directory
path_to_data = '../data/'
training, training_info, test, test_info, y_df = load_data(path_to_data)

# create adress book
# /!\ can take 1-2 min
address_books = create_address_books(training, y_df)

# join train and test files
X_df = join_data(training_info, training)
X_sub_df = join_data(test_info, test)

In [3]:
import TFIDF_mod
from TFIDF_mod import TFIDF

# transform each mail body into tfidf vector
# /!\ function can take 1-2 min to execute
TFIDF = TFIDF()
X_TFIDF = TFIDF.fit_transform(X_df) # resulting shape : (43613, 275988)

In [44]:

from sklearn.metrics.pairwise import cosine_similarity

def complete_prediction(k, sender, address_books, res_temp, K=10):
    # k the number of recipients to predict
    k_most = [elt[0] for elt in address_books[sender][:K] if elt not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # sender n'a pas assez de contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most

class Predictor_2():
    def __init__(self, X, y, sender, address_books,N=10):
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.N = min(N,10)
        self.address_books = address_books

    def predict_2(self, X):
        res = []
        for i in range(X.shape[0]):
            cos = cosine_similarity(X[i],self.train).argsort()[:,:30][0] # 30 mails les plus proches
            if self.N != 0:
                NN_recpt = {}
                for j in range(30):
                    for k in range(len(self.predict[cos[j]])):
                        if self.predict[cos[j]][k] in NN_recpt:
                            NN_recpt[self.predict[cos[j]][k]]+= 1
                        else:
                            NN_recpt[self.predict[cos[j]][k]] = 1
                res_temp = list(dict(sorted(NN_recpt.items(), key=operator.itemgetter(1), reverse=True)[:10]))
                #res_temp = [self.predict[cos][0][:self.N]] # add the N first recipients of the closest e-mail
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res

In [46]:
from sklearn.model_selection import ShuffleSplit

# splitting data for cross validation
skf = ShuffleSplit(n_splits=1, test_size=0.2)
print('%10s | %40s | %13s | %13s' %('sender_nb', 'sender', 'accuracy KNN','accuracy freq'))
print('%10s + %40s + %13s + %13s' %(10*'-', 40*'-', 13*'-', 13*'-'))

for train_is, test_is in skf.split(y_df):
    
    X_tfidf_train = X_TFIDF[train_is].copy()
    y_train = y_df.recipients.loc[train_is].copy()
    X_tfidf_test = X_TFIDF[test_is].copy()
    y_test = y_df.recipients.loc[test_is].copy()
    X_test_df = X_df.loc[test_is].copy()
    X_train_df = X_df.loc[train_is].copy()
    
    i=0
    pdt = {}
    pdt_freq = {}
    accuracy = {}
    accuracy_freq = {}
    accuracy_TOT = 0
    sender_test = X_test_df.sender.unique().tolist()
    y_pred = np.empty((X_test_df.shape[0],10),dtype=object)
    y_pred_freq = np.empty((X_test_df.shape[0],10),dtype=object)

    for sender in sender_test:
        print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
        # indices corresponding to the sender
        sender_train_is = np.array(X_train_df.sender == sender)
        sender_test_is = np.array(X_test_df.sender == sender)
        
        pdt[sender] = Predictor_2(X_tfidf_train[sender_train_is], y_train[sender_train_is], sender, address_books)
        pdt_freq[sender] = Predictor_2(X_tfidf_train[sender_train_is], y_train[sender_train_is], sender, address_books, N=0)
        y_pred[sender_test_is] = pdt[sender].predict_2(X_tfidf_test[sender_test_is])
        y_pred_freq[sender_test_is] = pdt_freq[sender].predict_2(X_tfidf_test[sender_test_is])
        
        accuracy[sender] = mapk(y_test[sender_test_is], y_pred[sender_test_is])
        accuracy_freq[sender] = mapk(y_test[sender_test_is], y_pred_freq[sender_test_is])
        
        accuracy_TOT += accuracy[sender]
        print('%13s | %13s' %(round(accuracy[sender],2),round(accuracy_freq[sender],2)))

    print('%30s'%(90*'-'))
    print('error TOT = %.2f' %(accuracy_TOT/len(accuracy)))

 sender_nb |                                   sender |  accuracy KNN | accuracy freq
---------- + ---------------------------------------- + ------------- + -------------
         0 |                  michelle.cash@enron.com |          0.05 |          0.18
         1 |                     lynn.blair@enron.com |          0.16 |          0.27
         2 |                     sally.beck@enron.com |          0.02 |          0.14
         3 |                       c..giron@enron.com |          0.01 |          0.19
         4 |                  dutch.quigley@enron.com |          0.09 |          0.32
         5 |               larry.f.campbell@enron.com |          0.01 |          0.16
         6 |                james.d.steffes@enron.com |           0.1 |          0.23
         7 |                sara.shackleton@enron.com |          0.06 |          0.14
         8 |                  chris.germany@enron.com |          0.04 |          0.12
         9 |                      eric.bass@enron.com 