<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It was shown that at work, employees frequently forget to include one or more recipient(s) before sending a message. Conversely, it is common that some recipients of a given message were actually not intended to receive the message. To increase productivity and prevent information leakage, the needs for effective <b>email recipient recommendation</b> systems are thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [1]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore") # do not display warnings

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')

# path to data
path_to_data = 'data/'

In [2]:
from load_data import *
from accuracy_measure import *

X_df, X_sub_df, y_df = load_data(path_to_data)

<h2>Predictors</h2>

In [3]:
# exctract 10 most frequent recipients names from address book
#def names(address_books):
#    sender_names = {}
#    for rec, value in address_books:
#        if '.' in rec[:rec.find('@')]:
#            found = rec[:rec.find('.')].lower()
#            if len(found) > 2:
#                if not found in sender_names:
#                    sender_names[found] = rec
#    return sender_names

class Predictor_NAMES():
    def __init__(self, X, X_tfidf, y, sender, address_books,N=10):
        self.body = X
        self.train = X_tfidf
        self.predict = y.values
        self.sender = sender
        self.address_books = address_books
        self.N = min(N,10)
        self.k_most = [elt[0] for elt in self.address_books[self.sender][:10]]
        
    def prediction(self, X):
        res = []
        for i in range(X.shape[0]):
            if self.N != 0:
                potential = {}
                for rec in self.k_most:
                    potential[rec] = 1
                # exctract surnames in the mail and increase probability if recorded in recipients
                surnames = names(self.address_books[self.sender])
                for surname in surnames.keys():
                    if surname in str(self.body[i]).lower():
                        if surnames[surname] in potential:
                            potential[surnames[surname]]*=1.5
                res_temp = [k for k in sorted(potential, key=potential.get, reverse=True)][:10]
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res

In [4]:
def Mixed_Predictor(Y_preds, acc):
    res = []
    for i in range(Y_preds[0].shape[0]): # loop trough all email predictions for this sender
        rec = {} # create dictionary of recipients
        for y_pred, acc_ in zip(Y_preds,acc): # loop trough all predictions
            for rec_ in y_pred[i]:
                if rec_ in rec:
                    rec[rec_] += y_pred[i].tolist().index(rec_)*acc_/sum(acc)
                else:
                    rec[rec_] = y_pred[i].tolist().index(rec_)*acc_/sum(acc)
        res.append([k for k in sorted(rec, key=rec.get, reverse=False)][:10])
        if len(res[i]) < 10: # less than 10 recipients in all predictions
            res[i].extend([0] * (10-len(res[i])))
    return res

<h2>Cross Validation Module</h2>

In [5]:
import TFIDF_mod
from TFIDF_mod import TFIDF, LDA
from split import split
import Predictor
from Predictor import *#Predictor_TFIDF, Predictor_KNN, Predictor_CTFIDF
from proper_name import *

train_is, test_is = split(X_df)
# print('train size : ',train_is.shape[0],'\ntest size  : ',test_is.shape[0],'\n')
print('%5s | %40s | %12s | %12s | %12s | %12s' %('nb', 'sender', 'acc FREQ','acc C-TFIDF','acc KNN','acc mixed'))
print('%5s + %40s + %12s + %12s + %12s + %12s' %(5*'-', 40*'-', 12*'-', 12*'-', 12*'-', 12*'-'))


X_tfidf_train = {}
X_tfidf_test = {}
X_lda_test = {}
X_lda_train = {}

X_test_df = X_df.loc[test_is].copy()
X_train_df = X_df.loc[train_is].copy()

y_train = y_df.recipients.loc[train_is].copy()
y_test = y_df.recipients.loc[test_is].copy()


address_books_train = create_address_books(X_df.loc[train_is], y_df.loc[train_is])
recipient_surnames = names(address_books_train)

pdt = {}
pdt_freq = {}
pdt_KNN = {}
pdt_KNN_LDA = {}
pdt_NAMES = {}

accuracy = {}
accuracy_freq = {}
accuracy_KNN = {}
accuracy_KNN_LDA = {}
accuracy_NAMES = {}
accuracy_mixed = {}

TFIDF_dict = {}
LDA_dict = {}
pred_select = {}

sender_test = X_test_df.sender.unique().tolist()
y_pred = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_freq = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_KNN = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_KNN_LDA = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_NAMES = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_mixed = np.empty((X_test_df.shape[0],10),dtype=object)

for sender in sender_test:
    print('%5s | %40s | ' %(sender_test.index(sender), sender), end='')
    # indices corresponding to the sender
    sender_train_is = np.array(X_train_df.sender == sender)
    sender_test_is = np.array(X_test_df.sender == sender)

    # transform each mail body into tfidf vector
    # not given all corpus but only sender corpus
    TFIDF_dict[sender] = TFIDF()
    LDA_dict[sender] = LDA()

    X_tfidf_train[sender] = TFIDF_dict[sender].fit_transform(X_train_df[X_train_df.sender == sender])
    X_tfidf_test[sender] = TFIDF_dict[sender].transform(X_test_df[X_test_df.sender == sender])
    X_lda_train[sender] = LDA_dict[sender].fit_transform(X_train_df[X_train_df.sender == sender])
    X_lda_test[sender] = LDA_dict[sender].transform(X_test_df[X_test_df.sender == sender])
    
    pdt[sender] = Predictor_CTFIDF(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train)
    pdt_freq[sender] = Predictor_CTFIDF(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train, N=0)
    pdt_KNN[sender] = Predictor_KNN(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train, recipient_surnames)
    # pdt_KNN_LDA[sender] = Predictor_KNN(X_lda_train[sender], y_train[sender_train_is], sender, address_books_train)
    # pdt_NAMES[sender] = Predictor_NAMES(X_train_df[X_train_df.sender == sender].body.values, X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train)
    
    y_pred[sender_test_is] = pdt[sender].prediction(X_tfidf_test[sender])
    y_pred_freq[sender_test_is] = pdt_freq[sender].prediction(X_tfidf_test[sender])
    y_pred_KNN[sender_test_is] = pdt_KNN[sender].prediction(X_tfidf_test[sender], X_test_df[sender_test_is])
    # y_pred_KNN_LDA[sender_test_is] = pdt_KNN_LDA[sender].prediction(X_lda_test[sender])
    # y_pred_NAMES[sender_test_is] = pdt_NAMES[sender].prediction(X_tfidf_test[sender])

    accuracy[sender] = mapk(y_test[sender_test_is].values, y_pred[sender_test_is])
    accuracy_freq[sender] = mapk(y_test[sender_test_is].values, y_pred_freq[sender_test_is])
    accuracy_KNN[sender] = mapk(y_test[sender_test_is].values, y_pred_KNN[sender_test_is])
    # accuracy_KNN_LDA[sender] = mapk(y_test[sender_test_is].values, y_pred_KNN_LDA[sender_test_is])
    # accuracy_NAMES[sender] = mapk(y_test[sender_test_is].values, y_pred_NAMES[sender_test_is])
    
    if accuracy_KNN[sender] == max(accuracy[sender], accuracy_KNN[sender], accuracy_freq[sender]):
            pred_select[sender] = 'KNN'
    elif accuracy[sender] == max(accuracy[sender], accuracy_KNN[sender], accuracy_freq[sender]):
            pred_select[sender] = 'CTFIDF'
    else:
            pred_select[sender] = 'FREQ'
            
    y_pred_mixed[sender_test_is] = Mixed_Predictor([y_pred[sender_test_is], y_pred_freq[sender_test_is], y_pred_KNN[sender_test_is]],
                                                  [accuracy[sender],accuracy_freq[sender],accuracy_KNN[sender]])
    accuracy_mixed[sender] = mapk(y_test[sender_test_is].values, y_pred_mixed[sender_test_is])
         
    
    print('%12s | %12s | %12s | %12s'%(round(accuracy_freq[sender],2),round(accuracy[sender],2),round(accuracy_KNN[sender],2),pred_select[sender]))

accuracy_TOT = mapk(y_test.values, y_pred)
accuracy_freq_TOT = mapk(y_test.values, y_pred_freq)
accuracy_KNN_TOT = mapk(y_test.values, y_pred_KNN)
accuracy_NAMES_TOT = mapk(y_test.values, y_pred_NAMES)
accuracy_mixed_TOT = mapk(y_test.values, y_pred_mixed)


print('%s' %(111*'='))
print('%48s | ' %('mean accuracy'), end='')
print('%12s | %12s | %12s | %12s'%((round(accuracy_freq_TOT,2)),round(accuracy_TOT,2),round(accuracy_KNN_TOT,2),round(accuracy_mixed_TOT,2)))

   nb |                                   sender |     acc FREQ |  acc C-TFIDF |      acc KNN |    acc mixed
----- + ---------------------------------------- + ------------ + ------------ + ------------ + ------------
    0 |                     tim.belden@enron.com |         0.34 |         0.52 |         0.58 |          KNN
    1 |                christian.yoder@enron.com |         0.46 |         0.55 |         0.58 |          KNN
    2 |                  lorna.brennan@enron.com |         0.76 |         0.82 |          0.9 |          KNN
    3 |                 janel.guerrero@enron.com |         0.67 |         0.86 |         0.82 |       CTFIDF
    4 |                grace.rodriguez@enron.com |         0.56 |         0.68 |         0.72 |          KNN
    5 |                    amr.ibrahim@enron.com |         0.64 |         0.85 |         0.83 |       CTFIDF
    6 |                    cindy.stark@enron.com |         0.25 |         0.24 |         0.47 |          KNN
    7 |            

<h2>Create submission</h2>

In [6]:
import datetime

def create_submission(y_pred,X_test_df):
    now = datetime.datetime.now()
    
    predictions_towrite={}
    x_test=X_test_df.values
    for i in range(len(y_pred)):
        recipients=y_pred[i]
        mid=x_test[i][0]
        predictions_towrite[mid]=recipients

    count=0
    with open('pred/pred_'+ str(now.strftime("%Y-%m-%d-%H-%M")) + '.txt', 'w') as my_file:
        my_file.write('mid,recipients' + '\n')
        for ids, preds in predictions_towrite.items():
            count=count+1
            r=str(ids)+","
            for s in preds:
                r=r+" "+str(s)
            r=r+'\n'
            my_file.write(r)

In [8]:
import TFIDF_mod
from TFIDF_mod import TFIDF
from TFIDF_mod import LDA

address_books = create_address_books(X_df, y_df)
recipient_surnames_sub = names(address_books)


sender_test = X_sub_df.sender.unique().tolist()
y_pred_ALL = np.empty((X_sub_df.shape[0],10),dtype=object)
pdt_ALL = {}

X_tfidf_train_all = {}
X_tfidf_sub_all = {}
TFIDF_dict = {}

for sender in sender_test:
    # indices corresponding to the sender
    sender_train_is_all = np.array(X_df.sender == sender)
    sender_test_is_all = np.array(X_sub_df.sender == sender)
    y_train_all = y_df.recipients.copy()
    
    if pred_select[sender] == 'KNN':
        TFIDF_dict[sender] = TFIDF()

        X_tfidf_train_all[sender] = TFIDF_dict[sender].fit_transform(X_df[X_df.sender == sender])
        X_tfidf_sub_all[sender] = TFIDF_dict[sender].transform(X_sub_df[X_sub_df.sender == sender])

        pdt_ALL[sender] = Predictor_KNN(X_tfidf_train_all[sender], y_train_all[sender_train_is_all], sender, address_books, recipient_surnames_sub)
        y_pred_ALL[sender_test_is_all] = pdt_ALL[sender].prediction(X_tfidf_sub_all[sender], X_sub_df[sender_test_is_all])
    
    elif pred_select[sender] == 'CTFIDF':
        TFIDF_dict[sender] = TFIDF()

        X_tfidf_train_all[sender] = TFIDF_dict[sender].fit_transform(X_df[X_df.sender == sender])
        X_tfidf_sub_all[sender] = TFIDF_dict[sender].transform(X_sub_df[X_sub_df.sender == sender])

        pdt_ALL[sender] = Predictor_CTFIDF(X_tfidf_train_all[sender], y_train_all[sender_train_is_all], sender, address_books)
        y_pred_ALL[sender_test_is_all] = pdt_ALL[sender].prediction(X_tfidf_sub_all[sender])
        
    else:
        TFIDF_dict[sender] = TFIDF()

        X_tfidf_train_all[sender] = TFIDF_dict[sender].fit_transform(X_df[X_df.sender == sender])
        X_tfidf_sub_all[sender] = TFIDF_dict[sender].transform(X_sub_df[X_sub_df.sender == sender])

        pdt_ALL[sender] = Predictor_CTFIDF(X_tfidf_train_all[sender], y_train_all[sender_train_is_all], sender, address_books, N=0)
        y_pred_ALL[sender_test_is_all] = pdt_ALL[sender].prediction(X_tfidf_sub_all[sender])
        
create_submission(y_pred_ALL,X_sub_df)

KeyboardInterrupt: 