<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It was shown that at work, employees frequently forget to include one or more recipient(s) before sending a message. Conversely, it is common that some recipients of a given message were actually not intended to receive the message. To increase productivity and prevent information leakage, the needs for effective <b>email recipient recommendation</b> systems are thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [1]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
# do not display warnings
import warnings
warnings.filterwarnings("ignore")

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')
from accuracy_measure import *

In [2]:
from load_data import *

# load files
# Data are saved in "data/" directory
path_to_data = '../data/'
training, training_info, test, test_info, y_df = load_data(path_to_data)

# create adress book
# /!\ can take 1-2 min
address_books = create_address_books(training, y_df)

# join train and test files
X_df = join_data(training_info, training)
X_sub_df = join_data(test_info, test)

In [3]:
from proper_name_extractor import *

In [4]:
#Extracting proper names in X_train
if False:
    X_tmp=add_proper_names(X_df)
    X_tmp.to_csv('data_with_proper_names.tsv', sep='\t',na_values=' ')
X_tmp=pd.read_csv('data_with_proper_names.tsv', sep='\t',na_filter=False,index_col=False)
X_tmp.drop(['Unnamed: 0'],axis=1)

FileNotFoundError: File b'data_with_proper_names.tsv' does not exist

In [5]:
#Name per address creation

surname_link, recipients_link=create_name_dict(X_tmp, y_df)

In [8]:
def complete_prediction(k, sender, address_books, res_temp, K=10):
    # k the number of recipients to predict
    k_most = [elt[0] for elt in address_books[sender][:K] if elt not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # sender n'a pas assez de contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most

class Predictor_Names():
    def __init__(self, X, y, sender, address_books):
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.address_books = address_books
        pass

    def pred(self, X):
        res = []#np.empty((X.shape[0],10))
        for x in X.values:
            res_temp=[]
            sender=x[3]
            name_list=extract_names(x[2]) #extract name from body
            score={}
            for r in address_books[sender]:
                rec=r[0]
                score[rec]=0
                for name in name_list.split(','):
                    if name in recipients_link[rec]:
                        score[rec]=score[rec]+1
            score=sorted(score.items(), key=operator.itemgetter(1), reverse = True)
            count=0
            if len(score)>0:
                s=score[count][1]
            else:
                s=0
            while s>0 and count<10:
                res_temp.append(score[count][0])
                count=count+1
                if len(score)>count:
                    s=score[count][1]
                else:
                    s=0
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res


In [9]:
from sklearn.model_selection import ShuffleSplit
import predictor

# splitting data for cross validation
skf = ShuffleSplit(n_splits=2, test_size=0.2)
print('%10s | %40s | %10s' %('sender_nb', 'sender', 'accuracy'))
print('%10s + %40s + %10s' %(10*'-', 40*'-', 10*'-'))
for train_is, test_is in skf.split(y_df):
    
    y_train = y_df.recipients.loc[train_is].copy()
    y_test = y_df.recipients.loc[test_is].copy()
    X_test_df = X_df.loc[test_is].copy()
    X_train_df = X_df.loc[train_is].copy()
    
    i=0
    pdt = {}
    accuracy = {}
    accuracy_TOT = 0
    sender_test = X_test_df.sender.unique().tolist()
    y_pred = np.empty((X_test_df.shape[0],10),dtype=object)

    for sender in sender_test:
        print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
        # indices corresponding to the sender
        sender_train_is = np.array(X_train_df.sender == sender)
        sender_test_is = np.array(X_test_df.sender == sender)
        
        pdt[sender] = Predictor_Names(X_train_df[sender_train_is], y_train[sender_train_is], sender, address_books)
        y_pred[sender_test_is] = pdt[sender].pred(X_test_df[sender_test_is])
        
        accuracy[sender] = mapk(y_test[sender_test_is], y_pred[sender_test_is], k=10)
        accuracy_TOT += accuracy[sender]
        print('%.2f' %(accuracy[sender]))

    print('%30s'%(30*'-'))
    print('error TOT = %.2f' %(accuracy_TOT/len(accuracy)))

 sender_nb |                                   sender |   accuracy
---------- + ---------------------------------------- + ----------
         0 |                   holly.keiser@enron.com | 0.08
         1 |                  chris.germany@enron.com | 0.23
         2 |               fletcher.j.sturm@enron.com | 0.17
         3 |                      m..forney@enron.com | 0.30
         4 |                 jonathan.mckay@enron.com | 0.29
         5 |                      eric.bass@enron.com | 0.30
         6 |                 phillip.m.love@enron.com | 0.25
         7 |          nancy.sellers@robertmondavi.com | 0.82
         8 |                amy.fitzpatrick@enron.com | 0.19
         9 |                james.d.steffes@enron.com | 0.26
        10 |                     scott.neal@enron.com | 0.38
        11 |               larry.f.campbell@enron.com | 0.41
        12 |               sandra.f.brawner@enron.com | 0.54
        13 |                  becky.spencer@enron.com | 0.40
        14 |

<h2>Create submission</h2>

In [10]:
def create_submission(y_pred,X_test_df):

    predictions_towrite={}
    x_test=X_test_df.values
    for i in range(len(y_pred)):
        recipients=y_pred[i]
        mid=x_test[i][0]
        predictions_towrite[mid]=recipients

    count=0
    with open('./pred_custom.txt', 'w') as my_file:
        my_file.write('mid,recipients' + '\n')
        for ids, preds in predictions_towrite.items():
            count=count+1
            r=str(ids)+","
            for s in preds:
                r=r+" "+str(s)
            r=r+'\n'
            my_file.write(r)

In [12]:
sender_test = X_sub_df.sender.unique().tolist()
y_pred = np.empty((X_sub_df.shape[0],10),dtype=object)
count=0
pdt = {}

for sender in sender_test:
    # indices corresponding to the sender
    sender_train_is = np.array(X_df.sender == sender)
    sender_test_is = np.array(X_sub_df.sender == sender)
    
    y_train_all = y_df.recipients.copy()
    pdt[sender] = Predictor_Names(X_df[sender_train_is], y_train_all[sender_train_is], sender, address_books)
    y_pred[sender_test_is] = pdt[sender].pred(X_sub_df[sender_test_is])
    
create_submission(y_pred,X_sub_df)