<h1>Email recipient recommendation</h1>

<i>Thomas Boudou, Guillaume Richard, Antoine Simoulin</i>

<p style="text-align: justify">It was shown that at work, employees frequently forget to include one or more recipient(s) before sending a message. Conversely, it is common that some recipients of a given message were actually not intended to receive the message. To increase productivity and prevent information leakage, the needs for effective <b>email recipient recommendation</b> systems are thus pressing.

In this challenge, you are asked to develop such a system, which, given the content and the date of a message, recommends a list of <b>10 recipients ranked by decreasing order of relevance</b>.</p>

In [1]:
# Requirements
%matplotlib inline
import random
import pandas as pd
import numpy as np
# do not display warnings
import warnings
warnings.filterwarnings("ignore")

# Functions files are saved in "src/" directory.
import sys
sys.path.append('src/')
from accuracy_measure import *

In [2]:
from load_data import *

# load files
# Data are saved in "data/" directory
path_to_data = 'data/'
training, training_info, test, test_info, y_df = load_data(path_to_data)

# create adress book
# /!\ can take 1-2 min
# address_books = create_address_books(training, y_df)

# join train and test files
X_df = join_data(training_info, training)
X_sub_df = join_data(test_info, test)

# remove non authorise adress from y_df (misssing @)
y_df = clean(y_df)

<h2>Predictors</h2>

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def complete_prediction(k, sender, address_books, res_temp, K=10):
    # k the number of recipients to predict
    k_most = [elt[0] for elt in address_books[sender][:K] if elt[0] not in res_temp]
    k_most = k_most[:k]
    if len(k_most) < k: # sender n'a pas assez de contacts
        k_most.extend([0] * (k-len(k_most)))
    return k_most

<h3>Predictor closest message with Tf-Idf</h3>

In [4]:
class Predictor_TFIDF():
    def __init__(self, X, y, sender, address_books,N=10):
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.address_books = address_books
        self.N = min(N,10)
        
    def prediction(self, X):
        res = []
        for i in range(X.shape[0]):
            cos = cosine_similarity(X[i],self.train).argsort()[:,0][0] # mail le plus proche
            if self.N != 0:
                res_temp = [self.predict[cos][0][:self.N]] # add the N first recipients of the closest e-mail
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res

<h3>Predictor KNN</h3>

In [5]:
class Predictor_KNN():
    def __init__(self, X, y, sender, address_books,N=10):
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.N = min(N,10)
        self.address_books = address_books

    def prediction(self, X):
        res = []
        for i in range(X.shape[0]):
            cos = (-cosine_similarity(X[i],self.train)).argsort()[:,:30][0] # 30 mails les plus proches
            if self.N != 0:
                NN_recpt = {}
                for j in range(len(cos)): # len(cos) =30 except if not enough mails
                    for k in range(len(self.predict[cos[j]])):
                        if self.predict[cos[j]][k] in NN_recpt:
                            NN_recpt[self.predict[cos[j]][k]]+= 1
                        else:
                            NN_recpt[self.predict[cos[j]][k]] = 1
                res_temp = [k for k in sorted(NN_recpt, key=NN_recpt.get, reverse=True)][:10]
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp)
        return res

<h3>Predictor Tf-Idf centralized centroïd</h3>

In [6]:
import scipy

class Predictor_CTFIDF():
    def __init__(self, X, y, sender, address_books,N=10):
        self.N = min(N,10)
        self.train = X
        self.predict = y.values
        self.sender = sender
        self.address_books = address_books
        self.X_recpt = {}
        
        # perform centroid Tf-Idf. i.e each 10 most frequent recipients is represented 
        # by an average of all mail he received.
        # exctract 10 most frequents recipients
        self.k_most = [elt[0] for elt in address_books[sender][:20]] # 10 more frequent recipients
        # perform average Tf-Idf on 10 most frequents recipients
        for recpt in self.k_most: # loop trough 10 most frequents recipients
            for i in range(X.shape[0]): # loop trough all mails send by sender
                if recpt in self.predict[i]: # if recipients is in mail
                    if recpt in self.X_recpt:
                        self.X_recpt[recpt] += X[i,:]
                    else:
                        self.X_recpt[recpt] = X[i,:]
            #self.X_recpt[recpt] = normalize(self.X_recpt[recpt], norm='l2', axis=1) # normalize tfidf vector

    def prediction(self, X):
        res = []
        cos = {}
        for i in range(X.shape[0]):
            # cosine similarity with 10 most frequents recpt
            for recpt, value in self.X_recpt.items():
                cos[recpt] = cosine_similarity(X[i],self.X_recpt[recpt])
            if self.N != 0:
                # return the 10 most frequent recipients in order 
                # given by similarity to their centroid Tf-Idf representation
                res_temp = [k for k in sorted(cos, key=cos.get, reverse=True)]
            else:
                 res_temp = []
            # if less than 10 recipients, complete the prediction with more frequents users
            if len(res_temp) < 10:
                res_temp.extend(complete_prediction(10-len(res_temp),self.sender, self.address_books, res_temp))
            res.append(res_temp[:10])
        return res

<h2>Cross validation module</h2>

In [7]:
def split(X_df):
    X_train = {}

    for sender in X_df.sender.unique().tolist():
        X_train[sender] = X_df.sample(n=20)
    train_tot = pd.concat([X_train[sender] for sender in X_df.sender.unique().tolist()])

    train_is = []
    test_is = []

    train = X_df.mid.isin(train_tot.mid)

    for i in range(X_df.shape[0]):
        if train.ix[i]:
            test_is.extend([i])
        else:
            train_is.extend([i])

    train_is = np.asarray(train_is)
    test_is = np.asarray(test_is)
    
    return train_is, test_is

In [8]:
#from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
# splitting data for cross validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import LeavePOut
import TFIDF_mod
from TFIDF_mod import TFIDF, LDA


#skf = KFold(n_splits=2, random_state=None)
skf = StratifiedShuffleSplit(n_splits=1, test_size=0.05)
#lpo = LeavePOut(p=2000)
#list_lpo = list(lpo.split(y_df))
#train_is, test_is = list_lpo[0]
#print(train_is.shape[0],test_is.shape[0])

# splitting data for cross validation
#skf = ShuffleSplit(n_splits=1, test_size=0.2)

#for train_is, test_is in skf.split(y_df, X_df.sender):

train_is, test_is = split(X_df)
print('train size : ',train_is.shape[0],'\ntest size  : ',test_is.shape[0],'\n')
print('%10s | %40s | %15s | %15s | %15s ' %('sender_nb', 'sender', 'acc centroid','acc freq','acc KNN'))
print('%10s + %40s + %15s + %15s + %15s' %(10*'-', 40*'-', 15*'-', 15*'-', 15*'-'))


X_tfidf_train = {}
y_train = y_df.recipients.loc[train_is].copy()
X_tfidf_test = {}
y_test = y_df.recipients.loc[test_is].copy()
X_test_df = X_df.loc[test_is].copy()
X_train_df = X_df.loc[train_is].copy()

address_books_train = create_address_books_train(X_df.loc[train_is], y_df.loc[train_is])

pdt = {}
pdt_freq = {}
accuracy = {}
accuracy_freq = {}
accuracy_KNN = {}
pdt_KNN = {}
TFIDF_dict = {}
sender_test = X_test_df.sender.unique().tolist()
y_pred = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_freq = np.empty((X_test_df.shape[0],10),dtype=object)
y_pred_KNN = np.empty((X_test_df.shape[0],10),dtype=object)

KNN_list = []


for sender in sender_test:
    print('%10s | %40s | ' %(sender_test.index(sender), sender), end='')
    # indices corresponding to the sender
    sender_train_is = np.array(X_train_df.sender == sender)
    sender_test_is = np.array(X_test_df.sender == sender)

    # transform each mail body into tfidf vector
    # not given all corpus but only sender corpus
    TFIDF_dict[sender] = TFIDF()

    X_tfidf_train[sender] = TFIDF_dict[sender].fit_transform(X_train_df[X_train_df.sender == sender])
    X_tfidf_test[sender] = TFIDF_dict[sender].transform(X_test_df[X_test_df.sender == sender])

    pdt[sender] = Predictor_CTFIDF(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train)
    pdt_freq[sender] = Predictor_CTFIDF(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train, N=0)
    pdt_KNN[sender] = Predictor_KNN(X_tfidf_train[sender], y_train[sender_train_is], sender, address_books_train)

    y_pred[sender_test_is] = pdt[sender].prediction(X_tfidf_test[sender])
    y_pred_freq[sender_test_is] = pdt_freq[sender].prediction(X_tfidf_test[sender])
    y_pred_KNN[sender_test_is] = pdt_KNN[sender].prediction(X_tfidf_test[sender])

    accuracy[sender] = mapk(y_test[sender_test_is].values, y_pred[sender_test_is])
    accuracy_freq[sender] = mapk(y_test[sender_test_is].values, y_pred_freq[sender_test_is])
    accuracy_KNN[sender] = mapk(y_test[sender_test_is].values, y_pred_KNN[sender_test_is])
    
    if accuracy_KNN[sender] > accuracy_freq[sender]:
        KNN_list.extend([sender])

    print('%15s | %15s | %15s' %(round(accuracy[sender],2),round(accuracy_freq[sender],2),round(accuracy_KNN[sender],2)))

accuracy_TOT = mapk(y_test.values, y_pred)
accuracy_freq_TOT = mapk(y_test.values, y_pred_freq)
accuracy_KNN_TOT = mapk(y_test.values, y_pred_KNN)

print('%s' %(107*'='))
print('%53s | ' %('mean accuracy'), end='')
print('%15s | %15s | %15s' %((round(accuracy_TOT,2)),round(accuracy_freq_TOT,2),round(accuracy_KNN_TOT,2)))

train size :  41175 
test size  :  2438 

 sender_nb |                                   sender |    acc centroid |        acc freq |         acc KNN 
---------- + ---------------------------------------- + --------------- + --------------- + ---------------
         0 |                christian.yoder@enron.com |            0.45 |             0.4 |            0.46
         1 |                     tim.belden@enron.com |            0.54 |            0.34 |            0.44
         2 |                amy.fitzpatrick@enron.com |            0.76 |            0.58 |            0.74
         3 |                 kevin.m.presto@enron.com |            0.26 |            0.11 |            0.23
         4 |                    alan.comnes@enron.com |            0.58 |            0.49 |            0.56
         5 |                richard.shapiro@enron.com |             0.5 |            0.45 |            0.49
         6 |                 jennifer.thome@enron.com |            0.84 |            0.68 |  

<h2>Create submission</h2>

In [9]:
import datetime

def create_submission(y_pred,X_test_df):
    now = datetime.datetime.now()
    
    predictions_towrite={}
    x_test=X_test_df.values
    for i in range(len(y_pred)):
        recipients=y_pred[i]
        mid=x_test[i][0]
        predictions_towrite[mid]=recipients

    count=0
    with open('pred/pred_'+ str(now.strftime("%Y-%m-%d-%H-%M")) + '.txt', 'w') as my_file:
        my_file.write('mid,recipients' + '\n')
        for ids, preds in predictions_towrite.items():
            count=count+1
            r=str(ids)+","
            for s in preds:
                r=r+" "+str(s)
            r=r+'\n'
            my_file.write(r)

In [10]:
import TFIDF_mod
from TFIDF_mod import TFIDF
from TFIDF_mod import LDA

address_books = create_address_books(training, y_df)
sender_test = X_sub_df.sender.unique().tolist()
y_pred_ALL = np.empty((X_sub_df.shape[0],10),dtype=object)
pdt = {}

X_tfidf_train_all = {}
X_tfidf_sub_all = {}
TFIDF_dict = {}

for sender in sender_test:
    # indices corresponding to the sender
    sender_train_is = np.array(X_df.sender == sender)
    sender_test_is = np.array(X_sub_df.sender == sender)
    y_train_all = y_df.recipients.copy()
    
    if sender in KNN_list:
        TFIDF_dict[sender] = TFIDF()

        X_tfidf_train_all[sender] = TFIDF_dict[sender].fit_transform(X_df[X_df.sender == sender])
        X_tfidf_sub_all[sender] = TFIDF_dict[sender].transform(X_sub_df[X_sub_df.sender == sender])

        pdt[sender] = Predictor_KNN(X_tfidf_train_all[sender], y_train_all[sender_train_is], sender, address_books)
        y_pred_ALL[sender_test_is] = pdt[sender].prediction(X_tfidf_sub_all[sender])
    
    else:
        TFIDF_dict[sender] = TFIDF()

        X_tfidf_train_all[sender] = TFIDF_dict[sender].fit_transform(X_df[X_df.sender == sender])
        X_tfidf_sub_all[sender] = TFIDF_dict[sender].transform(X_sub_df[X_sub_df.sender == sender])

        pdt[sender] = Predictor_CTFIDF(X_tfidf_train_all[sender], y_train_all[sender_train_is], sender, address_books)
        y_pred_ALL[sender_test_is] = pdt[sender].prediction(X_tfidf_sub_all[sender])
        
create_submission(y_pred_ALL,X_sub_df)