# Imports

In [4]:
import pandas as pd
import re
import string
import numpy as np
#from nltk.stem import FrenchStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [19]:
import plotly.express as px
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import ngrams
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Loading the data - emails

In [7]:
# Load the data
df = pd.read_csv('C:\\Users\\aboukhaled001\\Documents\\all-email-v03.csv', sep=';')

In [8]:
df.sample(10)

Unnamed: 0,From,Body,Class,Language,Role,Footer
751,PONFOORT Monique <mponfoort@ponant.com>,Monique Ponfoort no longer works at PONANT. Ca...,1,1,0,0
746,Hélène HERVET <helene.hervet@prosolgestion.fr>,"Bonjour \t""Je ne suis pas en mesure de relever...",1,0,0,0
330,de Battisti Marianne <marianne.de.battisti@ica...,Chers amis\tA compter du 1er mai 2022\t j’aura...,1,0,0,0
259,cyril.audousset@engie.com,Dear sender\tI am currently on a business trip...,0,1,0,0
931,Leveque Muriel <muriel.leveque@korian.com>,"Dear sender\t"" I have left the company and inv...",1,1,0,0
303,BOISSIN Delphine <delphine.boissin@biomerieux....,Hello \tI am on full-day workshops this week. ...,0,2,0,1
608,Caroline Baudin <caroline.baudin@geosys.com>,thanks for your email .I am out of the offi...,0,1,0,0
743,GUILLON Johanna <Johanna.GUILLON@bernardcontro...,Bonjour \tJe suis absente\t je reste joignable...,0,2,0,0
773,"LEMPIRE, Benjamin <b.lempire@reim-edr.fr>","LEMPIRE\t"" Benjamin a quitté Le Groupe Edmond ...",1,0,0,0
202,annick.rynders@equans.com,Dear sender\t thank you for your mail. Please...,0,1,0,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967 entries, 0 to 966
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   From      967 non-null    object
 1   Body      965 non-null    object
 2   Class     967 non-null    int64 
 3   Language  967 non-null    int64 
 4   Role      967 non-null    int64 
 5   Footer    967 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 45.5+ KB


# Alaysing the emails

## Proportion of Footer vs NO Footer

In [10]:
# Count the number of instances of each class
class_counts = df['Footer'].value_counts().to_dict()

# Create the pie chart
fig = px.pie(values=list(class_counts.values()),
             names=['NO Footer', 'Footer'],
             labels=['NO Footer' if key == 0 else 'Footer' for key in class_counts.keys()],
             title='Proportion of Footer vs NO Footer')

# Show the chart
fig.show()

## Proportion of Absence vs Quiting

In [13]:
# Count the number of instances of each class
class_counts = df['Class'].value_counts().to_dict()

# Create the pie chart
fig = px.pie(values=list(class_counts.values()),
             names=['Absence', 'Quiting', 'UNK'],
             labels=['Absence' if key == 0 else 'Quiting' if key == 1 else 'UNK' for key in class_counts.keys()],
             title='Proportion of Absence vs Quiting')

# Show the chart
fig.show()

## Proportion of Role change vs No Role change

In [15]:
# Count the number of instances of each class
quit_df = df[df['Class'] == 1]
class_counts = quit_df['Role'].value_counts().to_dict()

# Create the pie chart
fig = px.pie(values=list(class_counts.values()),
             names=['NO Role change', 'Role change'],
             labels=['NO Role change' if key == 0 else 'Role change' for key in class_counts.keys()],
             title='Proportion of Role change vs No Role change')

# Show the chart
fig.show()

## Proportion of Different Languages

In [17]:
# Count the number of instances of each class
class_counts = df['Language'].value_counts().to_dict()

# Create the pie chart
fig = px.pie(values=list(class_counts.values()),
             names=['FR', 'EN', 'FR+EN' , 'MIX'],
             labels=['FR' if key == 0 else 'EN' if key == 1 else 'FR+EN' if key == 2 else 'MIX' for key in class_counts.keys()],
             title='Proportion of Different Languages')

# Show the chart
fig.show()

## Extraction of most frequent words and ngrams from email with footers

In [37]:

import nltk

# Filter the rows with Footer value == 1
footer_rows = df[df['Footer'] == 1]

# Compile the text of the Body column in one corpus
corpus = ' '.join(footer_rows['Body'].tolist())

# Perform basic text processing on the corpus
corpus = re.sub(r'[^\w\s]', '', corpus)
corpus = corpus.lower()


n = 30  # number of top-n frequent items to plot

def get_word_freq(text, n):
    # tokenize the text into words
    words = nltk.wordpunct_tokenize(text)
    fdist = FreqDist(words)
    top_n_words = fdist.most_common(n)
    return top_n_words

def get_2gram_freq(text, n, ngram_range=(2, 2)):
    # tokenize the text into words
    words = nltk.wordpunct_tokenize(text)
    # get the n-grams
    ngrams = nltk.ngrams(words, ngram_range[1])
    fdist = FreqDist(ngrams)
    top_n_ngrams = fdist.most_common(n)
    return top_n_ngrams

def get_3gram_freq(text, n, ngram_range=(3, 3)):
    # tokenize the text into words
    words = nltk.wordpunct_tokenize(text)
    # get the n-grams
    ngrams = nltk.ngrams(words, ngram_range[1])
    fdist = FreqDist(ngrams)
    top_n_ngrams = fdist.most_common(n)
    return top_n_ngrams

# get word frequency
word_freq = get_word_freq(corpus, n)
# plot the word frequency
fig = px.bar(word_freq, y=[x[0] for x in word_freq], x=[x[1] for x in word_freq], title="Word Frequency")
fig.show()

# get 2-gram frequency
ngram2_freq = get_2gram_freq(corpus, n)
# plot the 2-gram frequency
fig = px.bar(ngram2_freq, y=[str(x[0]) for x in ngram2_freq], x=[x[1] for x in ngram2_freq], title="2-gram Frequency")
fig.show()

# get 2-gram frequency
ngram3_freq = get_3gram_freq(corpus, n)
# plot the 2-gram frequency
fig = px.bar(ngram3_freq, y=[str(x[0]) for x in ngram3_freq], x=[x[1] for x in ngram3_freq], title="3-gram Frequency")
fig.show()

In [38]:
ngram3_freq

[(('you', 'are', 'not'), 134),
 (('the', 'intended', 'recipient'), 133),
 (('and', 'any', 'attachments'), 124),
 (('if', 'you', 'are'), 118),
 (('in', 'error', 'please'), 118),
 (('notify', 'the', 'sender'), 117),
 (('are', 'not', 'the'), 109),
 (('of', 'this', 'message'), 108),
 (('not', 'the', 'intended'), 104),
 (('message', 'and', 'any'), 85),
 (('de', 'ce', 'message'), 82),
 (('you', 'have', 'received'), 80),
 (('email', 'and', 'any'), 79),
 (('this', 'email', 'and'), 76),
 (('if', 'you', 'have'), 75),
 (('have', 'received', 'this'), 75),
 (('the', 'sender', 'immediately'), 73),
 (('is', 'strictly', 'prohibited'), 69),
 (('of', 'this', 'email'), 69),
 (('ce', 'message', 'par'), 66),
 (('message', 'par', 'erreur'), 65),
 (('intended', 'solely', 'for'), 59),
 (('solely', 'for', 'the'), 58),
 (('please', 'notify', 'the'), 54),
 (('are', 'confidential', 'and'), 54),
 (('this', 'email', 'in'), 54),
 (('email', 'in', 'error'), 54),
 (('this', 'message', 'in'), 53),
 (('message', 'in', '

## Droping empty emails

In [41]:
df

Unnamed: 0,From,Body,Class,Language,Role,Footer
0,"Froger, Christelle <christelle.froger@butachim...",Hello \t I am currently out of the office with...,0,1,0,0
1,Marwan BOUDIB <marwan.boudib@socgen.com>,thank you for your email \t please note tha...,1,3,0,0
2,DEBITUS Benoit <benoit.debitus@inovie.fr>,Bonjour \tJe suis en congés jusqu'au 28 juin....,0,0,0,0
3,PERON Cédric <cedric.peron@dpd.fr>,Bonjour \tJe suis actuellement absent et ser...,0,0,0,0
4,CORTESE Cecilia <cortesec@afd.fr>,Bonjour \tJe suis en mission à l'étranger et ...,0,0,0,0
...,...,...,...,...,...,...
962,Emmanuel ROUX <emmanuel.roux@celine.fr>,je suis actuellement indisponible jusqu’au 07/...,0,2,0,1
963,MOREL-KHAN Anne-Dorothée [EIFFAGE CONSTRUCTION...,Bonjour \tJe serai absente du bureau jusqu'au ...,0,2,0,1
964,GERIFAUD Richard [EIFFAGE CONSTRUCTION] <Richa...,Bonjour \tActuellement absent\t pour toute urg...,0,2,0,1
965,SAMUEL Joelle <samuelj@afd.fr>,thank you for your email .I am currently ou...,0,1,0,0


In [50]:
df=df.dropna(subset=['Body'])

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 965 entries, 0 to 966
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   From      965 non-null    object
 1   Body      965 non-null    object
 2   Class     965 non-null    int64 
 3   Language  965 non-null    int64 
 4   Role      965 non-null    int64 
 5   Footer    965 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 52.8+ KB


# Classification Footer vs NO Footer

In [61]:
# Defining the feature and target variables

X = df[['Body']]
y = df['Footer']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Performing TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train = vectorizer.fit_transform(X_train['Body'])
X_test = vectorizer.transform(X_test['Body'])

# Training the classifiers
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X_train, y_train)

clf_svm = SVC(random_state=0)
clf_svm.fit(X_train, y_train)

clf_xgb = XGBClassifier(random_state=0)
clf_xgb.fit(X_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Printing the accuracy, precision, recall, and f-score for each classifier
# Validation
print("\nLogistic Regression")
print("-----------------------")
y_pred = clf_lr.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy:", clf_lr.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)


print("\nSupport Vector Machine")
print("-----------------------")
y_pred = clf_svm.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy:", clf_svm.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)

print("\nXGBoost")
print("-----------------------")
y_pred = clf_xgb.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy:", clf_xgb.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)

print("\nKNN")
print("-----------------------")
y_pred = clf_knn.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy:", clf_xgb.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)



Logistic Regression
-----------------------
Accuracy: 0.9620689655172414
Precision: 0.9402985074626866
Recall: 0.9
F-Score: 0.9197080291970803

Support Vector Machine
-----------------------
Accuracy: 0.9689655172413794
Precision: 0.9420289855072463
Recall: 0.9285714285714286
F-Score: 0.935251798561151

XGBoost
-----------------------
Accuracy: 0.9620689655172414
Precision: 0.9154929577464789
Recall: 0.9285714285714286
F-Score: 0.921985815602837

KNN
-----------------------
Accuracy: 0.9620689655172414
Precision: 0.8271604938271605
Recall: 0.9571428571428572
F-Score: 0.8874172185430464


# Classification Absence vs Quiting

In [64]:

# Defining the feature and target variables

X = df[['Body']]
y = df['Class']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Performing TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train['Body'])
X_test = vectorizer.transform(X_test['Body'])

# Training the classifiers
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X_train, y_train)

clf_svm = SVC(random_state=0)
clf_svm.fit(X_train, y_train)

clf_xgb = XGBClassifier(random_state=0)
clf_xgb.fit(X_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

# Printing the accuracy, precision, recall, and f-score for each classifier
# Validation
print("\nLogistic Regression")
print("-----------------------")
y_pred = clf_lr.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Accuracy:", clf_lr.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)


print("\nSupport Vector Machine")
print("-----------------------")
y_pred = clf_svm.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Accuracy:", clf_svm.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)

print("\nXGBoost")
print("-----------------------")
y_pred = clf_xgb.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Accuracy:", clf_xgb.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)

print("\nKNN")
print("-----------------------")
y_pred = clf_knn.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Accuracy:", clf_xgb.score(X_test, y_test))
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", fscore)


Logistic Regression
-----------------------
Accuracy: 0.8896551724137931
Precision: 0.897923197492163
Recall: 0.8896551724137931
F-Score: 0.8727620630304469

Support Vector Machine
-----------------------
Accuracy: 0.8724137931034482
Precision: 0.8774782753085737
Recall: 0.8724137931034482
F-Score: 0.8499736979243485

XGBoost
-----------------------
Accuracy: 0.9413793103448276
Precision: 0.9389905427033317
Recall: 0.9413793103448276
F-Score: 0.939563838878946

KNN
-----------------------
Accuracy: 0.9413793103448276
Precision: 0.8758426301555106
Recall: 0.8827586206896552
F-Score: 0.8712427454688652


# Filtering Footer text from emails

In [None]:
footer_markers = ["footer", "copyright" , "this message " , "this email", "this e-mail", "this communication", "before printing", "disclaimer", "important notice", "this sender", "this information", "le contenu de ce message", "le contenu de ce email" , "le contenu de cet email" , "le contenu de cet email" , "le contenu de cet e-mail" , "ce courriel", "ce message", "ce email", "cet email", "ce e-mail", "cet e-mail", "cette messagerie", "les donnèes", "toute utilisation"]

In [75]:
import re

def clean_footer(text, footer_markers):
    # loop through all the markers to find the start of the footer
    text = text.lower()
    footer_start = len(text)
    for marker in footer_markers:
        footer_start_temp = text.find(marker)
        if ((footer_start_temp != -1) & (footer_start_temp < footer_start) ) :
           footer_star = footer_start_temp
 

    # split the text into main text and footer
    main_text = text[:footer_start]
    footer = text[footer_start:]

    return main_text, footer

#email_text = "This is an email text...\n\nFooter:\nContact Us: email@example.com\nCopyright 2021..."
email_text = "This is an email text...\n\nFooter:\nContact Us: email@example.com\nCopyright 2021..."
main_text, footer = clean_footer(email_text, footer_markers)

In [76]:
print (main_text)
print (footer)

this is an email text..

