In [78]:
import pandas as pd
import numpy as np

import re

from utils import tag_mail

In [79]:
df = pd.read_json(r'data/gmail_turc_send.txt', encoding = 'utf-8')

In [80]:
df['cat'] = df.headers.apply(tag_mail)

In [81]:
def clean_and_tokenize(text):
    """
    Cleaning a document with:
        - Lowercase        
        - Removing numbers with regular expressions
        - Removing punctuation with regular expressions
        - Removing other artifacts
    And separate the document into words by simply splitting at spaces
    Params:
        text (string): a sentence or a document
    Returns:
        tokens (list of strings): the list of tokens (word units) forming the document
    """        
    # Lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r"[0-9]+", "", text)
    # Remove punctuation
    REMOVE_PUNCT = re.compile("[.;:!\'?,\"()\[\]]")
    text = REMOVE_PUNCT.sub("", text)
    # Remove small words (1 and 2 characters)
    text = re.sub(r"\b\w{1,2}\b", "", text)
    # Remove HTML artifacts specific to the corpus we're going to work with
    REPLACE_HTML = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    text = REPLACE_HTML.sub(" ", text)
    
    tokens = text.split()
    return tokens

In [82]:
df['clean_text_tok'] = df.body.apply(clean_and_tokenize)

In [83]:
categories = ['sport','education', 'alimentation']
vocab = []
vocab_concept = []

for cat in categories:
    with open('vocabulary/final/'+cat+'_all.txt', 'r') as f:
        v = f.read().split('\n')
        while("" in v) : 
            v.remove("") 
        vocab +=  v
    with open('vocabulary/final/'+cat+'_concept.txt', 'r') as f:
        v = f.read().split('\n')
        while("" in v) : 
            v.remove("")
        vocab += v
        vocab_concept.append(v)

In [84]:
def lookup(list):
    count = 0
    for x in list:
        if x in vocab:
            count += 1
    return count

In [85]:
def see_words(text):
    L = []
    for el in text:
        if el in vocab:
            L+=[el]
    return L

In [86]:
df['words'] = df['clean_text_tok'].apply(see_words)
df['count'] = df['clean_text_tok'].apply(lookup)

In [92]:
def addToFound(words_found, words):
    for w in words:
        if w in words_found:
            words_found[w] += 1
        else:
            words_found[w] = 1
    return words_found

In [93]:

def getExtractedWords(bow):
    last = bow[0]
    words_found = addToFound({}, last)
    for i in range(1, len(bow)):
        new = False
        for j in range(len(bow[i])):
            if bow[i][j] not in last:
                new = True
                break
        if new:
            last = bow[i]
            words_found = addToFound(words_found, last)
    return words_found

In [95]:
words = getExtractedWords(df.loc[df['count']>0].words.values.tolist())
print(len(words))
for w in words:
    print(w, words[w])

{'stage': 8}
22
formation 2
manger 6
famine 2
admission 1
club 2
restaurant 2
sport 12
polytechnique 4
alimentaire 2
champion 1
karting 2
piscine 1
connaissance 7
science 16
recherche 6
volleyball 2
noix 10
scolaire 2
stage 42
basket 4
sportive 4
football 2


In [75]:
for el in df.loc[df['count']>0].body:
    print(el)
    print('\n\n\n\n')

   Bonjour Alexis, 
 
Bonne année et meilleur vœux pour 2020 ! 
 
J'ai reçu des retours de mon école. 
Pour éditer la convention de stage, elle a besoin de quelques 
renseignements supplémentaires: 
- Le signataire de la convention de stage (nom, prénom, fonction, adresse 
mail) 
- Un contact RH (nom, prénom, fonction, adresse mail) 
- Un maître de stage (nom, prénom, fonction, adresse mail). Pour l'instant, 
j'avais supposé que c'était toi mais si ce n'est pas le cas, corrige moi. 
- Un descriptif du stage 
 
Ensuite concernant le PC, est-ce que c'est possible de faire un dual boot 
Windows/Linux, ou bien c'est forcément une VM intégrée à Windows ? 
 
Bien à toi, 
 
TURC Etienne 
 
Le ven. 13 déc. 2019 à 15:24, Alexis Deudon <alexis.deudon@paylead.fr> a 
écrit : 
 
> Le nouveau framework releasé par Netflix cette semaine (en attendant je 
> galère tjs autant à trouver un bon film mais bon..) 
> 
> Ce genre de trucs il en sort tous les mois j’ai l’impression 
> 
> 
> 
> 
> 
> Et peut-ê