In [2]:
import pandas as pd
import numpy as np
import csv, nltk

from sklearn.model_selection import train_test_split
# from sklearn.svm.SVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix,classification_report

from nltk.corpus import stopwords

In [3]:
enron = pd.read_csv("../../../datasets/emails-filtered.csv",
                    sep=",",
                    quoting=csv.QUOTE_ALL)


enron = enron.dropna(axis="index", subset=["message"])
enron

Unnamed: 0,author,category,file_index,file,message
0,allen-p,sent_mail,1.0,allen-p/_sent_mail/1.,\nHere is our forecast\n\n
1,allen-p,sent_mail,10.0,allen-p/_sent_mail/10.,\nTraveling to have a business meeting takes t...
2,allen-p,sent_mail,100.0,allen-p/_sent_mail/100.,\ntest successful. way to go!!!
3,allen-p,sent_mail,1000.0,allen-p/_sent_mail/1000.,"\nRandy,\n\n Can you send me a schedule of the..."
4,allen-p,sent_mail,1001.0,allen-p/_sent_mail/1001.,\nLet's shoot for Tuesday at 11:45.
...,...,...,...,...,...
30104,whalley-l,sent_mail,95.0,whalley-l/_sent_mail/95.,"\nBrad,\n\nGreg asked that I drop you a line t..."
30105,whalley-l,sent_mail,96.0,whalley-l/_sent_mail/96.,\nReceived test message.\n\n\n \n\tEnron Nor...
30106,whalley-l,sent_mail,97.0,whalley-l/_sent_mail/97.,\ni will try to call and talk you thru this
30107,whalley-l,sent_mail,98.0,whalley-l/_sent_mail/98.,"\nAllan,\n\nPlease remove Greg Whalley from th..."


# Pré processamento
 - remover quebra de linhas ao inicio (já que ela costuma aparecer por conta da remoção do header)
 - trocar o nome do autor por numeros sequenciais

In [4]:
def preProcessing(email_txt: str):
    if isinstance(email_txt, float):
        print(email_txt)
        return email_txt
    
    if len(email_txt) < 2:
        return email_txt

    return email_txt if email_txt[:1] != "\n" else email_txt[1:]

In [5]:
enron["message"] = enron["message"].apply(preProcessing)

In [6]:
enron = enron[["message", "author"]]

In [8]:
authors_names = list(enron["author"].unique())
author_name_to_idx = dict(zip(authors_names, range(0, len(authors_names))))

enron.author = enron.author.apply(lambda x: author_name_to_idx[x])

enron.to_csv(path_or_buf="../../../datasets/enron_interest.csv", index=False, quoting=csv.QUOTE_ALL)

In [223]:
unique_names = enron.author.unique()
df = enron

botton = 500
top = 1_000_000

for name in unique_names:
    emails_qtd = len(df[df.author == name])
    if emails_qtd > top or emails_qtd < botton:
        df = df[df.author != name]

df.author.unique(), df

(array([ 1,  3,  4, 14, 19, 21, 24, 34, 38, 43, 45, 48, 59, 60, 63, 70]),
                                                  message  author
 602    saw a lot of the bulls sell summer against len...       1
 603    amazing how with cash futures at $1 and the ba...       1
 604                   We both thank you\n\n\n   \n\t\n\t       1
 605    So, what is it?   And by the way, don't start ...       1
 606    sure, stop by and we'll arrange a place to mee...       1
 ...                                                  ...     ...
 29204  As you might have noticed, there was no file a...      70
 29205  The graphs I've set on your desks are the prod...      70
 29206  Matt Motley and Mike Swerzbin both said this t...      70
 29207  That is actually the report I'm having problem...      70
 29208  The STCA desk will be short 75 MW on peak at N...      70
 
 [19090 rows x 2 columns])

# Separação treino-teste

In [183]:
train, test = train_test_split(enron, test_size=0.2)

print(f"Tamanho do treino: {len(train)}\nTamanho do teste: {len(test)}")

Tamanho do treino: 5307
Tamanho do teste: 1327


# Extração de características

- Neste exemplo, bag of words 

In [184]:
word_re = r'\b\w+\b'
vectorizer = CountVectorizer(ngram_range=(2,2), token_pattern=word_re)

train_BOW = vectorizer.fit_transform(train['message'])
test_BOW  = vectorizer.fit_transform(test['message'])


# Seleção de características

In [40]:
y_train = train['author']
y_test = test['author']

In [136]:
vectorizer = TfidfVectorizer(ngram_range=(3,3), token_pattern=word_re).fit(train['message'])
train_tfidfv = vectorizer.transform(train['message'])
test_tfidfv = vectorizer.transform(test['message'])

In [137]:
print("TfidfVectorizer() ", train_tfidfv.shape, test_tfidfv.shape)

TfidfVectorizer()  (20916, 936090) (5229, 936090)


In [138]:
selector = SelectKBest(chi2, k=1000).fit(train_tfidfv, y_train)
x_train = selector.transform(train_tfidfv)
x_test  = selector.transform(test_tfidfv)

## Classificação

In [156]:
clf = MultinomialNB()
clf.fit(x_train, y_train)

MultinomialNB()

In [157]:
pred = clf.predict(x_test)
pred

array([45, 45, 45, ..., 70, 45, 45])

In [158]:
clf.score(x_test, y_test)

0.17804551539491298

# find accuracy, precision, recall

In [143]:
from sklearn.metrics import confusion_matrix,classification_report

confusion_matrix(pred,y_test)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [144]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [187]:
def pipeline(df, 
             test_size: int = 0.2,
             x:str = "message",
             y:str = "author",
             word_re: str = r'\b\w+\b',
             vectorizer = TfidfVectorizer,
             selector = SelectKBest,
             K:int = 1000,
             ngram_range: tuple = (2,2),
             classifier = MultinomialNB):
    
    
    train, test = train_test_split(df, test_size=test_size)
    print(f"Tamanho do treino: {len(train)}\nTamanho do teste: {len(test)}")
    
    print("Extração de características")
    train_X = train[x]
    train_Y = train[y]
    test_X  = test[x]
    test_Y  = test[y]
    
    
    vectorizer_f = vectorizer(ngram_range=ngram_range, token_pattern=word_re).fit(train_X)
    
    train_X = vectorizer_f.transform(train_X)
    test_X = vectorizer_f.transform(test_X)  
    print("Shape: ", train_X.shape, test_X.shape)
    
    print("Seleção de características")
    selector_f = selector(chi2, k=K).fit(train_X, train_Y)
    train_X = selector_f.transform(train_X)
    test_X  = selector_f.transform(test_X)
    
    print("Classificação")
    clf = classifier()
    clf.fit(train_X, train_Y)
    pred = clf.predict(test_X)
    print(f"score {clf.score(test_X, test_Y)}")
    
    
    return pred, test_Y

In [224]:
pred, test_Y = pipeline(df, K=6000)
pred

Tamanho do treino: 15272
Tamanho do teste: 3818
Extração de características
Shape:  (15272, 343844) (3818, 343844)
Seleção de características
Classificação
score 0.46097433211105293


array([45, 45, 34, ..., 34, 45, 45])

In [225]:
print(classification_report(pred, test_Y))

              precision    recall  f1-score   support

           1       0.08      1.00      0.15        11
           3       0.40      0.97      0.57       108
           4       0.10      1.00      0.18        23
          14       0.12      1.00      0.22        18
          19       0.29      1.00      0.46        41
          21       0.38      0.88      0.53        91
          24       0.29      0.95      0.44        87
          34       0.74      0.83      0.78       423
          38       0.00      0.00      0.00         0
          43       0.01      1.00      0.02         1
          45       0.99      0.29      0.45      2758
          48       0.02      1.00      0.05         4
          59       0.14      0.95      0.24        21
          60       0.28      0.96      0.43        48
          63       0.02      1.00      0.04         4
          70       0.69      0.99      0.82       180

    accuracy                           0.46      3818
   macro avg       0.28   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
