In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocess import Pool, cpu_count
import torch
from transformers import pipeline


In [2]:
# Read the e-mail table (first CSV column becomes the index), then normalise
# dtypes in one chain: cast "Date" to timestamps and replace missing
# "Subject" / "Body_Quoted" values with empty strings.
all_emails = (
    pd.read_csv("all_emails.csv", index_col=0)
    .astype({"Date": "datetime64[ns]"})
    .fillna({"Subject": "", "Body_Quoted": ""})
)
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
dtype: object

In [3]:
# Preview the loaded e-mail table (the notebook renders a truncated view).
all_emails


Unnamed: 0,Top_Level_Folder,Mail_Folder,Message_File,From,To,Cc,Bcc,Date,Subject,Body_Message,Body_Quoted
0,taylor-m,all_documents,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,
1,taylor-m,sent,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,
2,taylor-m,sent,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,
3,taylor-m,all_documents,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,
4,taylor-m,all_documents,3,['mark.taylor@enron.com'],['shari.stack@enron.com'],[],[],1998-10-30 15:02:00,Petrobras Swap,I think this has already been sent to you. Ju...,
...,...,...,...,...,...,...,...,...,...,...,...
516340,fischer-m,all_documents,428,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516341,fischer-m,discussion_threads,339,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516342,fischer-m,notes_inbox,2,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516343,fischer-m,all_documents,429,['kurt.anderson@enron.com'],['gverkleeren@zilkha.com'],"['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...","['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...",2002-07-12 11:36:00,Re: FW: RE: Revised Availability Numbers,"Gary, thank you very much for your feedback. I...","> From: Gary Verkleeren\n> Sent: Friday, July..."


Time for first 10k subjects:

- GPU: 58.7 sec
- CPU: 2 min 45 sec

Model card for the emotion classifier used below: https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion

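Background on the tensor-size error raised when an input is longer than 512 tokens (the reason the texts are split into chunks): https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto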


In [4]:
# Report whether PyTorch can see a CUDA device; the classifier cell below
# uses the same check to choose between "cuda:0" and "cpu".
torch.cuda.is_available()


True

In [5]:
# Hugging Face checkpoint used for emotion classification.
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"

# Run on the first GPU when one is available, otherwise on the CPU.
inference_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# top_k=None makes the pipeline return a score for every label rather than
# only the best one.
classifier = pipeline(
    task="text-classification",
    model=model_name,
    top_k=None,
    device=inference_device,
)


In [6]:
from model_helpers import make_text_chunks

# Split every e-mail ("Subject" + newline + "Body_Message") into chunks using a
# pool of worker processes, and collect the chunks in `mp_res_df` with one row
# per chunk: "message_index" (index of the source e-mail) and "text".
if __name__ == "__main__":
    cpus = cpu_count()
    N = all_emails.shape[0]
    # N = 1000
    emails_head = all_emails.iloc[0:N]
    # Use a scalar "\n" so the separator is broadcast to every row. Adding a
    # separately built Series aligns on the index and yields NaN whenever the
    # frame's index is not exactly 0..n-1.
    # A missing body is treated as empty text so the subject is still kept;
    # otherwise the whole concatenation would become NaN.
    texts = (
        emails_head["Subject"] + "\n" + emails_head["Body_Message"].fillna("")
    ).to_list()
    # One argument tuple per message: (tokenizer, text, 500, message index).
    # NOTE(review): 500 is presumably the chunk length in tokens, kept below
    # the 512-token model limit - confirm against model_helpers.
    arglist = [
        (classifier.tokenizer, text, 500, message_index)
        for message_index, text in zip(emails_head.index.to_list(), texts)
    ]
    # Pool.map blocks until every task has finished and re-raises worker errors.
    with Pool(processes=cpus) as mp_pool:
        chunk_results = mp_pool.map(make_text_chunks, arglist)
    # Each result carries parallel "message_index" and "text" sequences;
    # flatten them into two long columns.
    mp_res_dict = {"message_index": [], "text": []}
    for r in chunk_results:
        mp_res_dict["message_index"].extend(r["message_index"])
        mp_res_dict["text"].extend(r["text"])
    mp_res_df = pd.DataFrame(mp_res_dict)
    # Release the large intermediates; only `mp_res_df` is needed from here on.
    del chunk_results, mp_res_dict, mp_pool, texts, emails_head


In [7]:
# Save the chunk table to disk so it can be inspected or reused, then preview it.
mp_res_df.to_csv("mp_res_df.csv")
mp_res_df


Unnamed: 0,message_index,text
0,0,[CLS] re : friday wish we could go - but we're...
1,1,[CLS] re : friday wish we could go - but we're...
2,2,[CLS] re : friday hey marc - any chance you gu...
3,3,[CLS] re : friday hey marc - any chance you gu...
4,4,[CLS] petrobras swap i think this has already ...
...,...,...
759042,516340,[CLS] urgent!!! cutover weekend it is vital th...
759043,516341,[CLS] urgent!!! cutover weekend it is vital th...
759044,516342,[CLS] urgent!!! cutover weekend it is vital th...
759045,516343,[CLS] re : fw : re : revised availability numb...


In [8]:
# Classify every chunk. truncation=True caps each input at the model's maximum
# sequence length, so a chunk that re-tokenizes to more tokens than the model
# accepts is cut off instead of aborting the whole run with a tensor-size
# mismatch. Inputs that already fit are unaffected.
prediction = classifier(mp_res_df["text"].to_list(), truncation=True)
# Show the per-label scores of the first chunk as a sanity check.
prediction[0]


[{'label': 'joy', 'score': 0.8708086013793945},
 {'label': 'anger', 'score': 0.06683482974767685},
 {'label': 'fear', 'score': 0.04083830118179321},
 {'label': 'sadness', 'score': 0.016119390726089478},
 {'label': 'love', 'score': 0.003809870220720768},
 {'label': 'surprise', 'score': 0.0015891272341832519}]

In [9]:
# Turn the list of per-chunk predictions into one score column per label.
# The column order follows the label order of the first prediction.
all_labels = [entry["label"] for entry in prediction[0]]

# Collect scores by label name rather than by position, so the result does not
# depend on the order in which each prediction lists its labels.
scores = {label: [] for label in all_labels}
for chunk_prediction in prediction:
    for entry in chunk_prediction:
        scores[entry["label"]].append(entry["score"])

# Rebind instead of mutating in place: errors="ignore" plus assign() makes the
# cell safe to re-run (the original in-place drop raised KeyError the second
# time because "text" was already gone).
mp_res_df = mp_res_df.drop(columns=["text"], errors="ignore").assign(**scores)


In [10]:
# Preview the chunk table, which now holds one score column per emotion label.
mp_res_df


Unnamed: 0,message_index,joy,anger,fear,sadness,love,surprise
0,0,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589
1,1,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589
2,2,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548
3,3,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548
4,4,0.468014,0.315425,0.185531,0.019228,0.005064,0.006737
...,...,...,...,...,...,...,...
759042,516340,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
759043,516341,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
759044,516342,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
759045,516343,0.921587,0.042635,0.005179,0.026327,0.003220,0.001053


In [11]:
# Collapse chunks back to messages: unweighted mean of each label score over
# all chunks that share the same message_index.
mp_res_df.groupby(by=["message_index"])[all_labels].mean()


Unnamed: 0_level_0,joy,anger,fear,sadness,love,surprise
message_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589
1,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589
2,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548
3,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548
4,0.517597,0.115636,0.269055,0.010137,0.002557,0.085018
...,...,...,...,...,...,...
516340,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
516341,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
516342,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213
516343,0.921587,0.042635,0.005179,0.026327,0.003220,0.001053


In [12]:
# Per-message emotion scores: mean over the chunks of each message.
message_scores = mp_res_df.groupby(by=["message_index"])[all_labels].mean()

# Attach the scores to the first N e-mails (joined on the frame index) and
# checkpoint the combined table to disk.
all_emails.iloc[0:N, :].join(message_scores).to_csv(
    "all_emails_scores.csv", index=True
)

# Reload from the checkpoint and re-apply the dtype normalisation used when
# the raw table was first read.
all_emails = (
    pd.read_csv("all_emails_scores.csv", index_col=0)
    .astype({"Date": "datetime64[ns]"})
    .fillna({"Subject": "", "Body_Quoted": ""})
)
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
joy                        float64
anger                      float64
fear                       float64
sadness                    float64
love                       float64
surprise                   float64
dtype: object

In [13]:
from model_helpers import vader_eval

# Score every e-mail ("Subject" + newline + "Body_Message") with vader_eval in
# a pool of worker processes; `mp_list` receives one result per message.
if __name__ == "__main__":
    cpus = cpu_count()
    # N = 100000
    N = all_emails.shape[0]
    emails_head = all_emails.iloc[0:N]
    # Use a scalar "\n" so the separator is broadcast to every row. Adding a
    # separately built Series aligns on the index and yields NaN whenever the
    # frame's index is not exactly 0..n-1.
    # A missing body is treated as empty text so the subject is still scored;
    # otherwise the whole concatenation would become NaN.
    texts = emails_head["Subject"] + "\n" + emails_head["Body_Message"].fillna("")
    # One argument tuple per message: (message index, text).
    arglist = list(zip(emails_head.index.to_list(), texts.to_list()))
    # Pool.map blocks until every task has finished and re-raises worker errors.
    with Pool(processes=cpus) as mp_pool:
        mp_list = mp_pool.map(vader_eval, arglist)
    # Release intermediates that are no longer needed.
    del mp_pool, texts, emails_head


In [14]:
# Keys read from every vader_eval result; "id" becomes the frame index.
vader_labels = ["neg", "neu", "pos", "compound", "id"]

# Pivot the list of per-message results into one list per key.
vader_coef_dict = {
    name: [result[name] for result in mp_list] for name in vader_labels
}

# One row per message, indexed by the message id so it can be joined back
# onto the e-mail table.
vader_coef = pd.DataFrame(vader_coef_dict).set_index("id")
vader_coef


Unnamed: 0_level_0,neg,neu,pos,compound
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.000,0.936,0.064,0.2144
1,0.000,0.936,0.064,0.2144
2,0.000,0.782,0.218,0.6908
3,0.000,0.782,0.218,0.6908
4,0.013,0.886,0.101,0.9954
...,...,...,...,...
516340,0.019,0.822,0.160,0.8446
516341,0.019,0.822,0.160,0.8446
516342,0.019,0.822,0.160,0.8446
516343,0.011,0.902,0.087,0.8910


In [15]:
# Attach the VADER scores to the first N e-mails (joined on the frame index).
# Any VADER columns left over from a previous run of this cell are dropped
# first, so re-running it does not fail on overlapping column names.
all_emails = (
    all_emails.iloc[0:N, :]
    .drop(columns=vader_coef.columns, errors="ignore")
    .join(vader_coef)
)
all_emails


Unnamed: 0,Top_Level_Folder,Mail_Folder,Message_File,From,To,Cc,Bcc,Date,Subject,Body_Message,...,joy,anger,fear,sadness,love,surprise,neg,neu,pos,compound
0,taylor-m,all_documents,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,...,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589,0.000,0.936,0.064,0.2144
1,taylor-m,sent,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,...,0.870809,0.066835,0.040838,0.016119,0.003810,0.001589,0.000,0.936,0.064,0.2144
2,taylor-m,sent,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,...,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548,0.000,0.782,0.218,0.6908
3,taylor-m,all_documents,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,...,0.955962,0.021567,0.013542,0.004886,0.002495,0.001548,0.000,0.782,0.218,0.6908
4,taylor-m,all_documents,3,['mark.taylor@enron.com'],['shari.stack@enron.com'],[],[],1998-10-30 15:02:00,Petrobras Swap,I think this has already been sent to you. Ju...,...,0.517597,0.115636,0.269055,0.010137,0.002557,0.085018,0.013,0.886,0.101,0.9954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516340,fischer-m,all_documents,428,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213,0.019,0.822,0.160,0.8446
516341,fischer-m,discussion_threads,339,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213,0.019,0.822,0.160,0.8446
516342,fischer-m,notes_inbox,2,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.995261,0.001834,0.000816,0.001099,0.000776,0.000213,0.019,0.822,0.160,0.8446
516343,fischer-m,all_documents,429,['kurt.anderson@enron.com'],['gverkleeren@zilkha.com'],"['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...","['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...",2002-07-12 11:36:00,Re: FW: RE: Revised Availability Numbers,"Gary, thank you very much for your feedback. I...",...,0.921587,0.042635,0.005179,0.026327,0.003220,0.001053,0.011,0.902,0.087,0.8910


In [16]:
# Export the metadata and sentiment scores without the free-text columns.
# drop() returns a new frame, so `all_emails` itself keeps its text columns.
text_columns = ["Subject", "Body_Message", "Body_Quoted"]
all_emails.drop(columns=text_columns).to_csv("sa_results.csv", index=True)
