In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocess import Pool, cpu_count
import torch
from transformers import pipeline


In [2]:
# Load the e-mail table (first CSV column is the row index), then parse the
# Date column as timestamps and replace missing Subject / Body_Quoted values
# with empty strings.
all_emails = pd.read_csv("all_emails.csv", index_col=0).assign(
    Date=lambda frame: frame["Date"].astype("datetime64[ns]"),
    Subject=lambda frame: frame["Subject"].fillna(""),
    Body_Quoted=lambda frame: frame["Body_Quoted"].fillna(""),
)
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
dtype: object

In [3]:
# Preview the loaded e-mail table.
all_emails


Unnamed: 0,Top_Level_Folder,Mail_Folder,Message_File,From,To,Cc,Bcc,Date,Subject,Body_Message,Body_Quoted
0,taylor-m,all_documents,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,
1,taylor-m,sent,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,
2,taylor-m,sent,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,
3,taylor-m,all_documents,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,
4,taylor-m,all_documents,3,['mark.taylor@enron.com'],['shari.stack@enron.com'],[],[],1998-10-30 15:02:00,Petrobras Swap,I think this has already been sent to you. Ju...,
...,...,...,...,...,...,...,...,...,...,...,...
516340,fischer-m,all_documents,428,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516341,fischer-m,discussion_threads,339,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516342,fischer-m,notes_inbox,2,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,
516343,fischer-m,all_documents,429,['kurt.anderson@enron.com'],['gverkleeren@zilkha.com'],"['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...","['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...",2002-07-12 11:36:00,Re: FW: RE: Revised Availability Numbers,"Gary, thank you very much for your feedback. I...","> From: Gary Verkleeren\n> Sent: Friday, July..."


Time for first 10k subjects:

- GPU: 58.7 sec
- CPU: 2 min 45 sec

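Model card for an alternative emotion classifier: https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion

Stack Overflow thread on the "size of tensor a (707) must match the size of tensor b (512)" error: https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto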


In [4]:
# Checkpoint used for emotion classification. Alternatives that can be swapped in:
#   "bhadresh-savani/distilbert-base-uncased-emotion"
#   "j-hartmann/emotion-english-roberta-large"
model_name = "j-hartmann/emotion-english-distilroberta-base"

# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# top_k=None: the pipeline reports a score for every label, not just the best one.
classifier = pipeline("text-classification", model=model_name, top_k=None, device=device)


In [18]:
# Split every e-mail (subject + "\n" + body) into chunks with make_text_chunks.
# The result is cached in mp_res_df.csv and only recomputed when that file is
# missing, so re-running the notebook skips the multiprocessing step.
if not os.path.exists("mp_res_df.csv"):
    from model_helpers import make_text_chunks

    if __name__ == "__main__":
        cpus = cpu_count()
        N = all_emails.shape[0]  # lower this (e.g. 1000) for a quick trial run
        # Third element of each task tuple; presumably the maximum number of
        # tokens per chunk -- confirm against model_helpers.make_text_chunks.
        CHUNK_SIZE = 500
        # Join subject and body with a scalar "\n". A helper Series of newlines
        # would carry its own 0..n-1 index and be aligned on labels, silently
        # producing NaN text whenever all_emails is indexed differently.
        full_text = (
            all_emails["Subject"] + "\n" + all_emails["Body_Message"]
        ).to_list()[0:N]
        # One task tuple per e-mail: (tokenizer, text, chunk size, index label).
        arglist = list(
            zip(
                [classifier.tokenizer] * N,
                full_text,
                [CHUNK_SIZE] * N,
                all_emails.index.to_list()[0:N],
            )
        )
        # Pool.map blocks until every task has finished and returns the
        # results in task order.
        with Pool(processes=cpus) as mp_pool:
            mp_results = mp_pool.map(make_text_chunks, arglist)
        # Each result holds parallel "message_index" / "text" lists; flatten
        # them into two long columns.
        mp_res_dict = {"message_index": [], "text": []}
        for r in mp_results:
            mp_res_dict["message_index"].extend(r["message_index"])
            mp_res_dict["text"].extend(r["text"])
        del mp_results
        del mp_pool
        mp_res_df = pd.DataFrame(mp_res_dict)
        del mp_res_dict
        # Persist right where the frame is built, so the write can never run
        # without mp_res_df being defined.
        mp_res_df.to_csv("mp_res_df.csv", index=True)

# Always reload from the cache so fresh and cached runs see the same frame.
mp_res_df = pd.read_csv("mp_res_df.csv", index_col=0)
# read_csv parses empty fields as NaN; restore them to empty strings.
mp_res_df["text"] = mp_res_df["text"].fillna("")
mp_res_df


Unnamed: 0,message_index,text
0,0,<s>Re: Friday\nWish we could go - but we're of...
1,1,<s>Re: Friday\nWish we could go - but we're of...
2,2,<s>Re: Friday\nHey Marc - any chance you guys ...
3,3,<s>Re: Friday\nHey Marc - any chance you guys ...
4,4,<s>Petrobras Swap\nI think this has already be...
...,...,...
821032,516340,<s>URGENT!!! CUTOVER WEEKEND\nIt is vital that...
821033,516341,<s>URGENT!!! CUTOVER WEEKEND\nIt is vital that...
821034,516342,<s>URGENT!!! CUTOVER WEEKEND\nIt is vital that...
821035,516343,<s>Re: FW: RE: Revised Availability Numbers\nG...


In [22]:
# Run the classifier on each chunk individually, keeping the raw pipeline
# output of every call in chunk order.
prediction = [
    classifier(chunk_text) for chunk_text in tqdm(mp_res_df["text"].to_list())
]
prediction[0]


  0%|          | 0/821037 [00:00<?, ?it/s]



[[{'label': 'sadness', 'score': 0.7139124274253845},
  {'label': 'neutral', 'score': 0.12567007541656494},
  {'label': 'surprise', 'score': 0.09949631243944168},
  {'label': 'fear', 'score': 0.03212163597345352},
  {'label': 'joy', 'score': 0.022816859185695648},
  {'label': 'anger', 'score': 0.003631572239100933},
  {'label': 'disgust', 'score': 0.0023511152248829603}]]

In [27]:
# Label/score dicts for the second chunk; [0] unwraps the outer one-element list
# that each single-text classifier call returns.
prediction[1][0]

[{'label': 'sadness', 'score': 0.7139124274253845},
 {'label': 'neutral', 'score': 0.12567007541656494},
 {'label': 'surprise', 'score': 0.09949631243944168},
 {'label': 'fear', 'score': 0.03212163597345352},
 {'label': 'joy', 'score': 0.022816859185695648},
 {'label': 'anger', 'score': 0.003631572239100933},
 {'label': 'disgust', 'score': 0.0023511152248829603}]

In [28]:
n_pred = len(prediction)
# Column order follows the label order of the very first prediction.
all_labels = [entry["label"] for entry in prediction[0][0]]
scores = {name: [] for name in all_labels}
# Each prediction wraps its per-label dicts in a one-element list. Scores are
# filed under their label, so the columns line up no matter in which order a
# given prediction lists its labels.
for chunk_pred in prediction[:n_pred]:
    for entry in chunk_pred[0]:
        scores[entry["label"]].append(entry["score"])
scores_df = pd.DataFrame(scores)
scores_df


Unnamed: 0,sadness,neutral,surprise,fear,joy,anger,disgust
0,0.713912,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351
1,0.713912,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351
2,0.013708,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836
3,0.013708,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836
4,0.031141,0.592839,0.283273,0.048301,0.013918,0.024611,0.005916
...,...,...,...,...,...,...,...
821032,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
821033,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
821034,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
821035,0.219129,0.622659,0.026248,0.068520,0.023513,0.030111,0.009819


In [29]:
# Average the per-chunk emotion scores into one row per e-mail.
# The text column is dropped inside the chain rather than with inplace=True,
# so a failure in the join or groupby no longer leaves mp_res_df stripped of
# its text column and the cell can simply be retried.
mp_res_df = (
    mp_res_df.iloc[0:n_pred, :]
    .drop(columns=["text"])
    # join aligns on the index: mp_res_df's index must match the row numbers
    # of scores_df for chunk i to receive score row i.
    .join(scores_df)
    .groupby(by=["message_index"])
    .mean()
)
mp_res_df


Unnamed: 0_level_0,sadness,neutral,surprise,fear,joy,anger,disgust
message_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.713912,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351
1,0.713912,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351
2,0.013708,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836
3,0.013708,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836
4,0.038192,0.757222,0.108221,0.041640,0.012647,0.028981,0.013096
...,...,...,...,...,...,...,...
516340,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
516341,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
516342,0.275086,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900
516343,0.219129,0.622659,0.026248,0.068520,0.023513,0.030111,0.009819


In [30]:
# Attach the per-e-mail emotion scores (joined on the index) and persist the
# combined table.
scored_csv = "all_emails_scores.csv"
all_emails.join(mp_res_df).to_csv(scored_csv, index=True)

# Reload from disk, restoring the timestamp dtype and the empty-string
# defaults that do not survive the CSV round trip.
all_emails = pd.read_csv(scored_csv, index_col=0).assign(
    Date=lambda frame: frame["Date"].astype("datetime64[ns]"),
    Subject=lambda frame: frame["Subject"].fillna(""),
    Body_Quoted=lambda frame: frame["Body_Quoted"].fillna(""),
)
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
sadness                    float64
neutral                    float64
surprise                   float64
fear                       float64
joy                        float64
anger                      float64
disgust                    float64
dtype: object

In [31]:
from model_helpers import vader_eval

# Score every e-mail (subject + "\n" + body) with vader_eval, using one worker
# process per CPU reported by cpu_count().
if __name__ == "__main__":
    cpus = cpu_count()
    N = all_emails.shape[0]  # lower this (e.g. 100000) for a quick trial run
    # Join subject and body with a scalar "\n". A helper Series of newlines
    # would carry its own 0..n-1 index and be aligned on labels, silently
    # producing NaN text whenever all_emails is indexed differently.
    full_text = all_emails["Subject"] + "\n" + all_emails["Body_Message"]
    # One task per e-mail: (index label, text).
    arglist = list(
        zip(
            all_emails.index.to_list()[0:N],
            full_text.to_list()[0:N],
        )
    )
    # Pool.map blocks until all tasks are done and returns a list in task order.
    with Pool(processes=cpus) as mp_pool:
        mp_list = mp_pool.map(vader_eval, arglist)
    del mp_pool


In [32]:
# Fields read from every vader_eval result; "id" becomes the index so the
# frame can later be joined back onto the e-mails.
vader_labels = ["neg", "neu", "pos", "compound", "id"]

# Pivot the list of per-e-mail results into one column per field.
vader_coef_dict = {
    field: [result[field] for result in mp_list] for field in vader_labels
}
vader_coef = pd.DataFrame(vader_coef_dict).set_index("id")
vader_coef


Unnamed: 0_level_0,neg,neu,pos,compound
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.000,0.936,0.064,0.2144
1,0.000,0.936,0.064,0.2144
2,0.000,0.782,0.218,0.6908
3,0.000,0.782,0.218,0.6908
4,0.013,0.886,0.101,0.9954
...,...,...,...,...
516340,0.019,0.822,0.160,0.8446
516341,0.019,0.822,0.160,0.8446
516342,0.019,0.822,0.160,0.8446
516343,0.011,0.902,0.087,0.8910


In [33]:
# Keep the first N e-mails (the ones that were scored) and attach the
# vader_coef columns by index label.
all_emails = all_emails.head(N).join(vader_coef)
all_emails


Unnamed: 0,Top_Level_Folder,Mail_Folder,Message_File,From,To,Cc,Bcc,Date,Subject,Body_Message,...,neutral,surprise,fear,joy,anger,disgust,neg,neu,pos,compound
0,taylor-m,all_documents,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,...,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351,0.000,0.936,0.064,0.2144
1,taylor-m,sent,1,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:43:00,Re: Friday,Wish we could go - but we're off to Ft. Lauder...,...,0.125670,0.099496,0.032122,0.022817,0.003632,0.002351,0.000,0.936,0.064,0.2144
2,taylor-m,sent,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,...,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836,0.000,0.782,0.218,0.6908
3,taylor-m,all_documents,2,['mark.taylor@enron.com'],['marc.r.cutler@bankamerica.com'],[],[],1998-10-30 14:56:00,Re: Friday,Hey Marc - any chance you guys might like to j...,...,0.771838,0.092119,0.014273,0.097815,0.008413,0.001836,0.000,0.782,0.218,0.6908
4,taylor-m,all_documents,3,['mark.taylor@enron.com'],['shari.stack@enron.com'],[],[],1998-10-30 15:02:00,Petrobras Swap,I think this has already been sent to you. Ju...,...,0.757222,0.108221,0.041640,0.012647,0.028981,0.013096,0.013,0.886,0.101,0.9954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516340,fischer-m,all_documents,428,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900,0.019,0.822,0.160,0.8446
516341,fischer-m,discussion_threads,339,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900,0.019,0.822,0.160,0.8446
516342,fischer-m,notes_inbox,2,['denise.williams@enron.com'],['ge_benefits@enron.com'],[],[],2002-07-12 11:13:00,URGENT!!! CUTOVER WEEKEND,It is vital that you leave your computer equip...,...,0.471780,0.058879,0.088386,0.017012,0.066956,0.021900,0.019,0.822,0.160,0.8446
516343,fischer-m,all_documents,429,['kurt.anderson@enron.com'],['gverkleeren@zilkha.com'],"['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...","['rwinsor@zilkha.com', 'jquick@zilkha.com', 'm...",2002-07-12 11:36:00,Re: FW: RE: Revised Availability Numbers,"Gary, thank you very much for your feedback. I...",...,0.622659,0.026248,0.068520,0.023513,0.030111,0.009819,0.011,0.902,0.087,0.8910


In [34]:
# Export the scores without the free-text columns; all_emails itself is left
# unchanged because drop returns a new frame.
text_columns = ["Subject", "Body_Message", "Body_Quoted"]
all_emails.drop(columns=text_columns).to_csv("sa_results.csv", index=True)
