In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocess import Pool, cpu_count
import torch
from transformers import pipeline


In [2]:
# Load the raw e-mail table and normalise the columns used later on:
# parse "Date" as a timestamp and turn missing subjects / quoted bodies
# into empty strings.
all_emails = pd.read_csv("all_emails.csv").assign(
    Date=lambda frame: frame["Date"].astype("datetime64[ns]"),
    Subject=lambda frame: frame["Subject"].fillna(""),
    Body_Quoted=lambda frame: frame["Body_Quoted"].fillna(""),
)
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
dtype: object

Time to process the first 10k subjects:

- GPU: 58.7 sec
- CPU: 2 min 45 sec

Model card: https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion

Background on the tensor-size (512) mismatch error: https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto


In [9]:
# Emotion classifier backed by a Hugging Face text-classification pipeline.
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"

# Run on the first CUDA device when one is available, otherwise on the CPU.
inference_device = "cuda:0" if torch.cuda.is_available() else "cpu"

classifier = pipeline(
    task="text-classification",
    model=model_name,
    top_k=None,  # return a score for every label, not only the best one
    device=inference_device,
)


In [4]:
from model_helpers import make_text_chunks

# Split every e-mail (subject + newline + body) into text chunks in parallel.
# The result, `mp_res_df`, has one row per chunk with the columns
# "message_index" (index of the source e-mail) and "text".
if __name__ == "__main__":
    cpus = cpu_count()
    # Number of e-mails to process; set to a small value (e.g. 1000) for a
    # quick trial run.
    N = all_emails.shape[0]

    subset = all_emails.iloc[0:N]
    # Concatenate with a scalar "\n" instead of a freshly built Series: a new
    # Series carries its own 0..n-1 index and would misalign (yielding NaN)
    # as soon as `all_emails` does not have a default RangeIndex.
    texts = (subset["Subject"] + "\n" + subset["Body_Message"]).to_list()

    # One task per e-mail: (tokenizer, text, 500, message index).
    # NOTE(review): 500 is presumably the maximum chunk length in tokens,
    # kept below the model's 512 limit - confirm in model_helpers.
    arglist = [
        (classifier.tokenizer, text, 500, message_index)
        for message_index, text in zip(subset.index.to_list(), texts)
    ]

    # Pool.map blocks until every task is done and returns the results in
    # input order, which is what map_async + wait + get did before.
    with Pool(processes=cpus) as mp_pool:
        mp_results = mp_pool.map(make_text_chunks, arglist)

    # Flatten the per-e-mail results into two parallel lists.
    mp_res_dict = {"message_index": [], "text": []}
    for r in mp_results:
        mp_res_dict["message_index"].extend(r["message_index"])
        mp_res_dict["text"].extend(r["text"])
    mp_res_df = pd.DataFrame(mp_res_dict)

    # Release the large intermediates; only `mp_res_df` is needed downstream.
    del mp_results, mp_pool, mp_res_dict, subset, texts


In [5]:
# Checkpoint the chunked texts (row index included) and display the frame.
mp_res_df.to_csv("mp_res_df.csv", index=True)
mp_res_df


Unnamed: 0,message_index,text
0,0,"[CLS] december 14, 2000 - bear stearns'predict..."
1,0,"buy "" rating on divine interventures ( dvin )...."
2,0,/ / www. multexpf. com? mktg = sgpftx4 & promo...
3,0,"efficient networks ( efnt ), and others ( repo..."
4,0,questions and offer insights every market day ...
...,...,...
759042,516340,[CLS] trade with john lavorato this is a trade...
759043,516341,[CLS] gas hedges some of my position is with t...
759044,516342,[CLS] re : confidential 2 - - - - - original m...
759045,516343,[CLS] calgary analyst / associate analyst rank...


In [6]:
# Score every text chunk with the emotion classifier. Each element of
# `prediction` is a list of {"label", "score"} dicts, one per emotion label.
# truncation=True caps each input at the tokenizer's maximum length, so a
# chunk that re-tokenizes longer than the model limit is cut short instead of
# aborting the whole run with a tensor-size mismatch.
prediction = classifier(mp_res_df["text"].to_list(), truncation=True)
prediction[0]


[{'label': 'joy', 'score': 0.9830018877983093},
 {'label': 'anger', 'score': 0.008043870329856873},
 {'label': 'sadness', 'score': 0.0035598266404122114},
 {'label': 'fear', 'score': 0.0026256139390170574},
 {'label': 'love', 'score': 0.0022013778798282146},
 {'label': 'surprise', 'score': 0.0005673774285241961}]

In [7]:
# Turn the per-chunk predictions into one score column per emotion label.
n_pred = len(prediction)
if n_pred != len(mp_res_df):
    raise ValueError(
        f"got {n_pred} predictions for {len(mp_res_df)} text chunks"
    )

# The first prediction fixes the label set and therefore the column order.
all_labels = [entry["label"] for entry in prediction[0]]

# Collect the scores label by label; the label order inside a prediction is
# not relied upon, each score is routed by its own "label" value.
scores = {label: [] for label in all_labels}
for chunk_scores in prediction:
    for entry in chunk_scores:
        scores[entry["label"]].append(entry["score"])

# Build a new frame instead of mutating in place: errors="ignore" plus
# assign() keeps this cell re-runnable after "text" has already been dropped.
mp_res_df = mp_res_df.drop(columns=["text"], errors="ignore").assign(**scores)


In [8]:
# Display the chunk-level frame: one row per text chunk, keyed by
# "message_index", with one score column per emotion label.
mp_res_df


Unnamed: 0,message_index,joy,anger,sadness,fear,love,surprise
0,0,0.983002,0.008044,0.003560,0.002626,0.002201,0.000567
1,0,0.983461,0.008005,0.003772,0.002426,0.001648,0.000688
2,0,0.996039,0.001089,0.000678,0.001394,0.000485,0.000315
3,0,0.990148,0.004215,0.002064,0.002032,0.001129,0.000413
4,0,0.887971,0.041980,0.025462,0.037362,0.005110,0.002116
...,...,...,...,...,...,...,...
759042,516340,0.777546,0.142660,0.014953,0.056025,0.004292,0.004523
759043,516341,0.392532,0.524104,0.024239,0.051717,0.005088,0.002320
759044,516342,0.316997,0.428329,0.030341,0.207373,0.012474,0.004486
759045,516343,0.304190,0.279470,0.092436,0.255315,0.039902,0.028687


In [9]:
# Average the chunk scores of each e-mail to get one row per message.
mp_res_df.groupby("message_index")[all_labels].mean()


Unnamed: 0_level_0,joy,anger,sadness,fear,love,surprise
message_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.972335,0.010994,0.006204,0.007841,0.001872,0.000753
1,0.277305,0.338040,0.292923,0.087476,0.002211,0.002046
2,0.949322,0.029321,0.006173,0.012011,0.001672,0.001502
3,0.949314,0.029328,0.006172,0.012012,0.001673,0.001502
4,0.017925,0.134444,0.091976,0.747981,0.002418,0.005255
...,...,...,...,...,...,...
516340,0.777546,0.142660,0.014953,0.056025,0.004292,0.004523
516341,0.392532,0.524104,0.024239,0.051717,0.005088,0.002320
516342,0.316997,0.428329,0.030341,0.207373,0.012474,0.004486
516343,0.304190,0.279470,0.092436,0.255315,0.039902,0.028687


In [10]:
# Average the chunk scores per e-mail once, attach them to the e-mail table
# by index and checkpoint the result to disk.
emotion_means = mp_res_df.groupby(by=["message_index"])[all_labels].mean()

# Drop emotion columns left over from a previous run first: DataFrame.join
# raises on overlapping column names, which made this cell fail on re-run
# once `all_emails` had been reloaded with the scores.
(
    all_emails.iloc[0:N, :]
    .drop(columns=all_labels, errors="ignore")
    .join(emotion_means)
    .to_csv("all_emails_scores.csv", index=True)
)

# Reload from the checkpoint; CSV does not keep dtypes, so the date parsing
# and the empty-string defaults have to be applied again.
all_emails = pd.read_csv("all_emails_scores.csv", index_col=0)
all_emails["Date"] = all_emails["Date"].astype("datetime64[ns]")
all_emails["Subject"] = all_emails["Subject"].fillna("")
all_emails["Body_Quoted"] = all_emails["Body_Quoted"].fillna("")
all_emails.dtypes


In [56]:
from model_helpers import vader_eval

# Run the VADER scorer over every e-mail (subject + newline + body) in
# parallel. `mp_list` ends up holding one result per e-mail, in input order.
if __name__ == "__main__":
    cpus = cpu_count()
    # Number of e-mails to process; set to a smaller value (e.g. 100000) for
    # a quick trial run.
    N = all_emails.shape[0]

    subset = all_emails.iloc[0:N]
    # Concatenate with a scalar "\n" instead of a freshly built Series: a new
    # Series carries its own 0..n-1 index and would misalign (yielding NaN)
    # as soon as `all_emails` does not have a default RangeIndex.
    texts = (subset["Subject"] + "\n" + subset["Body_Message"]).to_list()

    # One task per e-mail: (message index, text).
    arglist = list(zip(subset.index.to_list(), texts))

    # Pool.map blocks until every task is done and returns a list in input
    # order, which is what map_async + wait + get did before.
    with Pool(processes=cpus) as mp_pool:
        mp_list = mp_pool.map(vader_eval, arglist)

    # Release the intermediates; only `mp_list` is needed downstream.
    del mp_pool, subset, texts


In [57]:
# Collect the VADER outputs into a frame indexed by e-mail id, with one
# column per VADER score.
vader_labels = ["neg", "neu", "pos", "compound", "id"]
vader_coef_dict = {
    field: [result[field] for result in mp_list] for field in vader_labels
}
vader_coef = pd.DataFrame(vader_coef_dict).set_index("id")
vader_coef


Unnamed: 0_level_0,neg,neu,pos,compound
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.008,0.830,0.162,0.9995
1,0.036,0.909,0.056,0.9927
2,0.015,0.883,0.102,0.9974
3,0.015,0.883,0.102,0.9974
4,0.000,1.000,0.000,0.0000
...,...,...,...,...
516340,0.000,0.955,0.045,0.3182
516341,0.000,1.000,0.000,0.0000
516342,0.000,1.000,0.000,0.0000
516343,0.000,1.000,0.000,0.0000


In [59]:
# Attach the VADER scores to the e-mail table by index.
# VADER columns left over from a previous run are dropped first:
# DataFrame.join raises on overlapping column names, so without this the
# cell could only be executed once per kernel session.
all_emails = (
    all_emails.iloc[0:N, :]
    .drop(columns=vader_coef.columns, errors="ignore")
    .join(vader_coef)
)
all_emails


Unnamed: 0,Top_Level_Folder,Mail_Folder,Message_File,From,To,Cc,Bcc,Date,Subject,Body_Message,...,joy,anger,sadness,fear,love,surprise,neg,neu,pos,compound
0,allen-p,all_documents,1,['1.11913372.-2@multexinvestornetwork.com'],['pallen@enron.com'],[],[],2000-12-14 02:41:00,"December 14, 2000 - Bear Stearns' predictions ...",In today's Daily Update you'll find free repor...,...,0.972335,0.010994,0.006204,0.007841,0.001872,0.000753,0.008,0.830,0.162,0.9995
1,allen-p,all_documents,10,['messenger@ecm.bloomberg.com'],[],[],[],2000-12-13 16:35:00,Bloomberg Power Lines Report,Here is today's copy of Bloomberg Power Lines....,...,0.277305,0.338040,0.292923,0.087476,0.002211,0.002046,0.036,0.909,0.056,0.9927
2,allen-p,all_documents,100,['phillip.allen@enron.com'],['keith.holst@enron.com'],[],[],2000-10-09 14:16:00,Consolidated positions: Issues & To Do list,---------------------- Forwarded by Phillip K ...,...,0.949322,0.029321,0.006173,0.012011,0.001672,0.001502,0.015,0.883,0.102,0.9974
3,allen-p,all_documents,101,['phillip.allen@enron.com'],['keith.holst@enron.com'],[],[],2000-10-09 14:00:00,Consolidated positions: Issues & To Do list,---------------------- Forwarded by Phillip K ...,...,0.949314,0.029328,0.006172,0.012012,0.001673,0.001502,0.015,0.883,0.102,0.9974
4,allen-p,all_documents,102,['phillip.allen@enron.com'],['david.delainey@enron.com'],[],[],2000-10-05 13:26:00,,"Dave, \n\n Here are the names of the west desk...",...,0.017925,0.134444,0.091976,0.747981,0.002418,0.005255,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516340,zufferli-j,sent_items,95,['john.zufferli@enron.com'],['kori.loibl@enron.com'],[],[],2001-11-28 21:30:11,Trade with John Lavorato,This is a trade with OIL-SPEC-HEDGE-NG (John L...,...,0.777546,0.142660,0.014953,0.056025,0.004292,0.004523,0.000,0.955,0.045,0.3182
516341,zufferli-j,sent_items,96,['john.zufferli@enron.com'],['john.lavorato@enron.com'],[],[],2001-11-28 20:47:48,Gas Hedges,Some of my position is with the Alberta Term b...,...,0.392532,0.524104,0.024239,0.051717,0.005088,0.002320,0.000,1.000,0.000,0.0000
516342,zufferli-j,sent_items,97,['john.zufferli@enron.com'],['dawn.doucet@enron.com'],[],[],2001-11-28 15:20:00,RE: CONFIDENTIAL,2\n\n -----Original Message-----,...,0.316997,0.428329,0.030341,0.207373,0.012474,0.004486,0.000,1.000,0.000,0.0000
516343,zufferli-j,sent_items,98,['john.zufferli@enron.com'],['jeanie.slone@enron.com'],[],[],2001-11-27 19:52:45,Calgary Analyst/Associate,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,...,0.304190,0.279470,0.092436,0.255315,0.039902,0.028687,0.000,1.000,0.000,0.0000


In [63]:
# Export the results without the free-text columns; the index is written as
# the first CSV column. `all_emails` itself is left untouched.
text_columns = ["Subject", "Body_Message", "Body_Quoted"]
all_emails.drop(columns=text_columns).to_csv("sa_results.csv", index=True)
