In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocess import Pool, cpu_count
import torch
from transformers import pipeline


In [2]:
# Load the e-mail table and normalise the columns used later on:
# "Date" becomes a nanosecond-resolution datetime, and missing values in
# "Subject" / "Body_Quoted" are replaced with empty strings.
all_emails = pd.read_csv("all_emails.csv").assign(
    Date=lambda frame: frame["Date"].astype("datetime64[ns]"),
    Subject=lambda frame: frame["Subject"].fillna(""),
    Body_Quoted=lambda frame: frame["Body_Quoted"].fillna(""),
)
# Show the resulting column dtypes as the cell output.
all_emails.dtypes


Top_Level_Folder            object
Mail_Folder                 object
Message_File                 int64
From                        object
To                          object
Cc                          object
Bcc                         object
Date                datetime64[ns]
Subject                     object
Body_Message                object
Body_Quoted                 object
dtype: object

Time for first 10k subjects:

- GPU: 58.7 sec
- CPU: 2 min 45 sec

Model card: https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion

Background on the tensor-size mismatch error (707 vs. 512) referenced in this question: https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto


In [3]:
# Hugging Face checkpoint used for emotion classification.
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"

# Build the text-classification pipeline. top_k=None makes it return a score
# for every label rather than only the best one; the first CUDA device is used
# when available, otherwise the CPU.
classifier = pipeline(
    task="text-classification",
    model=model_name,
    top_k=None,
    device="cpu" if not torch.cuda.is_available() else "cuda:0",
)


In [4]:
from model_helpers import make_text_chunks

# Split every e-mail (subject + newline + body) into chunks using a pool of
# worker processes, then collect the chunks into `mp_res_df` with one row per
# chunk and the columns "message_index" and "text".
if __name__ == "__main__":
    cpus = cpu_count()
    N = all_emails.shape[0]
    # N = 1000
    # Only "Subject" and "Body_Quoted" are NaN-filled when the data is loaded,
    # so "Body_Message" is filled here: str + NaN would otherwise yield NaN and
    # a float would be handed to make_text_chunks. A scalar "\n" is used so the
    # concatenation does not depend on all_emails having a default RangeIndex.
    full_text = (
        all_emails["Subject"].fillna("")
        + "\n"
        + all_emails["Body_Message"].fillna("")
    )
    # One argument tuple per e-mail: (tokenizer, text, 500, index label).
    # NOTE(review): 500 is presumably the chunk size in tokens, kept below the
    # model's 512-token limit -- confirm against model_helpers.make_text_chunks.
    arglist = list(
        zip(
            [classifier.tokenizer] * N,
            full_text.to_list()[0:N],
            [500] * N,
            all_emails.index.to_list()[0:N],
        )
    )
    del full_text
    # Pool.map blocks until every task has finished and re-raises any worker
    # exception, which is what map_async(...).wait() followed by .get() did.
    with Pool(processes=cpus) as mp_pool:
        chunk_results = mp_pool.map(make_text_chunks, arglist)
    del mp_pool
    # Each result is a dict holding parallel lists under "message_index" and
    # "text"; flatten them in task order into the two DataFrame columns.
    mp_res_df = pd.DataFrame(
        {
            "message_index": [
                idx for res in chunk_results for idx in res["message_index"]
            ],
            "text": [txt for res in chunk_results for txt in res["text"]],
        }
    )
    del chunk_results


In [5]:
# Persist the chunk table (index included) to disk, then display it.
mp_res_df.to_csv(path_or_buf="mp_res_df.csv")
mp_res_df


Unnamed: 0,message_index,text
0,0,"[CLS] december 14, 2000 - bear stearns'predict..."
1,0,"buy "" rating on divine interventures ( dvin )...."
2,0,/ / www. multexpf. com? mktg = sgpftx4 & promo...
3,0,"efficient networks ( efnt ), and others ( repo..."
4,0,questions and offer insights every market day ...
...,...,...
1394,995,"[CLS] mark, attached is a spreadsheet that lis..."
1395,996,"[CLS] tara, please make the following changes ..."
1396,997,"[CLS] mark, the following is a guest password ..."
1397,998,[CLS] new generation - - - - - - - - - - - - -...


In [6]:
# Classify every text chunk. The chunks are plain text that the pipeline
# tokenizes again, so nothing guarantees they stay within the model's maximum
# sequence length; truncation=True caps over-long inputs instead of failing
# with a tensor-size mismatch. Inputs already within the limit are unaffected.
prediction = classifier(mp_res_df["text"].to_list(), truncation=True)
# Show the per-label scores of the first chunk as a sanity check.
prediction[0]


[{'label': 'joy', 'score': 0.9830018877983093},
 {'label': 'anger', 'score': 0.008043870329856873},
 {'label': 'sadness', 'score': 0.0035598266404122114},
 {'label': 'fear', 'score': 0.0026256139390170574},
 {'label': 'love', 'score': 0.0022013778798282146},
 {'label': 'surprise', 'score': 0.0005673774285241961}]

In [7]:
# Turn the classifier output into one score column per emotion label on
# `mp_res_df`, replacing the raw "text" column.
n_pred = len(prediction)
# Each prediction must correspond to exactly one chunk row; fail early with a
# clear message rather than on the column assignment.
if n_pred != len(mp_res_df):
    raise ValueError(
        f"Got {n_pred} predictions for {len(mp_res_df)} text chunks"
    )

# Label order is taken from the first prediction; an empty prediction list
# simply yields no label columns instead of raising IndexError.
all_labels = [entry["label"] for entry in prediction[0]] if prediction else []
scores = {k: [] for k in all_labels}
# Scores are keyed by label, so the per-prediction ordering does not matter.
for p in prediction:
    for lsc in p:
        scores[lsc["label"]].append(lsc["score"])

# Rebind instead of mutating in place, and tolerate an already-dropped "text"
# column, so re-running this cell does not raise KeyError.
mp_res_df = mp_res_df.drop(columns=["text"], errors="ignore").assign(**scores)


In [8]:
# Display the chunk-level frame with its per-label score columns.
mp_res_df


Unnamed: 0,message_index,joy,anger,sadness,fear,love,surprise
0,0,0.983002,0.008044,0.003560,0.002626,0.002201,0.000567
1,0,0.983461,0.008005,0.003772,0.002426,0.001648,0.000688
2,0,0.996039,0.001089,0.000678,0.001394,0.000485,0.000315
3,0,0.990148,0.004215,0.002064,0.002032,0.001129,0.000413
4,0,0.887971,0.041980,0.025462,0.037362,0.005110,0.002116
...,...,...,...,...,...,...,...
1394,995,0.841660,0.052000,0.012787,0.087633,0.002860,0.003060
1395,996,0.921916,0.045038,0.018032,0.003233,0.010099,0.001681
1396,997,0.976938,0.007531,0.003329,0.009777,0.001297,0.001127
1397,998,0.059810,0.267258,0.619821,0.044814,0.005690,0.002607


In [9]:
# Collapse chunk-level scores to one row per e-mail by averaging every label
# column over the chunks that share a message_index.
mp_res_df.groupby("message_index")[all_labels].mean()


Unnamed: 0_level_0,joy,anger,sadness,fear,love,surprise
message_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.972335,0.010994,0.006204,0.007841,0.001872,0.000753
1,0.277305,0.338040,0.292923,0.087476,0.002211,0.002046
2,0.949322,0.029321,0.006173,0.012011,0.001672,0.001502
3,0.949314,0.029328,0.006172,0.012012,0.001673,0.001502
4,0.017925,0.134444,0.091976,0.747981,0.002418,0.005255
...,...,...,...,...,...,...
995,0.841660,0.052000,0.012787,0.087633,0.002860,0.003060
996,0.921916,0.045038,0.018032,0.003233,0.010099,0.001681
997,0.976938,0.007531,0.003329,0.009777,0.001297,0.001127
998,0.059810,0.267258,0.619821,0.044814,0.005690,0.002607


In [10]:
# Attach the per-message mean scores to the first N e-mails (joined on the
# frame index) and write the combined table, index included, to CSV.
(
    all_emails.iloc[:N]
    .join(mp_res_df.groupby("message_index")[all_labels].mean())
    .to_csv("all_emails_scores.csv", index=True)
)
