In [1]:
import ssl
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download workaround: when this Python build exposes
# ssl._create_unverified_context, install it as the default HTTPS context
# factory before fetching the NLTK data below.
# NOTE(review): this replaces the default HTTPS context for the whole kernel
# with an unverified one, not only for the NLTK downloads - confirm that this
# is acceptable in the environment the notebook runs in.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources used by the preprocessing step: the tokenizer models,
# the stop-word lists and the WordNet data for the lemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the parsed e-mail table produced by an earlier step of the project.
df = pd.read_csv("./data/parsed/parsed_emails.csv")

[nltk_data] Downloading package punkt to /Users/ayushgala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayushgala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayushgala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Show the DataFrame that was just read from the parsed e-mails CSV.
df

Unnamed: 0,file,message,label,sender_email,subject,num_receivers,email_length,email_domain,sent_time,is_forwarded
0,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,1.0,phillip.allen@enron.com,,1,16,enron.com,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",0
1,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,1.0,phillip.allen@enron.com,Re: Hello,1,3,enron.com,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",0
2,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...,1.0,phillip.allen@enron.com,Re: Hello,1,6,enron.com,"Thu, 31 Aug 2000 04:17:00 -0700 (PDT)",0
3,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,1.0,phillip.allen@enron.com,,2,24,enron.com,"Tue, 22 Aug 2000 07:44:00 -0700 (PDT)",0
4,allen-p/_sent_mail/1004.,Message-ID: <17189699.1075863688308.JavaMail.e...,1.0,phillip.allen@enron.com,Re: PRC review - phone calls,1,2,enron.com,"Fri, 14 Jul 2000 06:59:00 -0700 (PDT)",0
...,...,...,...,...,...,...,...,...,...,...
5023,arnold-j/deleted_items/163.,Message-ID: <11958623.1075852694098.JavaMail.e...,1.0,karen.buckley@enron.com,Telephone Interviews: Trading Track,16,69,enron.com,"Wed, 10 Oct 2001 12:53:27 -0700 (PDT)",0
5024,arnold-j/deleted_items/164.,Message-ID: <31095424.1075852694121.JavaMail.e...,1.0,a..shankman@enron.com,RE:,1,69,enron.com,"Wed, 10 Oct 2001 13:13:05 -0700 (PDT)",0
5025,arnold-j/deleted_items/165.,Message-ID: <10513011.1075852694149.JavaMail.e...,1.0,karen.buckley@enron.com,ENA Trading Track - Interviews October,33,45,enron.com,"Wed, 10 Oct 2001 12:17:23 -0700 (PDT)",0
5026,arnold-j/deleted_items/166.,Message-ID: <28097519.1075852694172.JavaMail.e...,0.0,margaret.allen@enron.com,THANKS!,1,25,enron.com,"Wed, 10 Oct 2001 09:00:39 -0700 (PDT)",0


In [4]:
# Stop-word set and lemmatizer shared by every preprocess_text call. They are
# built lazily on first use, so defining the function never touches the NLTK
# corpora and the stop-word list is read once instead of once per e-mail.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Turn raw text into a list of lower-cased, lemmatized content tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that are English stop words are dropped, and
    the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The text to process. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV cell) is treated as having no
        content.

    Returns
    -------
    list of str
        The processed tokens in their original order; an empty list when
        ``text`` is not a string or no token survives the filtering.

    Notes
    -----
    NOTE(review): the sample output recorded in this notebook shows tokens
    such as 'date' and 'subject' at the start of every row, so the input
    apparently still contains the e-mail headers - confirm whether they
    should be stripped before tokenizing.
    """
    # A missing message must not abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _get_text_resources()

    # Filter first, then lemmatize, exactly as the two-step original did.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Run preprocess_text over the raw text in the 'message' column and keep the
# resulting token list of each e-mail in a new 'processed_content' column.
message_tokens = df['message'].apply(preprocess_text)
df['processed_content'] = message_tokens

# Topic Modeling using LDA

In [5]:
# !pip3 install gensim
# !pip3 install scikit-learn

In [6]:
import gensim
from gensim import corpora
from gensim.models import LdaMulticore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
# Build the token <-> integer-id vocabulary from the processed e-mails and
# encode every e-mail as a bag-of-words vector over that vocabulary.
dictionary = corpora.Dictionary(df['processed_content'])
corpus = [dictionary.doc2bow(text) for text in df['processed_content']]

# Train LDA model
num_topics = 10  # Adjust as needed

# LDA training is randomised. Without a fixed seed the topic ids, and with
# them the 'dominant_topic' values derived below, change on every
# Restart & Run All, so any saved results stop matching the notebook.
# NOTE(review): LdaMulticore trains with several worker processes; the seed
# removes the random initialisation as a source of variation, but confirm
# whether multi-worker scheduling still causes small run-to-run differences.
lda_random_state = 42

lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=lda_random_state,
)

# Get topic distributions for each email
topic_distributions = [lda_model.get_document_topics(doc) for doc in corpus]

In [8]:
def get_dominant_topic(topic_dist):
    """Pick the leading topic out of one document's topic distribution.

    Parameters
    ----------
    topic_dist : iterable of sequences
        Entries whose first item is a topic id and whose second item is the
        weight used for ranking (in this notebook, the output of
        ``lda_model.get_document_topics``).

    Returns
    -------
    The first item of the entry with the largest second item. On ties the
    entry that appears first wins.

    Raises
    ------
    ValueError
        If ``topic_dist`` is empty (raised by ``max``).
    """
    strongest_entry = max(topic_dist, key=lambda entry: entry[1])
    return strongest_entry[0]

# Store, for every e-mail, the topic id selected by get_dominant_topic from
# that e-mail's topic distribution.
df['dominant_topic'] = list(map(get_dominant_topic, topic_distributions))

In [9]:
# List the distinct values that ended up in the 'dominant_topic' column.
df['dominant_topic'].unique()

array([8, 6, 9, 1, 5, 4, 7, 3, 0, 2])

In [10]:
# Show the DataFrame again, now including the 'processed_content' and
# 'dominant_topic' columns added above.
df

Unnamed: 0,file,message,label,sender_email,subject,num_receivers,email_length,email_domain,sent_time,is_forwarded,processed_content,dominant_topic
0,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,1.0,phillip.allen@enron.com,,1,16,enron.com,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",0,"[thyme, date, mon, oct, pdt, subject, phillip,...",8
1,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,1.0,phillip.allen@enron.com,Re: Hello,1,3,enron.com,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",0,"[thyme, date, thu, aug, pdt, subject, hello, p...",6
2,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...,1.0,phillip.allen@enron.com,Re: Hello,1,6,enron.com,"Thu, 31 Aug 2000 04:17:00 -0700 (PDT)",0,"[thyme, date, thu, aug, pdt, subject, hello, p...",6
3,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,1.0,phillip.allen@enron.com,,2,24,enron.com,"Tue, 22 Aug 2000 07:44:00 -0700 (PDT)",0,"[thyme, date, tue, aug, pdt, subject, phillip,...",8
4,allen-p/_sent_mail/1004.,Message-ID: <17189699.1075863688308.JavaMail.e...,1.0,phillip.allen@enron.com,Re: PRC review - phone calls,1,2,enron.com,"Fri, 14 Jul 2000 06:59:00 -0700 (PDT)",0,"[thyme, date, fri, jul, pdt, subject, prc, rev...",9
...,...,...,...,...,...,...,...,...,...,...,...,...
5023,arnold-j/deleted_items/163.,Message-ID: <11958623.1075852694098.JavaMail.e...,1.0,karen.buckley@enron.com,Telephone Interviews: Trading Track,16,69,enron.com,"Wed, 10 Oct 2001 12:53:27 -0700 (PDT)",0,"[thyme, date, wed, oct, pdt, presto, forney, s...",8
5024,arnold-j/deleted_items/164.,Message-ID: <31095424.1075852694121.JavaMail.e...,1.0,a..shankman@enron.com,RE:,1,69,enron.com,"Wed, 10 Oct 2001 13:13:05 -0700 (PDT)",0,"[thyme, date, wed, oct, pdt, shankman, subject...",6
5025,arnold-j/deleted_items/165.,Message-ID: <10513011.1075852694149.JavaMail.e...,1.0,karen.buckley@enron.com,ENA Trading Track - Interviews October,33,45,enron.com,"Wed, 10 Oct 2001 12:17:23 -0700 (PDT)",0,"[thyme, date, wed, oct, pdt, k, allen, f, braw...",8
5026,arnold-j/deleted_items/166.,Message-ID: <28097519.1075852694172.JavaMail.e...,0.0,margaret.allen@enron.com,THANKS!,1,25,enron.com,"Wed, 10 Oct 2001 09:00:39 -0700 (PDT)",0,"[thyme, date, wed, oct, pdt, subject, thanks, ...",8


In [12]:
# Save the updated dataframe to a new CSV file.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (it may be absent on a fresh checkout).
# NOTE: 'processed_content' holds Python lists; CSV stores them as text, so
# they have to be parsed back into lists when this file is reloaded.
output_path = Path('./data/LDA_output/LDA_topics_modelled_emails.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)