In [None]:
import pandas as pd

from google.colab import drive
# Make Google Drive visible to the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Location of the input CSV on the mounted Drive.
dataset_csv_path = '/content/drive/MyDrive/FYPDataset/further_cleaned_enron_dataset.csv'
email_data = pd.read_csv(dataset_csv_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim import corpora
from rake_nltk import Rake
from yake import KeywordExtractor
from keybert import KeyBERT

In [None]:
import nltk
# Fetch NLTK's stopword corpus (reports "already up-to-date" when it is cached).
nltk.download('stopwords')

# Keyword extractors, each constructed with its library defaults.
# NOTE(review): none of these objects is used in the cells visible here;
# presumably they are consumed further down the notebook. Confirm before removing.
r = Rake()  # RAKE extractor (rake_nltk)
kw_extractor = KeywordExtractor()  # YAKE extractor
kw_model = KeyBERT()  # KeyBERT extractor

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def _whitespace_tokens(text):
    """Split a message on whitespace; non-strings and empty strings yield []."""
    if isinstance(text, str) and text:
        return text.split()
    return []


# One token list per email, in the same row order as email_data.
corpus = [_whitespace_tokens(message) for message in email_data['Cleaned_Message']]

# Token <-> integer id mapping, then each document as a bag of (token_id, count) pairs.
dictionary = corpora.Dictionary(corpus)
doc_term_matrix = list(map(dictionary.doc2bow, corpus))

In [None]:
num_topics = 10  # Experiment with this number
lda_random_state = 42  # Fixed seed: without it LDA yields different topics on every run

# Train LDA on the bag-of-words corpus; id2word lets the model report words instead of ids.
lda_model = gensim.models.LdaModel(
    doc_term_matrix,
    num_topics=num_topics,
    id2word=dictionary,
    random_state=lda_random_state,
)

# Show the ten highest-weighted words of each topic for a quick sanity check.
for topic_id in range(num_topics):
    top_words = lda_model.show_topic(topic_id, topn=10)
    print(f"Topic {topic_id}: {', '.join([word for word, prob in top_words])}")

# Per-document list of (topic_id, probability) pairs.
# The probabilities come back as NumPy float32 scalars; cast to built-in types so the
# column written to CSV contains plain literals (NumPy >= 2 would otherwise render each
# value as "np.float32(...)", which ast.literal_eval cannot read back).
topic_distributions = [
    [(int(doc_topic_id), float(doc_topic_prob))
     for doc_topic_id, doc_topic_prob in lda_model.get_document_topics(bow)]
    for bow in doc_term_matrix
]
email_data['topic_distribution'] = topic_distributions

def convert_topic_dist(topic_dist, n_topics=None):
    """Expand a sparse topic distribution into a dense, fixed-length vector.

    Args:
        topic_dist: iterable of ``(topic_id, probability)`` pairs; topics that
            are absent from it keep a probability of 0.0.
        n_topics: length of the returned vector. Defaults to the module-level
            ``num_topics`` so existing ``.apply(convert_topic_dist)`` calls
            behave as before.

    Returns:
        list[float]: ``n_topics`` probabilities indexed by topic id.

    Raises:
        IndexError: if a topic id is outside ``range(n_topics)``.
    """
    if n_topics is None:
        n_topics = num_topics
    topic_vec = [0.0] * n_topics
    for topic_id, prob in topic_dist:
        # float() turns NumPy scalars into built-in floats so the vector is
        # homogeneous and serialises as plain numbers.
        topic_vec[topic_id] = float(prob)
    return topic_vec

# Dense per-email topic vector (one probability per topic).
email_data['topic_vector'] = email_data['topic_distribution'].apply(convert_topic_dist)

# One column per topic. concat(axis=1) aligns rows on index labels, so build the matrix
# on email_data's own index instead of a fresh 0..n-1 range.
topic_matrix = pd.DataFrame(email_data['topic_vector'].tolist(), index=email_data.index)

# Drop topic columns left behind by a previous run of this cell so that re-running it
# replaces them instead of appending duplicates.
email_data = pd.concat(
    [email_data.drop(columns=list(topic_matrix.columns), errors='ignore'), topic_matrix],
    axis=1,
)

Topic 0: image, day, go, know, time, week, want, like, think, good
Topic 1: url, travel, tx, new, fare, click, hotel, houston, city, n
Topic 2: final, market, ferc, file, customer, issue, transmission, commission, request, order
Topic 3: gas, capacity, day, d, pipeline, volume, contract, point, delivery, storage
Topic 4: email, subject, pm, cc, forward, enron, hou, ect, j, john
Topic 5: schedule, date, pm, hour, time, meeting, start, am, description, calendar
Topic 6: e, mail, email, receive, message, url, information, click, send, database
Topic 7: deal, enron, agreement, change, attach, trade, fax, transaction, credit, need
Topic 8: energy, company, power, market, price, say, california, year, news, state
Topic 9: enron, time, need, work, know, group, meeting, week, business, like


In [None]:
# 5. Feature Engineering (Beyond Topics)
# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string  # Import the string module
import textstat

analyzer = SentimentIntensityAnalyzer()


def _compound_sentiment(text):
    """Return VADER's 'compound' score for a string; non-strings score 0."""
    if not isinstance(text, str):
        return 0
    return analyzer.polarity_scores(text)['compound']


# Sentiment of the message body and of the subject line.
email_data['message_sentiment'] = email_data['Cleaned_Message'].apply(_compound_sentiment)
email_data['subject_sentiment'] = email_data['Clean_Subject'].apply(_compound_sentiment)

# Character counts of body and subject.
message_text = email_data['Cleaned_Message']
subject_text = email_data['Clean_Subject']
email_data['message_length'] = message_text.str.len()
email_data['subject_length'] = subject_text.str.len()

# Punctuation Count
def count_punctuation(text):
    """Count the characters of ``text`` that occur in ``string.punctuation``.

    Anything that is not a ``str`` counts as 0.
    """
    if not isinstance(text, str):
        return 0
    return sum(1 for symbol in text if symbol in string.punctuation)

# Punctuation counts for body and subject.
message_text = email_data['Cleaned_Message']
subject_text = email_data['Clean_Subject']
email_data['message_punctuation_count'] = message_text.apply(count_punctuation)
email_data['subject_punctuation_count'] = subject_text.apply(count_punctuation)


def _grade_level(text):
    """Return textstat's Flesch-Kincaid grade for a string; non-strings score 0."""
    if not isinstance(text, str):
        return 0
    return textstat.flesch_kincaid_grade(text)


# Readability of body and subject.
email_data['message_readability'] = message_text.apply(_grade_level)
email_data['subject_readability'] = subject_text.apply(_grade_level)

# Sender Domain
def get_domain(email):
    """Return the part of ``email`` that follows its first '@'.

    Args:
        email: the sender address; any non-string value (e.g. NaN) is accepted.

    Returns:
        The text between the first and second '@' (or to the end of the string
        when there is only one). ``None`` is returned when ``email`` is not a
        string, has no '@', or has nothing after the '@'.
    """
    if not isinstance(email, str):
        return None
    parts = email.split('@')
    # An explicit length check replaces the former bare ``except:``, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if len(parts) < 2:
        return None
    # An empty string is not a usable domain; report it as missing.
    return parts[1] or None

email_data['sender_domain'] = email_data['From'].apply(get_domain)

# Time Features (Handle potential errors)
try:
    # Parse the column once and reuse it for both features. errors='coerce' turns
    # individual unparseable entries into NaT, so one bad timestamp no longer blanks
    # both features for every row.
    parsed_timestamps = pd.to_datetime(email_data['Date & Time'], errors='coerce')
    hour_of_day = parsed_timestamps.dt.hour
    day_of_week = parsed_timestamps.dt.day_name()
except (TypeError, ValueError) as e:
    print(f"Error processing 'Date & Time': {e}")
    hour_of_day = None
    day_of_week = None
else:
    # Make partial failures visible instead of silently leaving gaps.
    unparsed_count = int(parsed_timestamps.isna().sum())
    if unparsed_count:
        print(f"'Date & Time': {unparsed_count} value(s) missing or unparseable; their time features are left empty")

# Assign both columns together so they are either both derived or both empty.
email_data['hour_of_day'] = hour_of_day
email_data['day_of_week'] = day_of_week

In [None]:
# Write the enriched frame back to Drive, without the index column.
output_csv_path = '/content/drive/MyDrive/FYPDataset/email_data_with_topics_and_features.csv'
email_data.to_csv(output_csv_path, index=False)