In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
stop_words = set(stopwords.words('english'))

In [35]:
df = pd.read_csv("CEAS_08.csv")
df_original = df.copy()

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


# Preprocessing

In [38]:
def clean_text(text):
    text = re.sub(r"<[^>]+>", "", str(text))  
    text = re.sub(r"[^a-zA-Z0-9\s\-]", "", text)  # Retain hyphens
    text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE)  
    text = [word.lower() for word in word_tokenize(text) if word.lower() not in stop_words]
    return " ".join(text).strip()    


df["clean_subject"] = df["subject"].apply(clean_text)
df["clean_body"] = df["body"].apply(clean_text)

# Extract sender domain
df["sender_domain"] = df["sender"].apply(lambda x: x.split("@")[-1] if pd.notnull(x) else "")
df['sender_domain'] = df['sender_domain'].str[:-1]

df["receiver_domain"] = df["receiver"].apply(lambda x: x.split("@")[-1] if pd.notnull(x) else "")
# df['receiver_domain'] = df['receiver_domain'].str[:-1]
df['receiver_domain'] = df['receiver_domain'].apply(lambda x: x[:-1] if x.endswith(">") else x)

# Parse date (handle inconsistent formats)
df["date"] = df["date"].apply(lambda x: pd.to_datetime(x, errors="coerce",utc = True))
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek 
df['hour_normalized'] = df['hour'] / 23.0

df = df.dropna(subset=["label", "clean_subject", "clean_body","receiver","subject","date"])

In [39]:
df['label'].value_counts()

label
1    21812
0    16842
Name: count, dtype: int64

# Balancing

In [40]:
# Separate the majority and minority classes
majority_class = df[df['label'] == 1]
minority_class = df[df['label'] == 0]

# Randomly sample from the majority class to match the size of the minority class
balanced_majority_class = majority_class.sample(len(minority_class), random_state=42)

# Combine the balanced majority class with the minority class
df_balanced = pd.concat([balanced_majority_class, minority_class])

# Shuffle the resulting dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced['label'].value_counts()

label
1    16842
0    16842
Name: count, dtype: int64

In [None]:
df_balanced = df_balanced.drop(columns=["date", "sender", "receiver", "subject", "body","hour"])

KeyError: "['date', 'sender', 'receiver', 'subject', 'body'] not found in axis"

In [None]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33684 entries, 0 to 33683
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   label            33684 non-null  int64  
 1   urls             33684 non-null  int64  
 2   clean_subject    33684 non-null  object 
 3   clean_body       33684 non-null  object 
 4   sender_domain    33684 non-null  object 
 5   receiver_domain  33684 non-null  object 
 6   hour             33684 non-null  float64
 7   day_of_week      33684 non-null  float64
 8   hour_normalized  33684 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.3+ MB


In [47]:
df_balanced.to_csv("CEAS_08_cleaned_balanced.csv", index=False)

# Vectorize

In [51]:
# df_balanced
df_original['body'].iloc[33682]

'\n\n\n\n\n\n\nP xn harm izf acy\n\nVis rpb it Your Local Target P kh h qi arm lxk acy\nfor Ge hzq ner ix ic Dr vhj ug obj s, Re tx fills, & More.\n\nhttp://ingenuityseat.com\n\n\n\n\n'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'