# Import Libraries

In [1]:

import pandas as pd
import numpy as np
import re
from nltk.stem import PorterStemmer

# Mount Google Drive and Load the Dataset

In [2]:

from google.colab import drive
import pandas as pd

# Expose the user's Google Drive inside the Colab filesystem.
drive.mount('/content/drive')

# Location of the input CSV in Drive (edit this to match your own Drive layout).
file_path = "/content/drive/MyDrive/augmented_emails_cleaned.csv"

# Read the CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv(file_path)

# Confirm the load and report the frame's dimensions.
print(" File loaded successfully!")
print("Shape:", df.shape)

# Preview the first rows as the cell's output.
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 File loaded successfully!
Shape: (83548, 2)


Unnamed: 0,cleaned_text,label
0,ounc feather bowl hummingbird opec moment alab...,
1,wulvob medirc onlin qnb ikud viagra escapenumb...,
2,univers degre obtain prosper futur money earn ...,
3,work gateway world art explor tattoo flow walk...,
4,thi account onli veri cautiou escapenumb men w...,


# Dataset Info

In [3]:

# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83548 entries, 0 to 83547
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cleaned_text  83493 non-null  object
 1   label         100 non-null    object
dtypes: object(2)
memory usage: 1.3+ MB
None


# Shape (rows & cols)

In [4]:

# Unpack the (rows, columns) tuple once and report each dimension.
row_count, column_count = df.shape
print("Number of rows:", row_count)
print("Number of columns:", column_count)

Number of rows: 83548
Number of columns: 2


# Describe dataset

In [5]:
# Summary statistics across every column (object columns included), rounded to
# two decimals and printed as plain text.
summary = df.describe(include='all')
print(summary.round(2))

                                             cleaned_text label
count                                               83493   100
unique                                              81449     2
top     start date hourahead hour ancillari schedul aw...  spam
freq                                                  449    50


# Value Counts for Each Column

In [6]:
print("\n=== VALUE COUNTS FOR EACH COLUMN ===")
# Walk the frame column by column and print a short profile of each one:
# dtype, distinct-value count, missing-value count, and the ten most frequent values.
for column, values in df.items():
    most_common = values.value_counts().head(10)
    print(f"\n--- {column} ---")
    print("Data type:", values.dtype)
    print("Unique values:", values.nunique())
    print("Missing values:", values.isna().sum())
    print("Top value counts:")
    print(most_common)


=== VALUE COUNTS FOR EACH COLUMN ===

--- cleaned_text ---
Data type: object
Unique values: 81449
Missing values: 55
Top value counts:
cleaned_text
start date hourahead hour ancillari schedul award varianc detect log messag pars file portland westdesk california schedul iso final schedul txt                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

# Missing Values

In [7]:

# Report how many values are missing in each column.
print(df.isnull().sum())

# Discard rows that have no email text. The result is reassigned instead of
# using inplace=True, which pandas discourages: it offers no performance
# benefit and prevents method chaining.
df = df.dropna(subset=["cleaned_text"])
print("After dropping null cleaned_text:", df.shape)

cleaned_text       55
label           83448
dtype: int64
After dropping null cleaned_text: (83493, 2)


# Duplicate Values

In [8]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
dup = df.duplicated().sum()
print(f"\nNumber of duplicated rows: {dup}")

# Keep the first occurrence of each row. Reassigning the result avoids
# inplace=True, which pandas discourages and which prevents method chaining.
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)


Number of duplicated rows: 2044
After removing duplicates: (81449, 2)


# Unique Value Counts

In [9]:

# DataFrame.nunique() gives the distinct non-null count for every column in one
# call; iterate over the (column, count) pairs to print them.
for col, unique_count in df.nunique().items():
    print("No. of unique values in", col, "is", unique_count)

No. of unique values in cleaned_text is 81449
No. of unique values in label is 2


# Text Cleaning Function

In [10]:
import re
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Stemmer instance shared by every call to the cleaning function.
ps = PorterStemmer()

# Fetch the NLTK stopword corpus (required the first time only).
nltk.download('stopwords')

# English stopwords held in a set for fast membership checks.
stop_words = set(stopwords.words("english"))

def simple_clean(text):
    """Normalize one text value into a space-separated string of stemmed tokens.

    Steps: lowercase, strip URLs, replace every non-letter character with a
    space, split on whitespace, drop English stopwords, Porter-stem the rest.

    Relies on the module-level ``ps`` (PorterStemmer) and ``stop_words`` (set).

    Args:
        text: Value from the text column; any non-missing value is converted
            with ``str()`` before processing.

    Returns:
        The cleaned text, or "" when ``text`` is missing (NaN/None).
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Drop links first so their fragments are not turned into tokens.
    without_links = re.sub(r"http\S+|www\S+", "", lowered)

    # Replace (rather than delete) digits and punctuation with a space so the
    # words on either side stay separate tokens: "free,offer" -> "free offer",
    # not "freeoffer"; "don't" -> "don t", both of which the stopword list drops.
    letters_only = re.sub(r"[^a-z\s]", " ", without_links)

    # str.split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate collapse/strip step is needed.
    return " ".join(
        ps.stem(token) for token in letters_only.split() if token not in stop_words
    )

# Run the cleaner over every row and write the result back into the same column.
# NOTE(review): the loaded column already looks stemmed (the raw preview shows
# "univers degre", which becomes "univ degr" after this step), so this pass
# stems the text a second time - confirm that re-stemming is intended.
cleaned_column = df["cleaned_text"].apply(simple_clean)
df["cleaned_text"] = cleaned_column

# Show the first rows after cleaning.
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,cleaned_text,label
0,ounc feather bowl hummingbird opec moment alab...,
1,wulvob medirc onlin qnb ikud viagra escapenumb...,
2,univ degr obtain prosper futur money earn powe...,
3,work gateway world art explor tattoo flow walk...,
4,thi account onli veri cautiou escapenumb men w...,


# Save Processed Dataset

In [11]:

# Write the cleaned frame to the Colab working directory, omitting the index column.
output_path = "/content/preprocessed_dataset.csv"
df.to_csv(output_path, index=False)
print("\n Preprocessed dataset saved as 'preprocessed_dataset.csv'")

# Show the first rows of what was saved.
df.head()


 Preprocessed dataset saved as 'preprocessed_dataset.csv'


Unnamed: 0,cleaned_text,label
0,ounc feather bowl hummingbird opec moment alab...,
1,wulvob medirc onlin qnb ikud viagra escapenumb...,
2,univ degr obtain prosper futur money earn powe...,
3,work gateway world art explor tattoo flow walk...,
4,thi account onli veri cautiou escapenumb men w...,
