In [None]:
import pandas as pd

# Location of the raw Enron CSV export. Kept in a single named constant so the
# path can be changed in one place without touching the parsing options below.
ENRON_CSV_PATH = 'C:/Users/User/OneDrive - Asia Pacific University/APU Final Year Project/FYP Email Project Documents/Email Dataset Python/enron.csv'


def _normalize_whitespace(column):
    """Trim text columns and collapse internal whitespace runs to one space.

    Columns that do not hold text are returned unchanged.  A column counts as
    text when its dtype is ``object`` or pandas' dedicated string dtype; the
    second check matters because a plain ``dtype == "object"`` comparison is
    False for string-dtype columns and would silently skip the clean-up.
    """
    is_text = column.dtype == "object" or isinstance(column.dtype, pd.StringDtype)
    if not is_text:
        return column
    return column.str.strip().str.replace(r'\s+', ' ', regex=True)


email_data = pd.read_csv(
    ENRON_CSV_PATH,
    delimiter=',',          # Explicitly define the delimiter
    quotechar='"',          # Handle quoted fields properly
    escapechar='\\',        # Escape special characters
    engine='python',        # Use the Python engine for better handling of irregular rows
    on_bad_lines='skip',    # Skip problematic lines instead of raising
    encoding='ISO-8859-1',  # Decode the file as Latin-1
    skip_blank_lines=True   # Skip completely blank lines
)

# Normalise whitespace in every text column; other columns pass through as-is.
email_data = email_data.apply(_normalize_whitespace)

#email_data = email_data.drop(columns=['Cc', 'Bcc'])

In [None]:
# Display the freshly loaded frame (the cell's last expression is rendered as its output).
email_data

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to
# measure the actual memory held by object columns rather than estimate it.
email_data.info(memory_usage='deep')

# .shape is a (rows, columns) tuple.
loaded_shape = email_data.shape
print("Number of Rows and Columns: ", loaded_shape)

# Data Cleaning

## Duplication Exploration

In [None]:
# Boolean flag per row: True when the row repeats an earlier row in every column.
duplicates = email_data.duplicated()
num_duplicates = duplicates.sum()

# Show the repeated rows first, then how many there are.
print(email_data.loc[duplicates])
print("Number of Duplicate:", num_duplicates)

In [None]:
# Check duplication on specific columns only.
# keep=False flags every member of a duplicate group, not just the repeats,
# so the groups can be inspected side by side.
id_and_content_columns = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Message']
in_duplicate_group = email_data.duplicated(subset=id_and_content_columns, keep=False)
check_duplicate_rows = email_data[in_duplicate_group]
check_duplicate_rows

In [None]:
# Same duplicate check, but ignoring Message-ID: rows that share Date, From,
# To, Subject and Message are flagged even when their IDs differ.
content_columns = ['Date', 'From', 'To', 'Subject', 'Message']
shares_content = email_data.duplicated(subset=content_columns, keep=False)
check_duplicate_rows = email_data[shares_content]
check_duplicate_rows

## Duplication Cleaning

In [None]:
# Remove duplicates: within each group of rows that agree on all of these
# columns, keep only the first occurrence.
dedup_columns_with_id = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Message']
email_data = email_data.drop_duplicates(subset=dedup_columns_with_id, keep='first')
email_data

In [None]:
# Remove duplicates again, this time ignoring Message-ID: rows with the same
# Date, From, To, Subject and Message collapse to their first occurrence.
dedup_columns_without_id = ['Date', 'From', 'To', 'Subject', 'Message']
email_data = email_data.drop_duplicates(subset=dedup_columns_without_id, keep='first')
email_data

## Missing Values Exploration

In [None]:
# Check missing values: number of null entries in each column.
missing_per_column = email_data.isna().sum()
missing_per_column

In [None]:
# Check all columns: rows where every one of the listed columns is null.
all_checked_columns = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Message']
every_column_null = email_data[all_checked_columns].isna().all(axis=1)
check_null_rows = email_data[every_column_null]
check_null_rows

In [None]:
# Check all columns except Message-ID: rows where Date, From, To, Subject
# and Message are all null at once.
columns_except_id = ['Date', 'From', 'To', 'Subject', 'Message']
content_all_null = email_data[columns_except_id].isna().all(axis=1)
check_null_rows = email_data[content_all_null]
check_null_rows

In [None]:
# Check From and To columns: rows where both are null.
from_is_null = email_data['From'].isna()
to_is_null = email_data['To'].isna()
check_null_rows = email_data[from_is_null & to_is_null]
check_null_rows

In [None]:
# Check From only: rows with a null sender.
sender_missing = email_data['From'].isna()
check_null_rows = email_data[sender_missing]
check_null_rows

In [None]:
# Check Subject and Message: rows where both are null.
subject_is_null = email_data['Subject'].isna()
message_is_null = email_data['Message'].isna()
check_null_rows = email_data[subject_is_null & message_is_null]
check_null_rows

In [None]:
# Check Message-ID only: rows with a null Message-ID.
message_id_missing = email_data['Message-ID'].isna()
check_null_rows = email_data[message_id_missing]
check_null_rows

## Missing Values Cleaning

In [16]:
# Column groups to test, in the order they are applied.  For each group a row
# is dropped only when *all* of the group's columns are null (how='all').
null_column_groups = [
    ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Message'],
    ['Date', 'From', 'To', 'Subject', 'Message'],
    ['From', 'To'],
    ['From'],
    ['Subject', 'Message'],
    ['Message-ID'],
]

for null_columns in null_column_groups:
    email_data = email_data.dropna(subset=null_columns, how='all')

In [None]:
# Re-inspect dtypes, non-null counts and deep memory usage at this point.
email_data.info(memory_usage='deep')

# .shape is a (rows, columns) tuple.
shape_after_dropna = email_data.shape
print("Number of Rows and Columns: ", shape_after_dropna)

In [None]:
# Display the current state of the frame (the cell's last expression is rendered as its output).
email_data

## Noise Reduction Exploration

In [None]:
# Rows whose Message-ID does not contain 'JavaMail'.  na=False makes a missing
# Message-ID count as "does not contain", so such rows are listed here too.
id_has_javamail = email_data['Message-ID'].str.contains('JavaMail', na=False)
not_contain_javamail_check = email_data[~id_has_javamail]
not_contain_javamail_check

## Noise Reduction Cleaning

In [None]:
# Keep only rows whose Message-ID contains 'JavaMail'; with na=False a missing
# Message-ID is treated as a non-match and the row is removed.
javamail_rows = email_data['Message-ID'].str.contains('JavaMail', na=False)
email_data = email_data[javamail_rows]
email_data

In [None]:
# Find rows where both the Subject and the Message consist only of digits.
# na=False treats a missing value as a non-match.
digits_only_pattern = r'^\d+$'
subject_is_digits = email_data['Subject'].str.match(digits_only_pattern, na=False)
message_is_digits = email_data['Message'].str.match(digits_only_pattern, na=False)
numerical_subjects_and_message = email_data[subject_is_digits & message_is_digits]
numerical_subjects_and_message

In [None]:
# Re-inspect dtypes, non-null counts and deep memory usage at this point.
email_data.info(memory_usage='deep')

# .shape is a (rows, columns) tuple.
shape_after_filtering = email_data.shape
print("Number of Rows and Columns: ", shape_after_filtering)

In [None]:
# Widen the text output so printed rows are less likely to wrap.
pd.set_option('display.width', 1000)

# Filter rows where 'From' does not contain '@' (a missing sender counts as
# not containing it because of na=False).
sender_has_at = email_data['From'].str.contains('@', na=False)
no_at_symbol = email_data[~sender_has_at]

print(no_at_symbol)

In [None]:
# Number of emails per distinct sender, most frequent first.
from_counts = email_data['From'].value_counts()

# Lift the row limit only while printing the counts.  Setting
# 'display.max_rows' to None globally would stay in effect for the rest of the
# kernel session, so any later (or re-run) cell that displays email_data would
# try to render every row.
with pd.option_context('display.max_rows', None):
    print(from_counts)

## Dropping Irrelevant Features

In [25]:
# Remove the Message-ID column from the frame.
irrelevant_columns = ['Message-ID']
email_data = email_data.drop(columns=irrelevant_columns)

In [None]:
# Final look at dtypes, non-null counts and deep memory usage.
email_data.info(memory_usage='deep')

# .shape is a (rows, columns) tuple.
final_shape = email_data.shape
print("Number of Rows and Columns: ", final_shape)

In [27]:
# Write the cleaned frame to a CSV file; the relative name resolves against
# the current working directory.  index=False leaves the row index out.
cleaned_enron_dataset = 'enron_cleaned.csv'
email_data.to_csv(path_or_buf=cleaned_enron_dataset, index=False)