In [6]:
# Mapping from the raw CSV column names to the column names used in this notebook.
data_columns = {
    'label': 'IsSpam',
    'text': 'EmailContent',
}

In [10]:
#Polars vs Pandas benchmarking
#https://www.pola.rs/benchmarks.html
import polars as pl

# Read the two pre-split CSV files with Polars, in file order.
pl_top_data, pl_bottom_data = (
    pl.read_csv(csv_path)
    for csv_path in (
        './Data/0-enron-emails-labeled.csv',
        './Data/1-enron-emails-labeled.csv',
    )
)

# Concatenate both parts, then discard the 'index' column.
merged = pl.concat([pl_top_data, pl_bottom_data]).drop('index')

# Fail fast if the merged row count is not the expected total.
expected_rows = 83448
assert merged.height == expected_rows, f'Expected merged data to contain {expected_rows} rows.'

# Rename the columns using the `data_columns` mapping.
labeled_emails = merged.rename(data_columns)

labeled_emails

IsSpam,EmailContent
i64,str
1,"""hello , how ar…"
0,"""maybe you're l…"
0,""" begin pgp sig…"
0,""" why filename …"
1,"""lynnette just …"
1,"""companion shor…"
0,"""you have recei…"
1,"""hello , we se…"
1,"""dear valued me…"
1,"""from the offic…"


In [1]:
import pandas as pd

# Data was pre-split due to GitHub file limitation. Index was added for joining the data.
# Removed for use in project.
data_top = pd.read_csv('./Data/0-enron-emails-labeled.csv').drop(columns='index')
data_bottom = pd.read_csv('./Data/1-enron-emails-labeled.csv').drop(columns='index')



In [2]:
# Display the frame loaded from the first CSV file for a quick visual check.
data_top

Unnamed: 0,label,text
0,1,"hello , how are you doing ?\r\nbetter than all..."
1,0,maybe you're looking for something like this x...
2,0,begin pgp signed message hash shaescapenumber...
3,0,why filename xmp it should be basename xmp no...
4,1,lynnette just told me about what they have bee...
...,...,...
41719,0,hi given a date how do i get the last date of ...
41720,1,now you can order software on cd or download i...
41721,1,dear valued member canadianpharmacy provides a...
41722,0,subscribe change profile contact us long term ...


In [3]:
# Display the frame loaded from the second CSV file for a quick visual check.
data_bottom

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
41719,1,srea jumps escapenumber after news releases sc...
41720,1,while we may have high expectations of our ass...
41721,1,"hello , welcome to pharmonl nuncupate ine s pa..."
41722,0,linux cifs client bounces shirishp us ibm com ...


In [4]:
# Stack the two parts into a single frame. ignore_index=True gives the result
# a fresh 0..n-1 index, so the parts need no separate reset_index beforehand
# (and the frames owned by the loading cell are left untouched).
enron_original = pd.concat([data_top, data_bottom], ignore_index=True)

# Summary statistics of the numeric column(s), taken before the rename below.
print(enron_original.describe())

enron_original_size = len(enron_original)
print(f'Total Entries: {enron_original_size}')

# Guard against a partial or duplicated load.
expected_data_count = 83448
assert expected_data_count == enron_original_size, f'Original data should total: {expected_data_count}'

column_names = {
    'label': 'IsSpam',
    'text': 'EmailContent'
}

# rename() returns a new frame; assigning the result avoids inplace mutation.
enron_original = enron_original.rename(columns=column_names)

# Number of missing labels after the rename.
print(enron_original['IsSpam'].isna().sum())

enron_original

              label
count  83448.000000
mean       0.526196
std        0.499316
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Total Entries: 83448
0


Unnamed: 0,IsSpam,EmailContent
0,1,"hello , how are you doing ?\r\nbetter than all..."
1,0,maybe you're looking for something like this x...
2,0,begin pgp signed message hash shaescapenumber...
3,0,why filename xmp it should be basename xmp no...
4,1,lynnette just told me about what they have bee...
...,...,...
83443,1,srea jumps escapenumber after news releases sc...
83444,1,while we may have high expectations of our ass...
83445,1,"hello , welcome to pharmonl nuncupate ine s pa..."
83446,0,linux cifs client bounces shirishp us ibm com ...


In [5]:
def calculate_sparsity(labelled_emails: pd.DataFrame, debug: bool=False) -> float:
    """
    Measure how much of the labelled email data is missing.

    Sparsity is the number of missing (NaN/None) cells in the ``IsSpam`` and
    ``EmailContent`` columns divided by the total number of cells in those two
    columns. Any other columns present in the frame are ignored, so they
    cannot dilute the result.

    :param labelled_emails: Data with emails labelled as (1)-SPAM or (0)-HAM
    :type labelled_emails: pd.DataFrame
    :param debug: When True, print the intermediate counts.
    :type debug: bool

    :return: Fraction of missing cells in the range 0.0-1.0 (not scaled to 100).
        A frame with no rows yields 0.0.
    :raises KeyError: If ``IsSpam`` or ``EmailContent`` is not a column.
    """
    #Check IsSpam for missing values
    total_missing_is_spam = int(labelled_emails['IsSpam'].isna().sum())

    #Check EmailContent for missing values
    total_missing_email_content = int(labelled_emails['EmailContent'].isna().sum())

    # Only the two inspected columns contribute cells to the denominator, so
    # numerator and denominator describe the same set of cells.
    total_elements = 2 * len(labelled_emails)

    # Total number of missing cells across both inspected columns
    zero_elements = total_missing_is_spam + total_missing_email_content

    # An empty frame has nothing missing; avoid a 0 / 0 division.
    sparsity_percentage = zero_elements / total_elements if total_elements else 0.0

    if debug:
        debug_info = f'''
        Total Element: {total_elements}
        Total Missing IsSpam: {total_missing_is_spam}
        Total Missing EmailContent: {total_missing_email_content}
        Sparsity Percentage: {zero_elements} / {total_elements}
        '''
        print(debug_info)

    return sparsity_percentage

# Compute the sparsity of the merged data, printing the intermediate counts.
sparsity = calculate_sparsity(enron_original, debug=True)
print(f'Sparsity Percentage: {sparsity}')


        Total Element: 166896
        Total Missing IsSpam: 0
        Total Missing EmailContent: 0
        Sparsity Percentage: 0 / 166896
        
Sparsity Percentage: 0.0
