<h2>Experimentation on RF model training for integration with Flask Application</h2>

In [7]:
import pandas as pd

In [8]:
# Relative paths of the four raw e-mail corpora (CSV files).
# The same four names are rebound to DataFrames by the read_csv cell below.
ceas, nazario, fraud, spam = (
    'data/CEAS_08.csv',
    'data/Nazario.csv',
    'data/Nigerian_Fraud.csv',
    'data/SpamAssasin.csv',
)


In [9]:
# Read each raw corpus into its own DataFrame. These assignments rebind the
# names that held the path strings in the previous cell, so from here on
# ceas / nazario / fraud / spam are DataFrames.
ceas = pd.read_csv('data/CEAS_08.csv')
nazario = pd.read_csv('data/Nazario.csv')
fraud = pd.read_csv('data/Nigerian_Fraud.csv')
spam = pd.read_csv('data/SpamAssasin.csv')



In [10]:
# Preview the first five rows of every source frame, one after another.
print(
    ceas.head(),
    nazario.head(),
    fraud.head(),
    spam.head(),
    sep='\n',
)

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

In [11]:
#standardise columns in all datasets
# Canonical schema (names and order) shared by every corpus after this cell.
STANDARD_COLUMNS = ['Sender', 'Receiver', 'Date', 'Subject', 'Body', 'Urls', 'Label']


def standardise_columns(frame):
    """Return ``frame`` with the canonical column names, in canonical order.

    Columns are matched by NAME (whitespace-stripped, case-insensitive) instead
    of by position. Assigning a fixed list to ``.columns`` relabels purely by
    position, so a file that stores its columns in a different order (for
    example ``label`` before ``urls``) would have those columns silently
    swapped. NOTE(review): verify the header order of each CSV.

    Args:
        frame: One of the raw corpus DataFrames.

    Returns:
        A new DataFrame whose columns are exactly ``STANDARD_COLUMNS``.

    Raises:
        ValueError: If the frame's columns are not exactly the seven expected
            ones (the same exception type a positional assignment raises on a
            length mismatch).
    """
    renamed = frame.rename(columns=lambda name: str(name).strip().capitalize())
    if sorted(renamed.columns) != sorted(STANDARD_COLUMNS):
        raise ValueError(
            f"Unexpected columns {list(frame.columns)}; expected {STANDARD_COLUMNS}"
        )
    # Selecting by the canonical list also puts the columns in canonical order.
    return renamed[STANDARD_COLUMNS]


# Re-running this cell is safe: already-standardised names map onto themselves.
ceas = standardise_columns(ceas)
nazario = standardise_columns(nazario)
fraud = standardise_columns(fraud)
spam = standardise_columns(spam)

In [12]:
#combine together the datasets for a larger sample
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# source keeps its own 0-based labels, so the same index label would occur
# once per source and label-based lookups would be ambiguous.
combined_dataset = pd.concat([ceas, nazario, fraud, spam], ignore_index=True)

In [13]:
# Report the (rows, columns) shape of the concatenated frame.
print(f"Combine dataset shape: {combined_dataset.shape}")

Combine dataset shape: (49860, 7)


In [14]:
# Preview the first 20 rows of the combined frame.
print(combined_dataset.head(20))

                                               Sender  \
0                    Young Esposito <Young@iworld.de>   
1                        Mok <ipline's1983@icable.ph>   
2   Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                  Michael Parker <ivqrnai@pobox.com>   
4   Gretchen Suggs <externalsep1@loanofficertool.com>   
5   Caroline Aragon <dwthaidomainnamesm@thaidomain...   
6     Replica Watches <jhorton@thebakercompanies.com>   
7              Daily Top 10 <acidirev_1972@tcwpg.com>   
8                   qydlqcws-iacfym@issues.apache.org   
9       Daily Top 10 <orn|dent_1973@musicaedischi.it>   
10    ambrosius edwin <370jcmiller@flychautauqua.com>   
11         Alejandra Levy <rehearsings46@gametea.com>   
12      Daily Top 10 <Atchuthan-erbatest@weijgers.nl>   
13  Daily Top 10 <Scooter-obailat@picklesmaternity...   
14              Alphonso Roach <exited@realskate.com>   
15                         Racing <uqyrmo@sailing.ie>   
16  Daily Top 10 <Joep-ntorions

In [15]:
# Clean the combined frame: remove exact duplicate rows, then discard every
# row whose Subject or Body is missing.
combined_dataset = (
    combined_dataset
    .drop_duplicates()
    .dropna(subset=['Subject', 'Body'])
)

# Sanity check: shape and first rows after cleaning.
print(f"Combine dataset shape: {combined_dataset.shape}")

print(combined_dataset.head())

Combine dataset shape: (49772, 7)
                                              Sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         Receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              Date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             Subject  \
0                        

<h2>Normalise labels</h2>

In [16]:
# Count missing values in the Label column before the labels are touched.
print("NaNs in Label before any transformation:", combined_dataset['Label'].isna().sum())

NaNs in Label before any transformation: 0


In [17]:
# Inspect which distinct values the Label column contains.
print("Original labels", combined_dataset['Label'].unique())


Original labels [1 0]


In [18]:
# Show how many rows carry each Label value (class balance).
print(combined_dataset['Label'].value_counts())

Label
1    36073
0    13699
Name: count, dtype: int64


In [19]:
# Persist the cleaned, combined corpus. The path is held in one variable so
# the confirmation message always names the file that was actually written
# (the old message said 'Combined_dataset.csv', which is not the real path).
combined_csv_path = 'data/combined_dataset.csv'
combined_dataset.to_csv(combined_csv_path, index=False)  # index=False: do not write the row index

print(f"Combined dataset saved as '{combined_csv_path}'")

Combined dataset saved as 'Combined_dataset.csv'


<h2>Feature Engineering</h2>

In [20]:
#standard
import pandas as pd
import numpy as np
import re 
import string

In [21]:
#text processing 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Confirm the frame's (rows, columns) shape before feature extraction.
print(f"Dataset Shape: {combined_dataset.shape}")

Dataset Shape: (49772, 7)


<h3>Lowercase the text, remove special characters, numbers and unusual punctuation, and remove stopwords.</h3>

In [28]:
from textstat import flesch_reading_ease, sentence_count
from collections import Counter

In [31]:
#func to extract stylistic features
def extract_text_features(text):
    """Compute seven stylistic features for a single piece of text.

    Args:
        text: The text to analyse (an e-mail body). A missing value
            (``None``/NaN) yields an all-zero vector; any other non-string
            value is converted with ``str()`` before analysis.

    Returns:
        pd.Series with seven values, in this order:
        average sentence length (words per sentence), average word length
        (characters per word), punctuation count, exclamation-mark count,
        question-mark count, uppercase ratio (uppercase characters divided by
        all characters) and the Flesch reading-ease score.
    """
    if pd.isnull(text):
        return pd.Series([0, 0, 0, 0, 0, 0, 0]) # since is 7D feature vector.

    # Columns read from CSV can hold non-string scalars (e.g. a purely numeric
    # body); coerce so the string methods below do not raise.
    if not isinstance(text, str):
        text = str(text)

    num_sentences = max(sentence_count(text), 1) #avoid div by 0
    words = text.split() # split str to list
    num_words = len(words)
    num_chars = len(text)

    #punctuation/capitalisation
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    exclamation_count = text.count("!")
    question_count = text.count("?")
    uppercase_ratio = sum(1 for char in text if char.isupper()) / max(num_chars, 1)

    #readability/sentence complexity
    readability_score = flesch_reading_ease(text)
    # Average over the characters that belong to words only. Dividing the full
    # text length by the word count would also count every space and newline,
    # inflating the value.
    avg_word_length = sum(len(word) for word in words) / max(num_words, 1)
    avg_sentence_length = num_words / num_sentences

    return pd.Series([avg_sentence_length, avg_word_length, punctuation_count,
                      exclamation_count, question_count, uppercase_ratio, readability_score])
             


In [32]:
#apply to dataset
# Names of the new feature columns. Their order mirrors the order of the values
# in the Series returned by extract_text_features, which is how each value ends
# up in the right column. ('uppercase_ration' is the spelling actually used as
# the column name.)
feature_names = ['avg_sentence_length', 'avg_word_length', 'punctuation_count',
                 'exclamation_count', 'question_count', 'uppercase_ration', 'readability_score']

# Run the extractor on every Body value and store the seven results as new columns.
combined_dataset[feature_names] = combined_dataset['Body'].apply(extract_text_features)
# Bare last expression: displays the first rows, now including the feature columns.
combined_dataset.head()


AttributeError: module 'pandas' has no attribute 'insull'