In [1]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'spam-mails-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F109196%2F260807%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240413%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240413T171849Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1d4e4b36878f1541ef9e230123a6c32c3d7e7c46d48f13d0e95add359a0731bd5d5e22c0c46a0b6647d9c6dd2d554bf5f5915db92a008889650f6b445cfd4f6d934f1046a968b7b2ca5fbf6f1c944af54305c0948e8789682926c66fc1d547b85ee32af5a5d29750b8db6577fa24380c144c82fd8b97990e0a55511284f123aae1ecc637fec80b217347017889138b178ff342cbb73f65a00ddf9d32ac664c1795f6b2cc3d505f1e6300eeb393498cf29b6337d63c10c8c7119eb5d35164c6da3b75ed0ea810644f03279f9696b2f1de7aff584713ed652ba56d355d57cf0a039fe79a305c3a80a7ac751be4edaf0d096d4d1ee69eb513ecf55d6c30719442b6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading spam-mails-dataset, 1954828 bytes compressed
Downloaded and uncompressed: spam-mails-dataset
Data source import complete.


In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv


In [3]:
# Download NLTK's stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Read the dataset into a Pandas DataFrame
data = pd.read_csv("/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv")
# Display the first few rows of the DataFrame
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [5]:
# Preprocess the text data by removing carriage return and newline characters
data['text'] = data['text'].apply(lambda x :x.replace("\r\n"," "))
# Display the preprocessed DataFrame
data

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs this deal is to b...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft the transport vo...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms hpl c...,0
5168,2933,ham,Subject: calpine daily gas nomination > > juli...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [6]:
# Display the text of the first row in the DataFrame
data.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [7]:
# Display information about the DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
# Initialize a Porter stemmer for stemming words
stemmer = PorterStemmer()
stemmer.stem('normally')

'normal'

In [9]:
# Perform stemming and preprocessing on the text data
corpus = []
stopwords_set = set(stopwords.words("english"))

for i in range(len(data)):
    # Convert text to lowercase
    text = data["text"].iloc[i].lower()
    # Remove punctuation and split text into words
    text = text.translate(str.maketrans("","",string.punctuation)).split()
    # Perform stemming and remove stopwords
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    # Join the stemmed words back into a single string
    text = " ".join(text)
    # Append the preprocessed text to the corpus
    corpus.append(text)

In [10]:
# Display the preprocessed text of the first row
data.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [11]:
# Display the preprocessed text of the first row in the corpus
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [12]:
# Initialize a CountVectorizer for text vectorization
vectorizer = CountVectorizer()

# Vectorize the corpus
X = vectorizer.fit_transform(corpus).toarray()
y = data.label_num

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
# Initialize a Random Forest classifier
clf = RandomForestClassifier(n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

In [14]:
# Evaluate the classifier on the testing set
clf.score(X_test, y_test)

0.9806763285024155

In [15]:
# Select the text of the first email to classify
email_to_classify = data.text.values[0]
email_to_classify

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [16]:
# Preprocess the text of the email to classify
email_text = email_to_classify.lower().translate(str.maketrans("","", string.punctuation)).split()
email_text = [stemmer.stem(word) for word in text if word not in stopwords_set]
email_text = " ".join(email_text)

email_corpus = [email_text]

# Vectorize the preprocessed email text
X_email = vectorizer.transform(email_corpus)

In [17]:
# Predict the label of the email
clf.predict(X_email)

array([1])

In [18]:
# Display the label of the first email
data.label_num.iloc[0]

0