In [1]:
import tarfile
from pathlib import Path
import urllib.request

def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [2]:
ham_dir, spam_dir = fetch_spam_data()

In [3]:
ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name) > 20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name) > 20]

In [4]:
import email
import email.policy

def load_email(filepath):
    with open(filepath, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [5]:
ham_emails = [load_email(filepath) for filepath in ham_filenames]
spam_emails = [load_email(filepath) for filepath in spam_filenames]

In [23]:
from email.message import Message
from collections import Counter
import re

spam = Counter()

for email in spam_emails:
    email_content = ""

    if email.is_multipart():
        for part in email.walk():
            if part.get_content_type() == "text/plain":
                email_content = part.get_payload(decode=True).decode("utf-8", "ignore")
                break  
    else:
        email_content = email.get_payload(decode=True).decode("utf-8", "ignore")

    words = re.findall(r'\b\w+\b', email_content.lower())
    word_counts = Counter(words)
    spam.update(dict(word_counts.most_common(10)))

for word, count in spam.most_common(20):
    print(f"{word}: {count}")

font: 7038
the: 4716
to: 4020
a: 2937
and: 2746
td: 2724
of: 2630
you: 2592
size: 1842
br: 1642
b: 1511
face: 1438
width: 1391
tr: 1362
p: 1259
http: 1230
color: 1185
com: 1094
in: 1083
i: 1028


In [11]:
ham = Counter()

for email in ham_emails:
    email_content = ""

    if email.is_multipart():
        for part in email.walk():
            if part.get_content_type() == "text/plain":
                email_content = part.get_payload(decode=True).decode("utf-8", "ignore")
                break  
    else:
        email_content = email.get_payload(decode=True).decode("utf-8", "ignore")

    words = re.findall(r'\b\w+\b', email_content.lower())
    word_counts = Counter(words)
    ham.update(dict(word_counts.most_common(10)))

In [25]:
for word, count in ham.most_common(20):
    print(f"{word}: {count}")

the: 22157
to: 12254
a: 8960
of: 8954
and: 8869
i: 5979
in: 5033
that: 4063
is: 3347
it: 3212
http: 2124
0: 2017
com: 1797
you: 1787
for: 1646
s: 1526
on: 1222
net: 1155
1: 1067
this: 1040


In [43]:
import pandas as pd

df = pd.DataFrame(columns=[word for word, count in ham.most_common(20)])

In [44]:
df = df.reindex(columns=list(df.columns) + [word for word, count in spam.most_common(20)])

In [45]:
df = df.loc[:, ~df.columns.duplicated()]

In [46]:
df = df.reindex(columns=list(df.columns) + ['spam_label'])

In [47]:
df.columns

Index(['the', 'to', 'a', 'of', 'and', 'i', 'in', 'that', 'is', 'it', 'http',
       '0', 'com', 'you', 'for', 's', 'on', 'net', '1', 'this', 'font', 'td',
       'size', 'br', 'b', 'face', 'width', 'tr', 'p', 'color', 'spam_label'],
      dtype='object')

In [52]:
for email in spam_emails:
    data = []
    email_content = ""

    if email.is_multipart():
        for part in email.walk():
            if part.get_content_type() == "text/plain":
                email_content = part.get_payload(decode=True).decode("utf-8", "ignore")
                break  
    else:
        email_content = email.get_payload(decode=True).decode("utf-8", "ignore")

    words = re.findall(r'\b\w+\b', email_content.lower())
    word_counts = {word: words.count(word) for word in df.columns}
    word_counts['spam_label'] = 1
    new_row_df = pd.DataFrame([word_counts])
    df = pd.concat([df, new_row_df], ignore_index=True)

In [55]:
for email in ham_emails:
    data = []
    email_content = ""

    if email.is_multipart():
        for part in email.walk():
            if part.get_content_type() == "text/plain":
                email_content = part.get_payload(decode=True).decode("utf-8", "ignore")
                break  
    else:
        email_content = email.get_payload(decode=True).decode("utf-8", "ignore")

    words = re.findall(r'\b\w+\b', email_content.lower())
    word_counts = {word: words.count(word) for word in df.columns}
    word_counts['spam_label'] = 0
    new_row_df = pd.DataFrame([word_counts])
    df = pd.concat([df, new_row_df], ignore_index=True)

In [65]:
from sklearn.model_selection import train_test_split

ham_df = df[df['spam_label'] == 0]
spam_df = df[df['spam_label'] == 1]

ham_df = ham_df[:500]

In [66]:
print(ham_df.shape)
print(spam_df.shape)

(500, 31)
(500, 31)


In [68]:
df_final = pd.concat([ham_df, spam_df], axis=0)
df_final = df_final.sample(frac = 1)
df_final

Unnamed: 0,the,to,a,of,and,i,in,that,is,it,...,td,size,br,b,face,width,tr,p,color,spam_label
1475,4,5,2,1,0,7,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
481,10,7,0,2,5,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
195,2,6,8,4,7,12,3,0,0,0,...,14.0,24.0,13.0,14.0,20.0,14.0,14.0,6.0,37.0,1.0
228,5,12,3,4,6,1,4,1,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
401,3,7,9,5,7,12,3,0,0,0,...,14.0,39.0,13.0,14.0,33.0,14.0,14.0,6.0,41.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,0,3,0,1,2,0,2,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11,1,1,6,0,1,0,0,0,0,0,...,4.0,5.0,9.0,0.0,1.0,1.0,4.0,0.0,5.0,1.0
446,11,4,2,0,3,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
426,36,55,45,17,32,32,14,15,16,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [70]:
X = df_final.drop('spam_label', axis=1)
y = df_final['spam_label']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
def to_int(X):
    return X.astype(int)

DECISION_TREE

In [82]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline(steps=[
    ('to_int', FunctionTransformer(to_int)),
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [83]:
pipeline.fit(X_train, y_train)

In [88]:
predictions = pipeline.predict(X_train)

In [89]:
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(y_train, predictions) 
print(f"Dokładność: {accuracy * 100:.2f}%")

Dokładność: 99.88%


In [90]:
conf_matrix = confusion_matrix(y_train, predictions)

In [91]:
conf_matrix

array([[391,   0],
       [  1, 408]])

In [92]:
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Dokładność na zbiorze testowym: {accuracy * 100:.2f}%")

conf_matrix = confusion_matrix(y_test, y_pred)
print("Macierz pomyłek:")
print(conf_matrix)

Dokładność na zbiorze testowym: 88.50%
Macierz pomyłek:
[[96 13]
 [10 81]]


DECISION_TREE WITH GRIDSEARCHCV

In [100]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__max_depth': [1, 2, 3, None],  
    'classifier__min_samples_split': [1, 2, 3, 5, 10], 
    'classifier__min_samples_leaf': [1, 2, 4],  
    'classifier__criterion': ['gini', 'entropy']  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 120 candidates, totalling 600 fits


120 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\artur\PycharmProjects\ML_Lear

{'classifier__criterion': 'entropy',
 'classifier__max_depth': None,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 3}

In [101]:
grid_search.best_score_

np.float64(0.86625)

In [102]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Dokładność na zbiorze testowym: {accuracy * 100:.2f}%")

Dokładność na zbiorze testowym: 87.00%


In [103]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Macierz pomyłek:")
print(conf_matrix)

Macierz pomyłek:
[[97 12]
 [14 77]]


RANDOM FOREST

In [104]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('to_int', FunctionTransformer(to_int)),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

In [105]:
predictions = pipeline.predict(X_train)

In [106]:
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(y_train, predictions) 
print(f"Dokładność: {accuracy * 100:.2f}%")

Dokładność: 99.88%


In [107]:
conf_matrix = confusion_matrix(y_train, predictions)

In [108]:
conf_matrix

array([[391,   0],
       [  1, 408]])

In [109]:
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Dokładność na zbiorze testowym: {accuracy * 100:.2f}%")

conf_matrix = confusion_matrix(y_test, y_pred)
print("Macierz pomyłek:")
print(conf_matrix)

Dokładność na zbiorze testowym: 92.00%
Macierz pomyłek:
[[104   5]
 [ 11  80]]


RANDOM FOREST WITH GRIDSEARCHCV

In [114]:
param_grid = {
    'classifier__n_estimators': [80, 90, 100, 110, 120], 
    'classifier__max_depth': [15, 18, 20, 22, 24], 
    'classifier__min_samples_split': [3,4 , 5, 6, 7],  
    'classifier__min_samples_leaf': [1, 2, 4], 
    'classifier__max_features': ['auto', 'sqrt', 'log2'],  
    'classifier__bootstrap': [True, False]  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 2250 candidates, totalling 11250 fits


3750 fits failed out of a total of 11250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1881 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\artur\PycharmProjects\ML_Learning\.venv\lib\site-packages\sklearn\pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\artur\PycharmProjects\ML_

{'classifier__bootstrap': False,
 'classifier__max_depth': 20,
 'classifier__max_features': 'log2',
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 5,
 'classifier__n_estimators': 120}

In [115]:
grid_search.best_score_

np.float64(0.9400000000000001)

In [116]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Dokładność na zbiorze testowym: {accuracy * 100:.2f}%")

Dokładność na zbiorze testowym: 92.00%


In [117]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Macierz pomyłek:")
print(conf_matrix)

Macierz pomyłek:
[[105   4]
 [ 12  79]]
