In [122]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [93]:
class EmailPreprocess:
    def __init__(self, filename):
        self.filename = filename
        self.data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = None
        self.feature_selector = None
        self.feature_selection_methods = ['KBest', 'Percentile']
        
    def load_data(self):
        self.data = pd.read_csv(self.filename)

    def check_missing_values(self):
        cols = self.data.columns
        cols_with_missing_values = [col for col in cols if self.data[col].isna().sum()]
        if cols_with_missing_values:
            for col in cols_with_missing_values:
                self.data[col] = self.data[col].fillna(0)   
        
    def split_dataset(self):
        self.X = self.data.iloc[:, 1:-1] # Извлечь признаки
        self.y = self.data.iloc[:, -1] # Извлечь метки

    def feature_scaling(self, method='MinMax'):
        if (method == 'Standard'):
            self.scaler = StandardScaler().fit(self.X)
        elif (method == 'MinMax'):
            self.scaler = MinMaxScaler().fit(self.X)
        else:
            return NotImplemented
        self.X = pd.DataFrame(self.scaler.transform(self.X), columns=self.X.columns) # 缩放功能

    def train_test_split(self, test_size=0.2, random_state=42):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)

    def feature_selection(self, method='KBest', k=10):
        if (method == 'Percentile'):
            percentile = k/self.X.shape[1]*100
            self.feature_selector = SelectPercentile(f_classif, percentile=percentile).fit(self.X, self.y)
        elif (method == 'KBest'):
            self.feature_selector = SelectKBest(f_classif, k=k).fit(self.X, self.y)
        else:
            return NotImplemented
        self.X_best_feats = pd.DataFrame(self.feature_selector.transform(self.X), columns=self.feature_selector.get_feature_names_out()) # 最佳功能

    def text_cleaning(self):
        STOPWORDS = stopwords.words('english')
        for word in STOPWORDS:
            if word in self.X.columns:
                self.X = self.X.drop(word, axis=1)

    def preprocess(self):
        self.load_data()
        self.check_missing_values()
        self.split_dataset()
        self.text_cleaning()
        self.feature_scaling()
        self.train_test_split()



In [155]:
class EmailClassifier(EmailPreprocess):
    def __init__(self, filename):
        super().__init__(filename)
        self.preprocess()
        self.best_fitted_model = None
        self.best_roc_auc_score = None  # 我们将根据roc auc选择最佳型号

    def find_best_model_and_params(self):
        """
        Метод для поиска лучшей модели и ее параметров.
        Записывает лучшую модель в self.best_fitted_model.
        """
        models = [KNeighborsClassifier(), LogisticRegression(), MultinomialNB(), RandomForestClassifier()]  
        self.best_roc_auc_score = 0

        for model in models:
            scores = cross_validate(model, self.X, self.y, cv=3, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc'))
            roc_auc_score = max(scores['test_roc_auc'])
            if roc_auc_score > self.best_roc_auc_score:
                self.best_roc_auc_score = roc_auc_score
                self.best_fitted_model = model

        self.best_fitted_model.fit(self.X_train, self.y_train)

    def evaluate_test_metrics(self):
        """
        Метод для оценки метрик лучшей модели на тестовом наборе данных.
        Возвращает словарь с метриками.
        """
        if self.best_fitted_model is None:
            raise ValueError("Лучшая модель не найдена. Сначала запустите find_best_model_and_params()")
        
        y_pred = self.best_fitted_model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
        }
    
    def cluster_emails(self, method):
        """
        Метод для кластеризации электронных писем на два кластера.
        method: 'kmeans', 'hierarchical' или 'dbscan'.
        Возвращает два значения: процент данных, соответствующих спаму в каждом
        кластере.
        """
        pass



In [156]:
clf = EmailClassifier('emails.csv')
clf.find_best_model_and_params()


In [157]:
print(clf.evaluate_test_metrics())

{'accuracy': 0.9768115942028985, 'f1': 0.9606557377049181, 'precision': 0.9331210191082803, 'recall': 0.9898648648648649}


In [127]:
filename = 'emails.csv'
data = pd.read_csv(filename)
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [128]:
X = data.iloc[:, 1:-1] # Извлечь признаки
y = data.iloc[:, -1] # Извлечь метки

In [129]:
cols_with_missing_values = [col for col in data.columns if data[col].isna().sum()]
cols_with_missing_values

[]

In [130]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [131]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [132]:
X

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,0,1,0


In [133]:
X_scaled

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001054,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.038095,0.098485,0.067055,0.067416,0.127660,0.025974,0.053741,0.014286,0.161677,0.080717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004215,0.000000,0.000000,0.017937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.000000,0.037879,0.061224,0.000000,0.106383,0.012987,0.026870,0.028571,0.059880,0.004484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.033333,0.045455,0.046647,0.011236,0.106383,0.025974,0.030032,0.000000,0.053892,0.013453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,0.009524,0.015152,0.002915,0.033708,0.000000,0.000000,0.016860,0.000000,0.000000,0.022422,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5168,0.166667,0.204545,0.029155,0.022472,0.127660,0.064935,0.079557,0.057143,0.017964,0.103139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
5169,0.000000,0.000000,0.000000,0.011236,0.000000,0.000000,0.005796,0.000000,0.000000,0.004484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5170,0.009524,0.053030,0.000000,0.000000,0.042553,0.012987,0.014752,0.028571,0.000000,0.035874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0


In [134]:
tr = SelectKBest(f_classif, k=10).fit(X, y)
X_new = pd.DataFrame(tr.transform(X), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0,0,0,0,0,0,0,0,0,0
1,0,21,3,3,1,0,1,3,1,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,2,2,0,0,1,3,1,1
4,0,0,2,1,0,0,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...
5167,0,1,0,1,1,0,1,1,1,1
5168,0,2,0,1,0,0,0,1,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,2,0,0,0,0,0,0,0,0


In [135]:
tr = SelectKBest(f_classif, k=10).fit(X_scaled, y)
X_new = pd.DataFrame(tr.transform(X_scaled), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
1,0.0,0.567568,0.272727,0.200000,0.142857,0.0,0.125,0.157895,0.090909,0.111111
2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.181818,0.133333,0.000000,0.0,0.125,0.157895,0.090909,0.111111
4,0.0,0.000000,0.181818,0.066667,0.000000,0.0,0.125,0.105263,0.090909,0.111111
...,...,...,...,...,...,...,...,...,...,...
5167,0.0,0.027027,0.000000,0.066667,0.142857,0.0,0.125,0.052632,0.090909,0.111111
5168,0.0,0.054054,0.000000,0.066667,0.000000,0.0,0.000,0.052632,0.000000,0.000000
5169,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
5170,0.0,0.054054,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000


In [136]:
tr = SelectPercentile(f_classif, percentile=0.333).fit(X_scaled, y)
X_new = pd.DataFrame(tr.transform(X_scaled), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
1,0.0,0.567568,0.272727,0.200000,0.142857,0.0,0.125,0.157895,0.090909,0.111111
2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.181818,0.133333,0.000000,0.0,0.125,0.157895,0.090909,0.111111
4,0.0,0.000000,0.181818,0.066667,0.000000,0.0,0.125,0.105263,0.090909,0.111111
...,...,...,...,...,...,...,...,...,...,...
5167,0.0,0.027027,0.000000,0.066667,0.142857,0.0,0.125,0.052632,0.090909,0.111111
5168,0.0,0.054054,0.000000,0.066667,0.000000,0.0,0.000,0.052632,0.000000,0.000000
5169,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
5170,0.0,0.054054,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000


In [137]:
10/X_scaled.shape[1]*100

0.33333333333333337

In [138]:
STOPWORDS = stopwords.words('english')
for word in STOPWORDS:
    if word in X.columns:
        X = X.drop(word, axis=1)
X


Unnamed: 0,ect,hou,enron,com,gas,deal,meter,hpl,please,e,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,1,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1,24,27,1,3,1,0,0,0,2,141,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,2,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,22,10,0,0,0,2,1,0,0,79,...,0,0,0,0,0,0,0,0,0,0
4,17,9,0,0,2,0,3,0,1,71,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,0,0,0,0,0,0,0,0,29,...,0,0,0,0,0,0,0,0,0,0
5168,11,3,1,3,5,0,0,0,1,218,...,0,0,0,0,0,0,0,0,1,0
5169,1,0,0,0,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
5170,1,0,0,4,0,1,0,0,1,45,...,0,0,0,0,0,0,0,0,1,0


In [139]:
tr = SelectPercentile(f_classif, percentile=0.333).fit(X, y)
best_feats = tr.get_feature_names_out()
X[best_feats]

Unnamed: 0,hpl,forwarded,daren,attached,thanks,subject,able,hanks,thank,hp
0,0,0,0,0,0,0,0,0,0,0
1,0,3,3,1,1,3,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0
3,0,2,2,0,1,3,0,1,1,0
4,0,2,1,0,1,2,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
5167,0,0,1,1,1,1,0,1,1,0
5168,0,0,1,0,0,1,4,0,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,0,0,0,0,0,1,0,0,0


In [140]:
X = X_scaled

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [152]:
models = [KNeighborsClassifier(), LogisticRegression(), MultinomialNB(), RandomForestClassifier()]  
best_model = None
best_score = 0

for model in models:
    scores = cross_validate(model, X, y, cv=5, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc'))
    score = max(scores['test_roc_auc'])
    if score > best_score:
        best_model = model
        best_score = score

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), roc_auc_score(y_pred, y_test))


0.966183574879227 0.9397590361445783 0.9605714524765728
