In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [25]:
class EmailPreprocess:
    def __init__(self, filename):
        self.filename = filename
        self.data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = None
        self.feature_selector = None
        self.feature_selection_methods = ['KBest', 'Percentile']
        
    def load_data(self):
        self.data = pd.read_csv(self.filename)

    def check_missing_values(self):
        cols = self.data.columns
        cols_with_missing_values = [col for col in cols if self.data[col].isna().sum()]
        if cols_with_missing_values:
            for col in cols_with_missing_values:
                self.data[col] = self.data[col].fillna(0)   
        
    def split_dataset(self):
        self.X = self.data.iloc[:, 1:-1] # 提取标志
        self.y = self.data.iloc[:, -1] # 提取标签

    def feature_scaling(self, method='MinMax'):
        """
        Метод для масштабирования признаков.
        method: 'MinMax', 'Standard'
        """
        if (method == 'Standard'):
            self.scaler = StandardScaler().fit(self.X)
        elif (method == 'MinMax'):
            self.scaler = MinMaxScaler().fit(self.X)
        else:
            return NotImplemented
        self.X = pd.DataFrame(self.scaler.transform(self.X), columns=self.X.columns) # 缩放功能

    def train_test_split(self, test_size=0.2, random_state=42):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)

    def feature_selection(self, method='KBest', k=10):
        """
        Метод для отбора k лучших признаков.
        method: 'Kbest', 'Percentile'.
        Записывает лучше признаки в атрибут X_best_features
        """
        if (method == 'Percentile'):
            percentile = k/self.X.shape[1]*100
            self.feature_selector = SelectPercentile(f_classif, percentile=percentile).fit(self.X, self.y)
        elif (method == 'KBest'):
            self.feature_selector = SelectKBest(f_classif, k=k).fit(self.X, self.y)
        else:
            return NotImplemented
        self.X_best_feats = pd.DataFrame(self.feature_selector.transform(self.X), columns=self.feature_selector.get_feature_names_out()) # 最佳功能

    def text_cleaning(self):
        """
        Метод для очистки стоп слов 
        """
        STOPWORDS = stopwords.words('english')
        for word in STOPWORDS:
            if word in self.X.columns:
                self.X = self.X.drop(word, axis=1)

    def preprocess(self):
        self.load_data()
        self.check_missing_values()
        self.split_dataset()
        self.text_cleaning()
        self.feature_scaling()
        self.train_test_split()



In [36]:
class EmailClassifier(EmailPreprocess):
    def __init__(self, filename):
        super().__init__(filename)
        self.preprocess()
        self.best_fitted_model = None
        self.best_roc_auc_score = None  # 我们将根据roc auc选择最佳型号
        self.clusterer = None

    def find_best_model_and_params(self):
        """
        Метод для поиска лучшей модели и ее параметров.
        Записывает лучшую модель в self.best_fitted_model.
        """
        models = [KNeighborsClassifier(), LogisticRegression(), MultinomialNB(), RandomForestClassifier()]  
        self.best_roc_auc_score = 0

        for model in models:
            scores = cross_validate(model, self.X, self.y, cv=3, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc'))
            roc_auc_score = max(scores['test_roc_auc'])
            if roc_auc_score > self.best_roc_auc_score:
                self.best_roc_auc_score = roc_auc_score
                self.best_fitted_model = model

        self.best_fitted_model.fit(self.X_train, self.y_train)

    def evaluate_test_metrics(self):
        """
        Метод для оценки метрик лучшей модели на тестовом наборе данных.
        Возвращает словарь с метриками.
        """
        if self.best_fitted_model is None:
            raise ValueError("Лучшая модель не найдена. Сначала запустите find_best_model_and_params()")
        
        y_pred = self.best_fitted_model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
        }
    
    def cluster_emails(self, method='kmeans'):
        """
        Метод для кластеризации электронных писем на два кластера.
        method: 'kmeans', 'hierarchical' или 'dbscan'.
        Возвращает два значения: процент данных, соответствующих спаму в каждом
        кластере.
        """
        if method == 'kmeans':
            self.clusterer = KMeans(n_clusters=2).fit(self.X)
        elif method == 'dbscan':
            self.clusterer = DBSCAN(eps=4).fit(self.X)
        elif method == 'hierarchal':
            self.clusterer = AgglomerativeClustering(n_clusters=2).fit(self.X)
        else:
            return NotImplemented

        y_and_clusters = pd.concat([self.y, pd.Series(self.clusterer.labels_, name='Cluster')], axis=1)
        cond0 = (y_and_clusters['Cluster'] == 0) & (y_and_clusters['Prediction'] == 1)
        cond1 = (y_and_clusters['Cluster'] == 1) & (y_and_clusters['Prediction'] == 1)

        return (round(y_and_clusters[cond0].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 0].shape[0] * 100, 2), 
                round(y_and_clusters[cond1].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 1].shape[0] * 100, 2))



In [37]:
clf = EmailClassifier('emails.csv')
clf.find_best_model_and_params()


In [38]:
print(clf.evaluate_test_metrics())

{'accuracy': 0.9758454106280193, 'f1': 0.9584026622296173, 'precision': 0.9442622950819672, 'recall': 0.972972972972973}


In [41]:
clf.cluster_emails()

(79.69, 28.37)

In [4]:
filename = 'emails.csv'
data = pd.read_csv(filename)
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [5]:
X = data.iloc[:, 1:-1] # Извлечь признаки
y = data.iloc[:, -1] # Извлечь метки

In [6]:
cols_with_missing_values = [col for col in data.columns if data[col].isna().sum()]
cols_with_missing_values

[]

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [12]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [13]:
X

Unnamed: 0,ect,hou,enron,com,gas,deal,meter,hpl,please,e,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,1,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1,24,27,1,3,1,0,0,0,2,141,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,2,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,22,10,0,0,0,2,1,0,0,79,...,0,0,0,0,0,0,0,0,0,0
4,17,9,0,0,2,0,3,0,1,71,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,0,0,0,0,0,0,0,0,29,...,0,0,0,0,0,0,0,0,0,0
5168,11,3,1,3,5,0,0,0,1,218,...,0,0,0,0,0,0,0,0,1,0
5169,1,0,0,0,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
5170,1,0,0,4,0,1,0,0,1,45,...,0,0,0,0,0,0,0,0,1,0


In [14]:
X_scaled

Unnamed: 0,ect,hou,enron,com,gas,deal,meter,hpl,please,e,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.001290,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.067055,0.161677,0.006849,0.004132,0.034483,0.00,0.000000,0.0,0.166667,0.060189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
2,0.000000,0.000000,0.000000,0.000000,0.068966,0.00,0.000000,0.0,0.000000,0.000860,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.061224,0.059880,0.000000,0.000000,0.000000,0.08,0.034483,0.0,0.000000,0.033534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.046647,0.053892,0.000000,0.000000,0.068966,0.00,0.103448,0.0,0.083333,0.030095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,0.002915,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.012038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5168,0.029155,0.017964,0.006849,0.004132,0.172414,0.00,0.000000,0.0,0.083333,0.093293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0
5169,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.004729,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5170,0.000000,0.000000,0.000000,0.005510,0.000000,0.04,0.000000,0.0,0.083333,0.018917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008772,0.0


In [10]:
tr = SelectKBest(f_classif, k=10).fit(X, y)
X_new = pd.DataFrame(tr.transform(X), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0,0,0,0,0,0,0,0,0,0
1,0,21,3,3,1,0,1,3,1,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,2,2,0,0,1,3,1,1
4,0,0,2,1,0,0,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...
5167,0,1,0,1,1,0,1,1,1,1
5168,0,2,0,1,0,0,0,1,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,2,0,0,0,0,0,0,0,0


In [135]:
tr = SelectKBest(f_classif, k=10).fit(X_scaled, y)
X_new = pd.DataFrame(tr.transform(X_scaled), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
1,0.0,0.567568,0.272727,0.200000,0.142857,0.0,0.125,0.157895,0.090909,0.111111
2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.181818,0.133333,0.000000,0.0,0.125,0.157895,0.090909,0.111111
4,0.0,0.000000,0.181818,0.066667,0.000000,0.0,0.125,0.105263,0.090909,0.111111
...,...,...,...,...,...,...,...,...,...,...
5167,0.0,0.027027,0.000000,0.066667,0.142857,0.0,0.125,0.052632,0.090909,0.111111
5168,0.0,0.054054,0.000000,0.066667,0.000000,0.0,0.000,0.052632,0.000000,0.000000
5169,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
5170,0.0,0.054054,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000


In [136]:
tr = SelectPercentile(f_classif, percentile=0.333).fit(X_scaled, y)
X_new = pd.DataFrame(tr.transform(X_scaled), columns=tr.get_feature_names_out())
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
1,0.0,0.567568,0.272727,0.200000,0.142857,0.0,0.125,0.157895,0.090909,0.111111
2,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.181818,0.133333,0.000000,0.0,0.125,0.157895,0.090909,0.111111
4,0.0,0.000000,0.181818,0.066667,0.000000,0.0,0.125,0.105263,0.090909,0.111111
...,...,...,...,...,...,...,...,...,...,...
5167,0.0,0.027027,0.000000,0.066667,0.142857,0.0,0.125,0.052632,0.090909,0.111111
5168,0.0,0.054054,0.000000,0.066667,0.000000,0.0,0.000,0.052632,0.000000,0.000000
5169,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000
5170,0.0,0.054054,0.000000,0.000000,0.000000,0.0,0.000,0.000000,0.000000,0.000000


In [137]:
10/X_scaled.shape[1]*100

0.33333333333333337

In [11]:
STOPWORDS = stopwords.words('english')
for word in STOPWORDS:
    if word in X.columns:
        X = X.drop(word, axis=1)
X


Unnamed: 0,ect,hou,enron,com,gas,deal,meter,hpl,please,e,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,1,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1,24,27,1,3,1,0,0,0,2,141,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,2,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,22,10,0,0,0,2,1,0,0,79,...,0,0,0,0,0,0,0,0,0,0
4,17,9,0,0,2,0,3,0,1,71,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,0,0,0,0,0,0,0,0,29,...,0,0,0,0,0,0,0,0,0,0
5168,11,3,1,3,5,0,0,0,1,218,...,0,0,0,0,0,0,0,0,1,0
5169,1,0,0,0,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
5170,1,0,0,4,0,1,0,0,1,45,...,0,0,0,0,0,0,0,0,1,0


In [139]:
tr = SelectPercentile(f_classif, percentile=0.333).fit(X, y)
best_feats = tr.get_feature_names_out()
X[best_feats]

Unnamed: 0,hpl,forwarded,daren,attached,thanks,subject,able,hanks,thank,hp
0,0,0,0,0,0,0,0,0,0,0
1,0,3,3,1,1,3,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0
3,0,2,2,0,1,3,0,1,1,0
4,0,2,1,0,1,2,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
5167,0,0,1,1,1,1,0,1,1,0
5168,0,0,1,0,0,1,4,0,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,0,0,0,0,0,1,0,0,0


In [140]:
X = X_scaled

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [152]:
models = [KNeighborsClassifier(), LogisticRegression(), MultinomialNB(), RandomForestClassifier()]  
best_model = None
best_score = 0

for model in models:
    scores = cross_validate(model, X, y, cv=5, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc'))
    score = max(scores['test_roc_auc'])
    if score > best_score:
        best_model = model
        best_score = score

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), roc_auc_score(y_pred, y_test))


0.966183574879227 0.9397590361445783 0.9605714524765728


In [17]:
clusterer = AgglomerativeClustering(n_clusters=2).fit(X_scaled)
set(clusterer.labels_)

{0, 1}

In [22]:
y_and_clusters = pd.concat([y, pd.Series(clusterer.labels_, name='Cluster')], axis=1)
y_and_clusters

Unnamed: 0,Prediction,Cluster
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
5167,0,0
5168,0,0
5169,1,0
5170,1,0


In [23]:
cond0 = (y_and_clusters['Cluster'] == 0) & (y_and_clusters['Prediction'] == 1)
cond1 = (y_and_clusters['Cluster'] == 1) & (y_and_clusters['Prediction'] == 1)
y_and_clusters[cond0].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 0].shape[0], y_and_clusters[cond1].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 1].shape[0]            

(0.2851859061709169, 1.0)

In [202]:
y_and_clusters[y_and_clusters['Cluster'] == 0].shape[0], y_and_clusters[y_and_clusters['Cluster'] == 1].shape[0]

(5083, 89)

In [251]:
clusterer = DBSCAN(eps=4).fit(X_scaled)
set(clusterer.labels_)

{-1, 0, 1}

In [252]:
y_and_clusters = pd.concat([y, pd.Series(clusterer.labels_, name='Cluster')], axis=1)
y_and_clusters

Unnamed: 0,Prediction,Cluster
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
5167,0,0
5168,0,0
5169,1,0
5170,1,0


In [253]:
cond0 = (y_and_clusters['Cluster'] == 0) & (y_and_clusters['Prediction'] == 1)
cond1 = (y_and_clusters['Cluster'] == 1) & (y_and_clusters['Prediction'] == 1)
y_and_clusters[cond0].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 0].shape[0], y_and_clusters[cond1].shape[0]/y_and_clusters[y_and_clusters['Cluster'] == 1].shape[0]

(0.2805327868852459, 1.0)