In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from nltk.corpus import stopwords

In [91]:
class EmailPreprocess:
    """Step-by-step preprocessing pipeline for a CSV e-mail dataset.

    ``split_dataset`` treats the first column as a non-feature identifier,
    the last column as the label and everything in between as features; the
    file layout itself is not validated.

    Attributes:
        filename: path (or file-like object) handed to ``pandas.read_csv``.
        data: the full DataFrame once ``load_data`` has run.
        X, y: feature frame and label series set by ``split_dataset``.
        X_train, X_test, y_train, y_test: set by ``train_test_split``.
        scaler: fitted scaler set by ``feature_scaling``.
        feature_selector: fitted selector set by ``feature_selection``.
        X_best_feats: selected feature columns set by ``feature_selection``.
        feature_selection_methods: names accepted by ``feature_selection``.
    """

    def __init__(self, filename):
        """Store the data source; nothing is read until ``load_data``."""
        self.filename = filename
        self.data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = None
        self.feature_selector = None
        # Initialised here so the attribute exists before feature_selection().
        self.X_best_feats = None
        self.feature_selection_methods = ['KBest', 'Percentile']

    def load_data(self):
        """Read ``self.filename`` as CSV into ``self.data``."""
        self.data = pd.read_csv(self.filename)

    def check_missing_values(self):
        """Replace missing values with 0 in every column that contains any."""
        has_missing = self.data.isna().any().to_numpy()
        for col in self.data.columns[has_missing]:
            self.data[col] = self.data[col].fillna(0)

    def split_dataset(self):
        """Split ``self.data`` into features ``self.X`` and labels ``self.y``."""
        self.X = self.data.iloc[:, 1:-1]  # features: all but first and last column
        self.y = self.data.iloc[:, -1]  # labels: last column

    def feature_scaling(self, method='MinMax'):
        """Fit a scaler on ``self.X`` and replace ``self.X`` by its scaled version.

        Args:
            method: ``'MinMax'`` (default) or ``'Standard'``.

        Raises:
            ValueError: if ``method`` is not one of the supported names.

        NOTE(review): ``preprocess`` calls this before the train/test split, so
        the scaler statistics are computed on rows that later land in the test
        set. Kept as-is for backward compatibility; confirm this is intended.
        """
        if method == 'Standard':
            scaler = StandardScaler()
        elif method == 'MinMax':
            scaler = MinMaxScaler()
        else:
            # Fail loudly: returning NotImplemented left X silently unscaled.
            raise ValueError(
                f"Unsupported scaling method {method!r}; expected 'MinMax' or 'Standard'"
            )
        self.scaler = scaler.fit(self.X)
        # Keep the original index so the scaled rows stay aligned with self.y.
        self.X = pd.DataFrame(
            self.scaler.transform(self.X),
            columns=self.X.columns,
            index=self.X.index,
        )

    def train_test_split(self, test_size=0.2, random_state=42):
        """Split ``self.X`` / ``self.y`` into train and test parts.

        The call inside the body resolves to the module-level
        ``train_test_split`` function, not to this method.

        Args:
            test_size: forwarded to the module-level ``train_test_split``.
            random_state: forwarded to the module-level ``train_test_split``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )

    def feature_selection(self, method='KBest', k=10):
        """Select features of ``self.X`` by ``f_classif`` score into ``self.X_best_feats``.

        Args:
            method: ``'KBest'`` (default) or ``'Percentile'``.
            k: number of features to keep. For ``'Percentile'`` it is converted
                to the percentage of ``self.X`` columns that ``k`` represents.

        Raises:
            ValueError: if ``method`` is not one of the supported names.

        NOTE(review): the selector is fitted on the whole of ``self.X`` /
        ``self.y``, not on the training split only - confirm this is intended.
        """
        if method == 'Percentile':
            percentile = k / self.X.shape[1] * 100
            selector = SelectPercentile(f_classif, percentile=percentile)
        elif method == 'KBest':
            selector = SelectKBest(f_classif, k=k)
        else:
            # Fail loudly: returning NotImplemented left the selection undone.
            raise ValueError(
                f"Unsupported feature selection method {method!r}; "
                f"expected one of {self.feature_selection_methods}"
            )
        self.feature_selector = selector.fit(self.X, self.y)
        # Best features, kept on the same index as self.X / self.y.
        self.X_best_feats = pd.DataFrame(
            self.feature_selector.transform(self.X),
            columns=self.feature_selector.get_feature_names_out(),
            index=self.X.index,
        )

    def text_cleaning(self):
        """Drop every column of ``self.X`` whose name is an NLTK English stop word.

        Relies on ``nltk.corpus.stopwords`` being able to load its English
        word list.
        """
        stop_words = set(stopwords.words('english'))
        # One drop call instead of one DataFrame copy per stop word.
        stopword_columns = [col for col in self.X.columns if col in stop_words]
        self.X = self.X.drop(columns=stopword_columns)

    def preprocess(self):
        """Run the whole pipeline with default settings.

        Order: load, fill missing values, split X/y, drop stop-word columns,
        scale features, train/test split.
        """
        self.load_data()
        self.check_missing_values()
        self.split_dataset()
        self.text_cleaning()
        self.feature_scaling()
        self.train_test_split()


In [92]:
# Run the pipeline stages one by one (instead of preprocess()) so the
# intermediate state can be inspected; no train/test split is done here.
preprocessor = EmailPreprocess('emails.csv')
pipeline_steps = (
    preprocessor.load_data,
    preprocessor.check_missing_values,
    preprocessor.split_dataset,
    preprocessor.text_cleaning,
    preprocessor.feature_scaling,
)
for run_step in pipeline_steps:
    run_step()
preprocessor.X  # not the last expression of the cell, so it is not rendered
preprocessor.scaler  # last expression: shown as the cell output

In [3]:
# Load the raw e-mail dataset from CSV and preview its first rows.
filename = 'emails.csv'
data = pd.read_csv(filename)
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Split the frame positionally: the first column ("Email No.") is skipped,
# the last column ("Prediction") is the label.
X = data.iloc[:, 1:-1] # extract the features
y = data.iloc[:, -1] # extract the labels

In [5]:
# Names of all columns of `data` that contain at least one missing value.
cols_with_missing_values = data.columns[data.isna().any()].tolist()
cols_with_missing_values

[]

In [19]:
# Min-max scaling of every feature column (MinMaxScaler default range).
# MinMaxScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
scaler = MinMaxScaler()
# fit_transform == fit followed by transform on the same data; the index is
# passed through so X_scaled stays row-aligned with X and y.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [26]:
# Standardisation of every feature column with StandardScaler.
# This rebinds the `scaler` / `X_scaled` names set by the MinMax cell, so the
# cells below work on the standardised frame. StandardScaler is already
# imported in the notebook's import cell, so it is not re-imported here.
scaler = StandardScaler()
# fit_transform == fit followed by transform on the same data; the index is
# passed through so X_scaled stays row-aligned with X and y.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [36]:
# Display the unscaled feature frame X.
X

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,0,1,0


In [28]:
# Display the scaled feature frame produced by the most recent scaling cell.
X_scaled

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,-0.565449,-0.649083,-0.293895,-0.508752,-0.667663,-0.421725,-0.611169,-0.571751,-0.290556,-0.549800,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
1,0.115757,0.714508,1.337337,0.483741,0.614369,-0.100659,0.530831,-0.339949,3.584743,0.383809,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971
2,-0.565449,-0.649083,-0.293895,-0.508752,-0.667663,-0.421725,-0.542649,-0.571751,-0.290556,-0.342331,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
3,-0.565449,-0.124625,1.195490,-0.508752,0.400697,-0.261192,-0.051589,-0.108147,1.144740,-0.497933,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
4,0.030606,-0.019733,0.840875,-0.343336,0.400697,-0.100659,0.016931,-0.571751,1.001210,-0.394198,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,-0.395148,-0.439299,-0.222972,-0.012505,-0.667663,-0.421725,-0.268569,-0.571751,-0.290556,-0.290464,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
5168,2.414828,2.182990,0.415336,-0.177921,0.614369,0.380940,1.090411,0.355457,0.140033,0.643145,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971
5169,-0.565449,-0.649083,-0.293895,-0.343336,-0.667663,-0.421725,-0.508389,-0.571751,-0.290556,-0.497933,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
5170,-0.395148,0.085158,-0.293895,-0.508752,-0.240319,-0.261192,-0.314249,-0.108147,-0.290556,-0.134863,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971


In [51]:
# Keep the 10 columns of the unscaled X that score highest under f_classif.
tr = SelectKBest(score_func=f_classif, k=10)
tr.fit(X, y)
X_new = pd.DataFrame(
    data=tr.transform(X),
    columns=tr.get_feature_names_out(),
)
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,0,0,0,0,0,0,0,0,0,0
1,0,21,3,3,1,0,1,3,1,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,2,2,0,0,1,3,1,1
4,0,0,2,1,0,0,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...
5167,0,1,0,1,1,0,1,1,1,1
5168,0,2,0,1,0,0,0,1,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,2,0,0,0,0,0,0,0,0


In [53]:
# Same top-10 f_classif selection, this time fitted on the scaled features.
tr = SelectKBest(score_func=f_classif, k=10)
tr.fit(X_scaled, y)
X_new = pd.DataFrame(
    data=tr.transform(X_scaled),
    columns=tr.get_feature_names_out(),
)
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
1,-0.418754,6.281370,3.968817,2.641826,1.438161,-0.338738,0.864895,1.921206,0.681274,0.697696
2,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
3,-0.418754,-0.492384,2.525205,1.638041,-0.387156,-0.338738,0.864895,1.921206,0.681274,0.697696
4,-0.418754,-0.492384,2.525205,0.634256,-0.387156,-0.338738,0.864895,1.134034,0.681274,0.697696
...,...,...,...,...,...,...,...,...,...,...
5167,-0.418754,-0.169824,-0.362020,0.634256,1.438161,-0.338738,0.864895,0.346861,0.681274,0.697696
5168,-0.418754,0.152736,-0.362020,0.634256,-0.387156,-0.338738,-0.508949,0.346861,-0.490508,-0.569328
5169,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
5170,-0.418754,0.152736,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328


In [61]:
# Percentile-based selection on the scaled features. 0.333 (percent) is a
# rounded form of 10 / X_scaled.shape[1] * 100, i.e. roughly 10 columns.
tr = SelectPercentile(score_func=f_classif, percentile=0.333)
tr.fit(X_scaled, y)
X_new = pd.DataFrame(
    data=tr.transform(X_scaled),
    columns=tr.get_feature_names_out(),
)
X_new

Unnamed: 0,hpl,our,forwarded,daren,attached,more,thanks,subject,hanks,thank
0,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
1,-0.418754,6.281370,3.968817,2.641826,1.438161,-0.338738,0.864895,1.921206,0.681274,0.697696
2,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
3,-0.418754,-0.492384,2.525205,1.638041,-0.387156,-0.338738,0.864895,1.921206,0.681274,0.697696
4,-0.418754,-0.492384,2.525205,0.634256,-0.387156,-0.338738,0.864895,1.134034,0.681274,0.697696
...,...,...,...,...,...,...,...,...,...,...
5167,-0.418754,-0.169824,-0.362020,0.634256,1.438161,-0.338738,0.864895,0.346861,0.681274,0.697696
5168,-0.418754,0.152736,-0.362020,0.634256,-0.387156,-0.338738,-0.508949,0.346861,-0.490508,-0.569328
5169,-0.418754,-0.492384,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328
5170,-0.418754,0.152736,-0.362020,-0.369530,-0.387156,-0.338738,-0.508949,-0.440311,-0.490508,-0.569328


In [60]:
# Percentage of the feature columns that corresponds to 10 features; this is
# the value approximated by the hard-coded percentile=0.333 in the
# SelectPercentile cells.
10/X_scaled.shape[1]*100

0.33333333333333337

In [68]:
# Remove every feature column whose name is an NLTK English stop word.
STOPWORDS = stopwords.words('english')
stopword_set = set(STOPWORDS)  # set membership instead of scanning the list per column
# One drop call: the previous per-word loop copied the whole frame on each drop.
X = X.drop(columns=[col for col in X.columns if col in stopword_set])
X


Unnamed: 0,ect,hou,enron,com,gas,deal,meter,hpl,please,e,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,1,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1,24,27,1,3,1,0,0,0,2,141,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,2,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,22,10,0,0,0,2,1,0,0,79,...,0,0,0,0,0,0,0,0,0,0
4,17,9,0,0,2,0,3,0,1,71,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,0,0,0,0,0,0,0,0,29,...,0,0,0,0,0,0,0,0,0,0
5168,11,3,1,3,5,0,0,0,1,218,...,0,0,0,0,0,0,0,0,1,0
5169,1,0,0,0,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
5170,1,0,0,4,0,1,0,0,1,45,...,0,0,0,0,0,0,0,0,1,0


In [70]:
# Percentile-based f_classif selection on X after the stop-word columns were
# dropped (unscaled values); percentile is given in percent.
tr = SelectPercentile(score_func=f_classif, percentile=0.333)
tr.fit(X, y)
X_new = pd.DataFrame(
    data=tr.transform(X),
    columns=tr.get_feature_names_out(),
)
X_new

Unnamed: 0,hpl,forwarded,daren,attached,thanks,subject,able,hanks,thank,hp
0,0,0,0,0,0,0,0,0,0,0
1,0,3,3,1,1,3,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0
3,0,2,2,0,1,3,0,1,1,0
4,0,2,1,0,1,2,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
5167,0,0,1,1,1,1,0,1,1,0
5168,0,0,1,0,0,1,4,0,0,0
5169,0,0,0,0,0,0,0,0,0,0
5170,0,0,0,0,0,0,1,0,0,0
