In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from model.feature_selector import *
from model.selector import *

We will work with the 'SMS' dataset.

In [2]:
# Load the tab-separated SMS dataset.
# A forward slash is used because the original 'data\SMS.tsv' contained the
# invalid escape sequence '\S' (Python warns about it) and only resolved on Windows.
df = pd.read_csv('data/SMS.tsv', sep='\t')
df.head()

  df = pd.read_csv('data\SMS.tsv', sep='\t')


Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Prepare data**

In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Transform the 'class' column into numeric labels

In [4]:
# Inspect the raw target column before encoding it.
# Series.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df['class'].info()
print(df['class'].unique())

# Encode the string classes as integers and keep them in a separate 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['class'])
df['label']

<class 'pandas.core.series.Series'>
RangeIndex: 5572 entries, 0 to 5571
Series name: class
Non-Null Count  Dtype 
--------------  ----- 
5572 non-null   object
dtypes: object(1)
memory usage: 43.7+ KB
None
['ham' 'spam']


0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int32

In [5]:
# Hold out 25% of the messages for testing; the fixed random_state keeps the split repeatable.
train_exdog, test_exdog, train_ans, test_ans = train_test_split(df['text'], df['label'], test_size=0.25,
                                                                random_state=42)
# Build the bag-of-words vocabulary from the training texts only, then apply
# that same vocabulary to the test texts.
vectorizer = CountVectorizer()
train_exdog_vector = vectorizer.fit_transform(train_exdog)
test_exdog_vector = vectorizer.transform(test_exdog)

# Token behind each column of the vectorized matrices; used later to map
# selected column indices back to readable feature names.
feature_names = vectorizer.get_feature_names_out()
print(len(feature_names))
feature_names

7490


array(['00', '000', '000pes', ..., 'zouk', 'zyada', 'èn'], dtype=object)

Constants, global variables, and common helpers to train a model and compute its score

In [6]:
# Number of features every selector is asked to keep.
FEATURES_TO_SELECT_COUNT = 30
# Registry of selection results keyed by selector name. It starts with a
# 'NO_SELECTOR' baseline whose indices span every column of the vectorized data.
SELECTED_FEATURES_BY_METHOD: Dict[str, FeatureSelectorDescr] = {
    'NO_SELECTOR': FeatureSelectorDescr('NO_SELECTOR', range(train_exdog_vector.shape[1]), feature_names)}


def evaluate_model(model, train_x, test_x, ans_train, ans_test, metric: Callable[..., float]):
    """Fit ``model`` on the training data and score its predictions on the test data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_x: Training feature matrix.
        test_x: Test feature matrix.
        ans_train: Training targets.
        ans_test: Test targets.
        metric: Callable invoked as ``metric(ans_test, predictions)``.

    Returns:
        Whatever ``metric`` returns for the test targets and the predictions.

    Note:
        The metric always receives the output of ``model.predict``, never
        probabilities or decision scores.
    """
    # The annotation is Callable[..., float] because the former
    # Callable[[[float], [float]], float] used list literals where types are
    # required, which is not a valid type hint.
    model.fit(train_x, ans_train)
    y_pred = model.predict(test_x)
    return metric(ans_test, y_pred)


def select_features(selector: SelectorAbc, selector_name: str) -> FeatureSelectorDescr:
    """Run ``selector`` on the notebook's vectorized data and register the outcome.

    The selector is asked for ``FEATURES_TO_SELECT_COUNT`` column indices. The
    indices are mapped to their names through ``feature_names`` and the
    resulting descriptor is stored in ``SELECTED_FEATURES_BY_METHOD`` under
    ``selector_name``.

    Args:
        selector: Object whose ``select`` method returns the chosen column indices.
        selector_name: Key (and descriptor name) under which the result is stored.

    Returns:
        The descriptor that was stored in the registry.
    """
    chosen_indices = selector.select(
        train_exdog_vector,
        test_exdog_vector,
        train_ans,
        test_ans,
        FEATURES_TO_SELECT_COUNT,
    )

    # Translate column indices back into readable feature names.
    chosen_names = []
    for column in chosen_indices:
        chosen_names.append(feature_names[column])

    descriptor = FeatureSelectorDescr(selector_name, chosen_indices, chosen_names)
    SELECTED_FEATURES_BY_METHOD[selector_name] = descriptor
    return descriptor

## Implement 3 feature selection methods
All implementations are located in `model\selector.py`

### Embedded : SVM-RFE

In [7]:
# Run the SVM_RFE selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'EMBEDDED_FEATURES'.
EMBEDDED_FEATURES = select_features(SVM_RFE(), 'EMBEDDED_FEATURES')
print("Selected features:", EMBEDDED_FEATURES.selected_features)

[1;96m [INFO][0m SVM_RFE 2024-06-10 00:42:50.400862: Number of features selected: 7475 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:43:10.311530: Number of features selected: 6725 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:43:30.006078: Number of features selected: 5975 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:44:00.570057: Number of features selected: 5225 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:44:31.210788: Number of features selected: 4475 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:45:01.474930: Number of features selected: 3725 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:45:31.921986: Number of features selected: 2975 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:45:59.538468: Number of features selected: 2225 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:46:23.334756: Number of features selected: 1475 / 30
[1;96m [INFO][0m SVM_RFE 2024-06-10 00:46:38.576246: Number of features selected: 725 / 30
Selected features: ['10p', '150p', '16', '20p', '800', 'alert

### Wrapper: forward sequential selection
*NB: forward selection is much faster than backward elimination here, because there are ~7k features but only 30 have to be selected*

In [8]:
# Run the forward wrapper selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'WRAPPER_FEATURES'.
WRAPPER_FEATURES = select_features(WrapperForwardSelector(), 'WRAPPER_FEATURES')
print("Wrapper Method Selected Features:")
print(WRAPPER_FEATURES.selected_features)

[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:47:21.539571: On epoch 0/30	 elapsed 37.632 s	Added feature 1549 with scores 0.89375
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:48:01.963249: On epoch 1/30	 elapsed 40.422 s	Added feature 6849 with scores 0.91744
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:48:45.134358: On epoch 2/30	 elapsed 43.171 s	Added feature 4244 with scores 0.93037
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:49:32.143906: On epoch 3/30	 elapsed 47.010 s	Added feature 3881 with scores 0.93898
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:50:23.449935: On epoch 4/30	 elapsed 51.305 s	Added feature 7388 with scores 0.94688
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:51:15.893396: On epoch 5/30	 elapsed 52.442 s	Added feature 293 with scores 0.95190
[1;96m [INFO][0m WrapperForwardSelector 2024-06-10 00:52:06.756904: On epoch 6/30	 elapsed 50.864 s	Added feature 302 with scores 0.95693
[1;96m [INFO]

### Filter

In [9]:
# Run the filter selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'FILTER_FEATURES'.
FILTER_FEATURES = select_features(FilterSelector(), 'FILTER_FEATURES')
print("Filter Method Selected Features:")
FILTER_FEATURES.selected_features

Filter Method Selected Features:


['150p',
 '16',
 '18',
 '50',
 '500',
 'call',
 'cash',
 'claim',
 'co',
 'cs',
 'free',
 'guaranteed',
 'mobile',
 'nokia',
 'now',
 'or',
 'prize',
 'reply',
 'service',
 'stop',
 'text',
 'to',
 'tone',
 'txt',
 'uk',
 'urgent',
 'win',
 'won',
 'www',
 'your']

## 3 library methods 

### 1. SelectFromModel using RandomForestClassifier
It is an embedded method that uses the feature importances calculated by the model and selects the most valuable features

In [10]:
# Run the SelectFromModelRF selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'SFM_FEATURES'.
SFM_FEATURES = select_features(SelectFromModelRF(), 'SFM_FEATURES')

print("SelectFromModel with RandomForestClassifier Selected Features:")
SFM_FEATURES.selected_features

SelectFromModel with RandomForestClassifier Selected Features:


['1000',
 '150p',
 '18',
 '50',
 '500',
 'call',
 'cash',
 'claim',
 'co',
 'com',
 'customer',
 'free',
 'guaranteed',
 'mobile',
 'new',
 'now',
 'or',
 'prize',
 'reply',
 'ringtone',
 'service',
 'stop',
 'text',
 'to',
 'txt',
 'uk',
 'urgent',
 'win',
 'www',
 'your']

### 2. VarianceThreshold
It is a simple filtering method

In [11]:
# Run the variance-threshold selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'VT_FEATURES'.
# NOTE(review): p=0.915 is an unexplained magic number — presumably tuned so that
# about FEATURES_TO_SELECT_COUNT features survive; confirm against model/selector.py.
VT_FEATURES = select_features(VarianceThresholdSelector(p=0.915), 'VT_FEATURES')
VT_FEATURES.selected_features

['and',
 'are',
 'at',
 'be',
 'but',
 'call',
 'can',
 'do',
 'for',
 'free',
 'gt',
 'have',
 'if',
 'in',
 'is',
 'it',
 'lt',
 'me',
 'my',
 'not',
 'now',
 'of',
 'on',
 'or',
 'so',
 'that',
 'the',
 'to',
 'ur',
 'we']

### 3. MutualInformation
It is a filtering method that uses mutual information to evaluate the importance of each feature

In [12]:
# Run the mutual-information selector; select_features also records the result in
# SELECTED_FEATURES_BY_METHOD under 'MI_FEATURES'.
MI_FEATURES = select_features(MutualInfoSelector(), 'MI_FEATURES')
print("Mutual Information Features:")
MI_FEATURES.selected_features

Mutual Information Features:


['1000',
 '150p',
 '16',
 '18',
 '50',
 '500',
 'call',
 'cash',
 'claim',
 'co',
 'cs',
 'free',
 'guaranteed',
 'mobile',
 'nokia',
 'now',
 'or',
 'prize',
 'reply',
 'service',
 'stop',
 'text',
 'to',
 'tone',
 'txt',
 'uk',
 'win',
 'won',
 'www',
 'your']

### Compare selection results


In [13]:
from itertools import zip_longest

from prettytable import PrettyTable

# Side-by-side table of the features chosen by each selector.
# The all-features 'NO_SELECTOR' baseline is left out of the comparison.
compared_selectors = {name: descr for name, descr in SELECTED_FEATURES_BY_METHOD.items()
                      if name != 'NO_SELECTOR'}

table = PrettyTable()
table.field_names = list(compared_selectors)

# sorted() builds a new list. Sorting selected_features in place would reorder the
# names while selected_indices keeps its original order, so the two would no
# longer line up element by element.
feature_columns = [sorted(descr.selected_features) for descr in compared_selectors.values()]

# zip_longest turns the per-selector columns into table rows and pads shorter
# columns with '', so selectors that return different feature counts still fit.
for table_row in zip_longest(*feature_columns, fillvalue=''):
    table.add_row(list(table_row))

print(table)

+------------------------------+------------------+-----------------+--------------+-------------+-------------+
|      EMBEDDED_FEATURES       | WRAPPER_FEATURES | FILTER_FEATURES | SFM_FEATURES | VT_FEATURES | MI_FEATURES |
+------------------------------+------------------+-----------------+--------------+-------------+-------------+
|             10p              |       000        |       150p      |     1000     |     and     |     1000    |
|             150p             |       150p       |        16       |     150p     |     are     |     150p    |
|              16              |        16        |        18       |      18      |      at     |      16     |
|             20p              |       8552       |        50       |      50      |      be     |      18     |
|             800              |     accounts     |       500       |     500      |     but     |      50     |
|            alert             |    afternoon     |       call      |     call     |     call   

## Selection impact on model results analysis

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Classifiers used to measure the effect of each feature subset, keyed by the
# column title shown in the result tables.
# random_state is fixed so the reported scores can be reproduced: the decision
# tree permutes features when searching for splits, and SVC shuffles data for
# its probability estimates.
classifiers = {
    'bias': MultinomialNB(),  # multinomial naive Bayes
    'dec_tree': DecisionTreeClassifier(random_state=42),
    'svc': SVC(probability=True, random_state=42)
}


In [15]:
def compare_selector_results(metric: Callable[..., float]) -> None:
    """Print a table of ``metric`` scores for every selector/classifier pair.

    Each entry of ``SELECTED_FEATURES_BY_METHOD`` contributes one row and each
    entry of ``classifiers`` one column. Every classifier is re-fitted on the
    columns chosen by the selector, and the cell shows the score together with
    the time spent in ``evaluate_model`` (fit plus predict).

    The score is also stored in ``selector.classifier_results`` under the
    classifier's name, so calling this function again with another metric
    overwrites the previously stored results.

    Args:
        metric: Callable invoked as ``metric(y_true, y_pred)``.
    """
    # Store a readable metric name; str(metric) on a function yields a repr
    # that embeds a memory address, e.g. "<function f1_score at 0x...>".
    metric_name = getattr(metric, '__name__', str(metric))

    res_table = PrettyTable()
    res_table.field_names = ['selector', *classifiers]

    for selector_name, selector in SELECTED_FEATURES_BY_METHOD.items():
        # Column slicing is the same for every classifier, so do it once per
        # selector and keep it out of the timed section.
        train_subset = train_exdog_vector[:, selector.selected_indices]
        test_subset = test_exdog_vector[:, selector.selected_indices]

        row = [f'{selector_name}']
        for classifier_name, classifier in classifiers.items():
            # perf_counter is a monotonic, high-resolution clock intended for
            # measuring short intervals.
            started = time.perf_counter()
            score = evaluate_model(classifier, train_subset, test_subset, train_ans, test_ans, metric)
            elapsed = time.perf_counter() - started

            with_selection = ModelResult(classifier_name, metric_name, score, True)
            selector.classifier_results[classifier_name] = with_selection
            row.append(f'{with_selection.result:.5f} / {elapsed:.6f} s')
        res_table.add_row(row)

    print(res_table)

In [16]:
# accuracy_score is imported explicitly here; previously it was only reachable
# through one of the wildcard imports at the top of the notebook.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

compare_selector_results(accuracy_score)

+-------------------+----------------------+----------------------+----------------------+
|      selector     |         bias         |       dec_tree       |         svc          |
+-------------------+----------------------+----------------------+----------------------+
|    NO_SELECTOR    | 0.98851 / 0.005004 s | 0.97416 / 0.175598 s | 0.98421 / 5.243571 s |
| EMBEDDED_FEATURES | 0.92678 / 0.002000 s | 0.93180 / 0.002577 s | 0.93037 / 0.124978 s |
|  WRAPPER_FEATURES | 0.95765 / 0.001000 s | 0.97775 / 0.005001 s | 0.97775 / 0.267844 s |
|  FILTER_FEATURES  | 0.95190 / 0.001265 s | 0.95765 / 0.006095 s | 0.96482 / 0.382791 s |
|    SFM_FEATURES   | 0.95190 / 0.002086 s | 0.96123 / 0.005006 s | 0.96339 / 0.395813 s |
|    VT_FEATURES    | 0.93826 / 0.003006 s | 0.93180 / 0.018036 s | 0.95908 / 1.102260 s |
|    MI_FEATURES    | 0.95118 / 0.001925 s | 0.95836 / 0.005338 s | 0.96411 / 0.397794 s |
+-------------------+----------------------+----------------------+----------------------+

In [17]:
# Repeat the comparison with the F1 score as the metric.
compare_selector_results(f1_score)

+-------------------+----------------------+----------------------+----------------------+
|      selector     |         bias         |       dec_tree       |         svc          |
+-------------------+----------------------+----------------------+----------------------+
|    NO_SELECTOR    | 0.95604 / 0.006093 s | 0.90000 / 0.183327 s | 0.93714 / 5.235034 s |
| EMBEDDED_FEATURES | 0.62774 / 0.002096 s | 0.66899 / 0.003997 s | 0.66436 / 0.126528 s |
|  WRAPPER_FEATURES | 0.81505 / 0.002981 s | 0.91317 / 0.004171 s | 0.91268 / 0.271974 s |
|  FILTER_FEATURES  | 0.78457 / 0.002998 s | 0.83237 / 0.005546 s | 0.86197 / 0.388351 s |
|    SFM_FEATURES   | 0.78594 / 0.002250 s | 0.85470 / 0.006870 s | 0.85634 / 0.391688 s |
|    VT_FEATURES    | 0.72785 / 0.003089 s | 0.73740 / 0.019072 s | 0.82569 / 1.100742 s |
|    MI_FEATURES    | 0.78065 / 0.002422 s | 0.83626 / 0.006728 s | 0.85795 / 0.384618 s |
+-------------------+----------------------+----------------------+----------------------+

In [18]:
# Repeat the comparison with ROC AUC as the metric.
# NOTE(review): evaluate_model passes the output of model.predict to the metric,
# so ROC AUC is computed from predicted class labels rather than from
# probabilities or decision scores — confirm that this is intended.
compare_selector_results(roc_auc_score)

+-------------------+----------------------+----------------------+----------------------+
|      selector     |         bias         |       dec_tree       |         svc          |
+-------------------+----------------------+----------------------+----------------------+
|    NO_SELECTOR    | 0.96608 / 0.005399 s | 0.93423 / 0.170022 s | 0.94086 / 5.179624 s |
| EMBEDDED_FEATURES | 0.73035 / 0.002539 s | 0.75599 / 0.003030 s | 0.75516 / 0.124480 s |
|  WRAPPER_FEATURES | 0.84822 / 0.002000 s | 0.93486 / 0.004452 s | 0.93258 / 0.282847 s |
|  FILTER_FEATURES  | 0.82671 / 0.002935 s | 0.87861 / 0.005331 s | 0.90466 / 0.388709 s |
|    SFM_FEATURES   | 0.82899 / 0.002002 s | 0.88895 / 0.006219 s | 0.90156 / 0.391290 s |
|    VT_FEATURES    | 0.80293 / 0.002992 s | 0.85150 / 0.019098 s | 0.86042 / 1.103250 s |
|    MI_FEATURES    | 0.82403 / 0.002180 s | 0.87902 / 0.006100 s | 0.89970 / 0.386183 s |
+-------------------+----------------------+----------------------+----------------------+

**As the tests on 3 different classifiers show, in most cases reducing the number of features doesn't significantly degrade model quality, while it reduces evaluation time by roughly an order of magnitude.**