# Feature Selection Using Association rules

### This notebook presents the interface to use the newly crafted association rules based feature selector. In addition, the notebook presents a comparison between the new approach and chi-squared (correlation based) feature selector.

### Setup

#### Install requirements.txt

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


#### Import the standard libs as well as feature Selector based on association rules

##### standard libs

In [2]:
from typing import List
from sklearn import tree

##### feature selector based on association rules

In [3]:
from feature_selector import preprocessing
from feature_selector.feature_selection import feature_rank
from feature_selector.evaluation import evaluate_prediction
from feature_selector.bin_columns import bin_numeric_columns
from feature_selector.one_hot_column import transform_one_hot_column
from feature_selector.correlation import get_sorted_chi_squared_parameters
from feature_selector.association_rules import calc_apriori_rules, filter_rules_related_to_target
from feature_selector.evaluation import pre_process_df, preprocess_columns, split_to_features_and_target
from feature_selector.utils import load_datasets, get_min_support, get_min_confidence, get_target_column

### Read and preprocess Data

In [4]:
# Dataset be one of the following: ["MobilePriceRange", "HomeLoanApproval", "AirlinesDelay", "HeartAttack"]

dataset_name = "HeartAttack"
train_df, test_df = load_datasets(dataset_name=dataset_name)
train_df = pre_process_df(train_df)

In [5]:
train_df, columns_types = preprocessing.preprocessing(train_df)
target_column = get_target_column(dataset_name=dataset_name)

### Feature Selection using association rules

#### preprocess

In [6]:
train_df = bin_numeric_columns(df=train_df, columns=columns_types.very_numerical)

cols_to_hot_col = columns_types.all()
cols_to_hot_col.remove(target_column)

train_df = transform_one_hot_column(df=train_df, columns=cols_to_hot_col, to_remove_one_hot_col=False)

#### Execute Apriori algorithm to extract features relations

In [7]:
min_support = get_min_support(dataset_name=dataset_name)
min_confidence = get_min_confidence(dataset_name=dataset_name)

rules = calc_apriori_rules(train_df=train_df, min_support=min_support, min_confidence=min_confidence)

#### Filter only relevant rules that implies on the target column and generate feature importance ranking

In [8]:
relevant_rules = filter_rules_related_to_target(rules=rules, target_column=target_column)

feature_ranks = feature_rank(relevant_rules)
association_rules_features = [feature for feature, _ in feature_ranks]
print(f"feature ranks:{association_rules_features}")

feature ranks:['thall', 'fbs', 'exng', 'restecg', 'slp', 'oldpeak', 'caa', 'sex', 'cp']


### Evaluation

In [9]:
number_of_features_to_include_in_model = 8

selected_features = association_rules_features[:number_of_features_to_include_in_model]
total_features = selected_features.copy()
total_features.append(target_column)
print(total_features)

['thall', 'fbs', 'exng', 'restecg', 'slp', 'oldpeak', 'caa', 'sex', 'output']


In [10]:
eval_train_df, eval_test_df = load_datasets(dataset_name=dataset_name)
eval_train_df = pre_process_df(eval_train_df)
eval_train_df, y_train_df = split_to_features_and_target(
    df=eval_train_df,
    relevant_features=selected_features,
    target_feature=target_column
)
eval_train_df, y_train_df = preprocess_columns(features_df=eval_train_df,
                                                target_df=y_train_df,
                                                columns_to_label_encode=columns_types.categorical)

### create classifier model and fit

In [11]:
clf = tree.DecisionTreeClassifier()

clf = clf.fit(eval_train_df, y_train_df)

eval_train_df = pre_process_df(df=eval_test_df)
eval_test_df, y_test_df = split_to_features_and_target(df=eval_test_df,
                                                           relevant_features=selected_features,
                                                           target_feature=target_column)

eval_test_df, y_test_df = preprocess_columns(features_df=eval_test_df,
                                             target_df=y_test_df,
                                             columns_to_label_encode=columns_types.categorical)

### evaluate prediction

In [12]:
column_name = y_test_df.columns[0]
test_values = y_test_df[column_name].tolist()
predictions = clf.predict(eval_test_df)

success_percentage = evaluate_prediction(predictions=predictions, test_values=test_values)
print(f"Association rules based feature-selector success rate:{success_percentage}")

Association rules based feature-selector success rate:0.8351648351648352


#### check `chi squared` based feature selector performance

In [13]:
train_df2, test_df2 = load_datasets(dataset_name=dataset_name)
sorted_chi_squared_features_correlation: List = get_sorted_chi_squared_parameters(
    df=train_df2,
    categorical_columns=cols_to_hot_col,
    target_column=target_column)

In [14]:
chi_squared_best_features = sorted_chi_squared_features_correlation[:number_of_features_to_include_in_model]
print(chi_squared_best_features)

['thall', 'cp', 'caa', 'exng', 'slp', 'sex', 'restecg', 'oldpeak']


###### Evaluate such model

In [16]:
eval_train_df, eval_test_df = load_datasets(dataset_name=dataset_name)
eval_train_df = pre_process_df(eval_train_df)
eval_train_df, y_train_df = split_to_features_and_target(
    df=eval_train_df,
    relevant_features=chi_squared_best_features,
    target_feature=target_column
)
eval_train_df, y_train_df = preprocess_columns(features_df=eval_train_df,
                                               target_df=y_train_df,
                                               columns_to_label_encode=columns_types.categorical)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(eval_train_df, y_train_df)

eval_train_df = pre_process_df(df=eval_test_df)
eval_test_df, y_test_df = split_to_features_and_target(df=eval_test_df,
                                                       relevant_features=chi_squared_best_features,
                                                       target_feature=target_column)

eval_test_df, y_test_df = preprocess_columns(features_df=eval_test_df,
                                             target_df=y_test_df,
                                             columns_to_label_encode=columns_types.categorical)
column_name = y_test_df.columns[0]
test_values = y_test_df[column_name].tolist()
predictions = clf.predict(eval_test_df)

success_percentage = evaluate_prediction(predictions=predictions, test_values=test_values)
print(f"Chi squared feature selector success rate:{success_percentage}")

Chi squared feature selector success rate:0.8021978021978022
