### Library Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate
from sklearn.metrics import classification_report,make_scorer,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from xgboost import XGBClassifier



import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from imblearn.over_sampling import RandomOverSampler,SMOTE,ADASYN,BorderlineSMOTE,SMOTEN,SVMSMOTE,KMeansSMOTE
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler,NearMiss
from imblearn.combine import SMOTEENN,SMOTETomek

from collections import Counter

from sklearn.cluster import DBSCAN

### Data Import

In [2]:
random_state = 23873
dt = pd.read_excel(r'CA_BC.xlsx').T

In [3]:
dt.columns = dt.iloc[0]
dt = dt.iloc[1:]
dt['Label'] = dt.Label.map({'E':1,'A':0})
dt.drop_duplicates(keep='first',inplace=True)

#### Duplicate Column Names

In [4]:
# Columns to not rename
excluded = dt.columns[~dt.columns.duplicated(keep=False)]

# An incrementer
import itertools
inc = itertools.count().__next__

# A renamer
def ren(name):
    return f"{name}{inc()}" if name not in excluded else name

# Use inside rename()
for x in range(0,2):
    dt.rename(columns=ren,inplace=True)

In [5]:
for x in dt.select_dtypes(include=['object']).columns:
    dt[x] = dt[x].astype('float64')
dt.columns = [str(x) for x in dt.columns]
dt = dt.select_dtypes(include=['float64','int64'])

### Baseline Modelling

In [6]:
from lazypredict.Supervised import LazyClassifier
features = dt.copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(features, target,stratify=target, test_size=0.3,random_state=random_state)
clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models,predictions=clf.fit(x_train, x_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:22<00:00,  1.30it/s]


In [7]:
print(models.to_latex(index=False))

\begin{tabular}{rrrrr}
\toprule
Empty DataFrame
Columns: Index(['Accuracy', 'Balanced Accuracy', 'ROC AUC', 'F1 Score', 'Time Taken'], dtype='object')
Index: Float64Index([], dtype='float64', name='Model') \\
\bottomrule
\end{tabular}



### Importing from previous study

In [8]:
def change_columns(dt,x):
    dt.columns = [f'{y}_{x}' for y in dt.columns]
    return dt
files = ['baseline','pearson','chi','ig','rfe']
dataframes = [pd.read_csv(f'{x}.csv').set_index('Model') for x in files]
for x in range(len(files)):
    dataframes[x].columns = [f'{y}_{files[x]}' for y in dataframes[x].columns]

In [9]:
from functools import reduce
baseline_merged = reduce(lambda  left,right: pd.merge(left,right,on='Model',
                                            how='inner'), dataframes)

In [10]:
baseline_merged[['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe']]

Unnamed: 0_level_0,F1 Score_baseline,F1 Score_pearson,F1 Score_chi,F1 Score_ig,F1 Score_rfe
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.92,0.82,0.87,0.82,0.95
LGBMClassifier,0.87,0.92,0.85,0.92,0.97
XGBClassifier,0.87,0.95,0.87,0.92,0.97
ExtraTreesClassifier,0.87,0.85,0.89,0.95,0.92
RidgeClassifierCV,0.84,0.89,0.89,0.92,0.92
RidgeClassifier,0.84,0.89,0.89,0.92,0.92
BaggingClassifier,0.84,0.87,0.78,0.85,0.9
LogisticRegression,0.81,0.89,0.89,0.95,0.92
LinearDiscriminantAnalysis,0.81,0.89,0.89,0.92,0.92
RandomForestClassifier,0.81,0.89,0.85,0.95,0.93


## Linear Models

### Remove Co-Linear Fields

In [11]:
# # https://chrisalbon.com/code/machine_learning/feature_selection/drop_highly_correlated_features/
# corr = dt.corr()

# # sns.heatmap(corr)

# # Create correlation matrix
# corr_matrix = corr.abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# # Find index of feature columns with correlation greater than 0.95
# to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# print(f"Would drop {len(to_drop)} fields")

In [12]:
features = dt.drop(to_drop, axis=1).copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(features, target,stratify=target, test_size=0.3,random_state=random_state)
clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_corr,predictions=clf.fit(x_train, x_test, y_train, y_test)

models_corr

NameError: name 'to_drop' is not defined

### Min Max Scaling

In [None]:
# x_train_MinMax = MinMaxScaler().fit_transform(x_train)
# clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
# models_min_max,predictions=clf.fit(x_train_MinMax, x_test, y_train, y_test)

# models_min_max

In [None]:
features = dt.drop(to_drop, axis=1).copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_min_max,predictions=clf.fit(x_train, x_test, y_train, y_test)

#### Both Techniques (scaling & Co-linear Varaibles Reduction) improve performance for Linear Models.

### Feature Selection 

Two Step Selection:
- Recursive Feature Elimination (select peak point)
- Forward Selection (Find the right mix feature count & performance)

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
def feat_select_log_reg(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    clf = LogisticRegression()
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].coef_[0] for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.drop(to_drop, axis=1).columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_log_reg(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()
cv_res

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('lrs1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')

In [None]:
peak = 301

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt.drop(to_drop, axis=1)[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = LogisticRegression()
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = LogisticRegression()
    scores = cross_val_score(classifiers, StandardScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

In [None]:
def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

##### Scorer = f1_macro

In [None]:
bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt.drop(to_drop, axis=1)[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

In [None]:
plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('lrs2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
bestcols

In [None]:
features = dt[bestcols[5][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_lr,predictions=clf.fit(x_train, x_test, y_train, y_test)


In [None]:
lr = pd.merge(baseline_merged,models_lr,left_index=True,right_index=True)
lr = lr[lr.index =='LogisticRegression'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

In [None]:
plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = lr[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('lr_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')

In [None]:
lr.reverse()

#### Recall Macro

In [None]:
# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt.drop(to_drop, axis=1)[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')

In [None]:
# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

In [None]:
# bestcols

### Ensembles

#### Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
def feat_select_extra(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', ExtraTreesClassifier(n_estimators =10, random_state = random_state))
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].feature_importances_ for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.drop(to_drop, axis=1).columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_extra(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('ETS1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')

In [None]:
cv_res

In [None]:
peak = 10

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = ExtraTreesClassifier(n_estimators =10, random_state = random_state)
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = ExtraTreesClassifier(n_estimators =10, random_state = random_state)
    scores = cross_val_score(classifiers, MinMaxScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

In [None]:
def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

#### F1 SCORE

In [None]:
bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

In [None]:
sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('ETS2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
bestcols

#### Recall

In [None]:
# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')

In [None]:
# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

In [None]:
# bestcols

In [None]:
features = dt[bestcols[7][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_et,predictions=clf.fit(x_train, x_test, y_train, y_test)


et = pd.merge(baseline_merged,models_et,left_index=True,right_index=True)
et1 = et[et.index =='ExtraTreesClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = et1[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('et_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
et[et.index =='ExtraTreesClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']]

#### Random Forest

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
def feat_select_rf(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', RandomForestClassifier(n_estimators =10, random_state = random_state))
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].feature_importances_ for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_rf(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('RFS1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
cv_res

In [None]:
peak = 29

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = RandomForestClassifier(n_estimators =10, random_state = random_state)
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = RandomForestClassifier(n_estimators =10, random_state = random_state)
    scores = cross_val_score(classifiers, StandardScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

In [None]:
def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

In [None]:
bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

In [None]:
sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Columns')
plt.ylabel('Scores')
plt.savefig('RFS2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
bestcols

In [None]:
# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')

In [None]:
# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

In [None]:
# bestcols

In [None]:
features = dt[bestcols[5][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_et,predictions=clf.fit(x_train, x_test, y_train, y_test)


et = pd.merge(baseline_merged,models_et,left_index=True,right_index=True)
et1 = et[et.index =='RandomForestClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = et1[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('rf_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
et[et.index =='RandomForestClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']]

#### LGBMClassifier

In [None]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
def feat_select_lgbm(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', LGBMClassifier(random_state = random_state))
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].feature_importances_ for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_lgbm(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('LGBMS1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
cv_res

In [None]:
peak = 401

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = LGBMClassifier(random_state = random_state)
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = LGBMClassifier(random_state = random_state)
    scores = cross_val_score(classifiers, MinMaxScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

In [None]:
sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Columns')
plt.ylabel('Scores')
plt.savefig('LGBMS2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
bestcols

In [None]:
# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')

In [None]:
# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

In [None]:
# bestcols

In [None]:
features = dt[bestcols[5][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_et,predictions=clf.fit(x_train, x_test, y_train, y_test)


et = pd.merge(baseline_merged,models_et,left_index=True,right_index=True)
et1 = et[et.index =='LGBMClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = et1[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('lg_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
et[et.index =='LGBMClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']]

#### Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
def feat_select_ada(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', AdaBoostClassifier(n_estimators =10, random_state = random_state))
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].feature_importances_ for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_ada(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('ADS1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
cv_res

In [None]:
peak = 35

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = AdaBoostClassifier(n_estimators =10, random_state = random_state)
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:

def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = AdaBoostClassifier(n_estimators =10, random_state = random_state)
    scores = cross_val_score(classifiers, StandardScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

In [None]:
bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Columns')
plt.ylabel('Scores')
plt.savefig('ADS2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')

# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

# bestcols

In [None]:
features = dt[bestcols[5][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_et,predictions=clf.fit(x_train, x_test, y_train, y_test)


et = pd.merge(baseline_merged,models_et,left_index=True,right_index=True)
et1 = et[et.index =='AdaBoostClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = et1[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('ad_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
et[et.index =='AdaBoostClassifier'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']]

#### SVM

In [None]:
from sklearn.svm import SVC
def feat_select_svc(col):
    features = dt[col].copy()
    target = features.pop('Label')

    from sklearn.pipeline import Pipeline
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', SVC(kernel='linear'))
    ])
    
    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    # clf=RandomForestClassifier(n_estimators =10, random_state = 42,class_weight='balanced')
    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    imp = pd.DataFrame(data = {'fields':features.columns,'importance':np.mean([estimator.steps[1][1].coef_[0] for estimator in output['estimator']],axis=0)}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
    imp = pd.concat([imp[imp.importance <= imp[imp.importance < 0].importance.median()],
    imp[imp.importance >= imp[imp.importance > 0].importance.median()]],axis=0)
    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
col = dt.columns
features = [0,1]
result= []
while len(set(features[-5:])) != 1:
    res = feat_select_svc(col)
    result.append(res)
    features.append(res['features'])
    col = res['next_columns']

In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'balanced_accuracy',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Balanced Accuracy')
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='F1 Macro Score')
plt.show()


plt.rcParams["figure.figsize"] = [20.00, 6.00]
plt.rcParams["figure.autolayout"] = True
f, axes = plt.subplots(1, 2)
sns.barplot(x = 'features',y = 'precision_macro',data = cv_res, palette = "Set2",ax=axes[0]).set(title='Precision Macro Score')
sns.barplot(x = 'features',y = 'recall_macro',data = cv_res, palette = "Set2",ax=axes[1]).set(title='Recall Macro Score')
plt.show()


In [None]:
cv_res = pd.DataFrame(data = {'features': [res['features'] for res in result],
'balanced_accuracy': [res['balanced_accuracy'] for res in result],
'f1_macro': [res['f1_macro'] for res in result],
'precision_macro': [res['precision_macro'] for res in result],
'recall_macro': [res['recall_macro'] for res in result]})

plt.rcParams["figure.figsize"] = [8.00, 5.00]
plt.rcParams["figure.autolayout"] = True
sns.color_palette("deep")
sns.barplot(x = 'features',y = 'f1_macro',data = cv_res,color='green')
plt.xlabel('Number of Features')
plt.ylabel('F1 Scores')
plt.savefig('SVMS1.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
cv_res

In [None]:
peak = 22

cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.5,random_state=random_state)
# Creating the Nearest Centroid Classifier
model = SVC(kernel='linear')
 
# Training the classifier
model.fit(x_train, y_train.values.ravel())
 
# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")
 
# Printing classification report of classifier on the test set set data
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")



def classifier(x_train, y_train,scoring):
    
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    classifiers = SVC(kernel='linear')
    scores = cross_val_score(classifiers, StandardScaler().fit_transform(x_train), y_train, scoring=scoring, cv=cv)
    
    return scores.mean()

def feed_forward(unprocessed_cols,current_bestcols,scoring):
    results = [classifier(features[current_bestcols + [x]], target,scoring) for x in unprocessed_cols]
    current_bestcols = current_bestcols + [unprocessed_cols[results.index(max(results))]]
    bestcols.append((max(results), current_bestcols))
    unprocessed_cols.pop(results.index(max(results)))
    if len(unprocessed_cols) > 0:
        print(f"{len(unprocessed_cols)} columns left to process")
        feed_forward(unprocessed_cols,current_bestcols,scoring)
    else:
        return bestcols

In [None]:
bestcols = []
cols = [x for x in result if x['features'] == peak][0]['cols']
features = dt[cols].copy()
target = features.pop('Label')
cols = features.columns.tolist()
feed_forward(cols,bestcols,scoring='f1_macro')

In [None]:
sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
plt.xlabel('Number of Features')
plt.ylabel('Scores')
plt.savefig('SVMS2.eps',bbox_inches = 'tight',dpi=1200,transparent= False, format='eps')

In [None]:
features = dt[bestcols[5][1] + ['Label']].copy()
target = features.pop('Label')
x_train, x_test, y_train, y_test = train_test_split(MinMaxScaler().fit_transform(features), target,stratify=target, test_size=0.3,random_state=random_state)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_et,predictions=clf.fit(x_train, x_test, y_train, y_test)


et = pd.merge(baseline_merged,models_et,left_index=True,right_index=True)
et1 = et[et.index =='SVC'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']].values.flatten().tolist()

plt.rcParams["figure.figsize"] = [8.00, 5.00]
sns.barplot(x = et1[::-1] ,y = ['Baseline','Pearson Correlation','Chi Squared','Information Gain','RFE','RFE + Forward Selection'][::-1],palette='Set2',orient='h')
plt.ylabel('Selection Method')
plt.xlabel('F1 Scores')
plt.savefig('svm_final.eps',bbox_inches = 'tight',dpi=1200,transparent= True, format='eps')
et[et.index =='SVC'][['F1 Score_baseline','F1 Score_pearson','F1 Score_chi','F1 Score_ig','F1 Score_rfe','F1 Score']]

In [None]:

# bestcols = []
# cols = [x for x in result if x['features'] == peak][0]['cols']
# features = dt[cols].copy()
# target = features.pop('Label')
# cols = features.columns.tolist()
# feed_forward(cols,bestcols,scoring='recall_macro')



In [None]:
# sns.lineplot(x=[len(x[1]) for x in bestcols], y=[x[0] for x in bestcols], palette='Set2')
# plt.xlabel('Number of Columns')
# plt.ylabel('Scores')
# plt.show()

# bestcols

In [None]:
### Nearest Centroid
### Nearest Neighbour
### SVC
### NB