### Library Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate
from sklearn.metrics import classification_report,make_scorer,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from xgboost import XGBClassifier



import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from imblearn.over_sampling import RandomOverSampler,SMOTE,ADASYN,BorderlineSMOTE,SMOTEN,SVMSMOTE,KMeansSMOTE
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler,NearMiss
from imblearn.combine import SMOTEENN,SMOTETomek

from collections import Counter

from sklearn.cluster import DBSCAN



### Data Import

In [2]:
# Single seed passed as `random_state=` to the train/test splits, the stratified
# K-fold and the seeded classifiers further down, so reruns give the same partitions.
random_state = 23873

In [3]:
# Read the workbook and transpose it, so the sheet's columns become rows.
dt = pd.read_excel(r'CA_BC.xlsx').T

# Promote the first transposed row to column labels, then discard that row.
header_row = dt.iloc[0]
dt.columns = header_row
dt = dt.iloc[1:]

# Encode the class label numerically: 'E' -> 1, 'A' -> 0 (any other value maps to NaN).
label_codes = {'E': 1, 'A': 0}
dt['Label'] = dt['Label'].map(label_codes)

In [4]:
# Drop exact duplicate rows, keeping the first occurrence (mutates dt in place).
dt.drop_duplicates(keep='first',inplace=True)
# Cast every object-typed column to float64; astype raises if a value cannot be converted.
for x in dt.select_dtypes(include=['object']).columns:
    dt[x] = dt[x].astype('float64')
# Normalise the column labels to plain strings.
dt.columns = [str(x) for x in dt.columns]
# Keep only float64/int64 columns; columns of any other dtype are discarded.
dt = dt.select_dtypes(include=['float64','int64'])

In [5]:
import itertools

# Labels that occur exactly once in dt.columns; these keep their name.
excluded = dt.columns[~dt.columns.duplicated(keep=False)]

# Running counter shared by every renamed column: each call returns the next integer.
inc = itertools.count().__next__


def ren(name):
    """Return `name` unchanged if it is unique, otherwise `name` with the next counter value appended."""
    if name in excluded:
        return name
    return f"{name}{inc()}"


# Apply the renamer to every column label (mutates dt in place).
dt.rename(columns=ren, inplace=True)

### Baseline Modelling

In [6]:
from lazypredict.Supervised import LazyClassifier

# Baseline: every field of dt except the target, with no feature selection.
features = dt.drop(columns=['Label'])
target = dt['Label']

# Stratified 70/30 hold-out split, seeded with the shared random_state.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

# Fit LazyClassifier's model suite on the training part and score it on the held-out part.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:38<00:00,  1.34s/it]


In [7]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


## Linear Models

### Remove Co-Linear Fields

In [8]:
# https://chrisalbon.com/code/machine_learning/feature_selection/drop_highly_correlated_features/
# Pairwise correlation between all columns of dt (DataFrame.corr defaults to Pearson).
# dt still contains 'Label' at this point, so the target is part of the matrix as well.
corr = dt.corr()

In [9]:
# sns.heatmap(corr)

In [None]:
# Work on absolute values: a strong negative correlation is as redundant as a strong positive one.
corr_matrix = corr.abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair is examined once.
# The builtin `bool` replaces `np.bool`, an alias deprecated in NumPy 1.20 and removed in 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# The filter is about collinearity between features. Take the target out of both axes so
# that 'Label' itself is never flagged (later cells pop it) and no feature is flagged merely
# for correlating strongly with the target.
upper = upper.drop(index='Label', columns='Label', errors='ignore')

# Flag a column when its absolute correlation with any column before it exceeds 0.95.
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

print(f"Would drop {len(to_drop)} fields")

In [None]:
# Same hold-out protocol as the baseline, minus the highly correlated fields listed in to_drop.
reduced = dt.drop(to_drop, axis=1)
features = reduced.drop(columns=['Label'])
target = reduced['Label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models_corr, predictions = clf.fit(x_train, x_test, y_train, y_test)

In [None]:
models_corr

### Check Variance Inflation Factor

In [None]:
# import statsmodels.api as sm
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# vif_info = pd.DataFrame()
# vif_info['VIF'] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]
# vif_info['Column'] = features.columns
# vif_info.sort_values('VIF', ascending=False)

### Min Max Scaling

In [None]:
# Learn the min/max on the training rows only, then apply the SAME transform to the
# held-out rows. Passing x_test unscaled would evaluate every model on data in a
# different scale (and a different container type) from what it was trained on.
minmax_scaler = MinMaxScaler().fit(x_train)
x_train_MinMax = minmax_scaler.transform(x_train)
x_test_MinMax = minmax_scaler.transform(x_test)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_min_max,predictions=clf.fit(x_train_MinMax, x_test_MinMax, y_train, y_test)

In [None]:
models_min_max

In [None]:
# Learn mean/variance on the training rows only, then apply the SAME transform to the
# held-out rows. Passing x_test unscaled would evaluate every model on data in a
# different scale (and a different container type) from what it was trained on.
standard_scaler = StandardScaler().fit(x_train)
x_train_Standard = standard_scaler.transform(x_train)
x_test_Standard = standard_scaler.transform(x_test)

clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models_standard,predictions=clf.fit(x_train_Standard, x_test_Standard, y_train, y_test)

In [None]:
models_standard

### Deep Dive on Scaled Data

In [None]:
from sklearn.neighbors import NearestCentroid  # imported here but not used in this cell

features = dt.drop(to_drop, axis=1).copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

# One row per field with its fitted coefficient, largest (most positive) first.
imp = pd.DataFrame(data = {'fields':features.columns,'importance':model.coef_[0]}).sort_values(by='importance',axis=0, ascending=False, inplace=False)
# Show fields whose absolute coefficient is at least the median of the signed non-zero
# coefficients. NOTE(review): the threshold is a signed median while the left-hand side is
# an absolute value; confirm whether the median of absolute values was intended.
imp[imp.importance.abs() >= imp[imp.importance!=0].importance.median()]

#### Selecting the top 10 fields by signed coefficient would not give the best result, because fields with large negative coefficients are left out

In [None]:
# The ten fields with the largest (most positive) coefficients, plus the target.
cols = imp.head(10).fields.values.tolist() + ['Label']
features = dt.drop(to_drop, axis=1)[cols].copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

### Recursive Feature Importance

In [None]:
from sklearn import datasets
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
from sklearn.ensemble import  RandomForestClassifier
import pandas as pd
pd.set_option("display.precision", 8)

In [None]:
def feat_select_log_reg(col):
    """Cross-validate a min-max-scaled logistic regression on dt[col] and propose a pruned field list.

    Parameters
    ----------
    col : list-like of column labels of the global frame `dt`; it must contain 'Label',
        which is popped off and used as the target.

    Returns
    -------
    dict with:
        'features'        : len(col), i.e. the number of labels passed in (this count includes 'Label')
        'cols'            : `col` as given
        'medians'         : (negative median, positive median) of the coefficients that survived pruning
        '<metric>'        : mean of each scoring metric over the 4 folds
        '<metric>_std'    : standard deviation of each metric over the 4 folds
        'next'            : frame of surviving fields and their fold-averaged coefficients
        'next_columns'    : surviving field names plus 'Label', ready to be passed back in as `col`

    Side effect: prints the number of labels and the mean macro recall.
    """
    # Local import: Pipeline is not among the file-level imports visible in this notebook.
    from sklearn.pipeline import Pipeline

    features = dt[col].copy()
    target = features.pop('Label')

    # Scaling lives inside the pipeline, so it is refitted on the training part of every fold.
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', LogisticRegression())
    ])

    scoring = {
               'balanced_accuracy':'balanced_accuracy',
               'f1_macro':'f1_macro',
               'precision_macro':'precision_macro',
               'recall_macro':'recall_macro',
              }

    output = cross_validate(pipe, features, target, cv=4, scoring = scoring, return_estimator =True)

    # Average each field's coefficient across the fitted fold estimators.
    mean_coef = np.mean([estimator.named_steps['model'].coef_[0] for estimator in output['estimator']], axis=0)
    imp = pd.DataFrame(data = {'fields':features.columns,'importance':mean_coef}).sort_values(by='importance',axis=0, ascending=False, inplace=False)

    # Keep the stronger half on each side: coefficients at or below the median of the negative
    # ones, and at or above the median of the positive ones. If one side is empty its median is
    # NaN, the comparison is False everywhere, and that side contributes no rows.
    negative_cut = imp[imp.importance < 0].importance.median()
    positive_cut = imp[imp.importance > 0].importance.median()
    imp = pd.concat([imp[imp.importance <= negative_cut],
                     imp[imp.importance >= positive_cut]],axis=0)

    print(f"{len(col)} columns produced macro recall of {output['test_recall_macro'].mean()}")
    return {
            'features': len(col),
            'cols':col,
            # Computed on the already-pruned frame, so these are not the cut-offs used above.
            'medians': (imp[imp.importance < 0].importance.median(),imp[imp.importance > 0].importance.median()),
            'balanced_accuracy' :output['test_balanced_accuracy'].mean(),
            'f1_macro':output['test_f1_macro'].mean(),
            'precision_macro':output['test_precision_macro'].mean(),
            'recall_macro':output['test_recall_macro'].mean(),
            'balanced_accuracy_std':output['test_balanced_accuracy'].std(),
            'f1_macro_std':output['test_f1_macro'].std(),
            'precision_macro_std':output['test_precision_macro'].std(),
            'recall_macro_std':output['test_recall_macro'].std(),
            'next' : imp,
            'next_columns' : [x for x in imp.fields.values] + ['Label']
            }

In [None]:
# Start from every column that survived the correlation filter (this includes 'Label').
col = dt.drop(to_drop, axis=1).columns

# History of column counts; seeded with two different values so the stop test
# cannot succeed before the first round has run.
features = [0, 1]
result = []

while True:
    # Stop once the most recent (up to five) column counts are all identical.
    if len(set(features[-5:])) == 1:
        break
    res = feat_select_log_reg(col)
    result.append(res)
    features.append(res['features'])
    # Feed the pruned field list into the next round.
    col = res['next_columns']

In [None]:
# One row per pruning round: the column count plus the mean cross-validated score per metric.
metric_keys = ['features', 'balanced_accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
cv_res = pd.DataFrame(data={key: [res[key] for res in result] for key in metric_keys})

# Two figures of two panels each; every panel is (metric column, panel title).
panel_rows = [
    [('balanced_accuracy', 'Balanced Accuracy'), ('f1_macro', 'F1 Macro Score')],
    [('precision_macro', 'Precision Macro Score'), ('recall_macro', 'Recall Macro Score')],
]

for panels in panel_rows:
    plt.rcParams["figure.figsize"] = [20.00, 6.00]
    plt.rcParams["figure.autolayout"] = True
    f, axes = plt.subplots(1, 2)
    for ax, (metric, title) in zip(axes, panels):
        sns.barplot(x='features', y=metric, data=cv_res, palette="Set2", ax=ax).set(title=title)
    plt.show()

In [None]:
# Column list ('cols') of the first pruning round whose 'features' count equals 21.
# list.index raises ValueError if no round had exactly 21 columns.
# NOTE(review): 21 is hard-coded, presumably picked from the plots above; confirm after any rerun.
dd = result[[res['features'] for res in result].index(21)]['cols']

In [None]:
features = dt[dd].copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
# Hold-out comparison of LazyClassifier's model suite on the column subset dd.
subset = dt[dd]
features = subset.drop(columns=['Label'])
target = subset['Label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

In [None]:
models

#### Pearson Correlation

In [None]:
def cor_selector(X, y, num_feats=20):
    """Select the columns of X with the largest absolute Pearson correlation with y.

    Parameters
    ----------
    X : pandas DataFrame of candidate features (one column per feature).
    y : 1-D target aligned with the rows of X.
    num_feats : maximum number of columns to keep (default 20, the previously
        hard-coded value). Values below 0 are treated as 0; values above the
        number of columns keep every column.

    Returns
    -------
    (cor_support, cor_feature)
        cor_support : list of bool, one per column of X in column order, True if selected.
        cor_feature : list of selected column names, ordered by increasing absolute correlation.
    """
    feature_name = X.columns.tolist()

    # Correlation of every feature with the target; an undefined correlation (NaN) counts as 0.
    cor_list = [np.corrcoef(X[name], y)[0, 1] for name in feature_name]
    cor_list = [0 if np.isnan(value) else value for value in cor_list]

    # Positions sorted by ascending |correlation|; the tail holds the strongest features.
    # Slicing from an explicit start index (instead of [-k:]) keeps k == 0 meaning "none".
    keep = min(max(int(num_feats), 0), len(feature_name))
    order = np.argsort(np.abs(cor_list))
    cor_feature = X.iloc[:, order[len(order) - keep:]].columns.tolist()

    # Boolean mask in original column order; a set makes each membership test O(1).
    selected = set(cor_feature)
    cor_support = [name in selected for name in feature_name]
    return cor_support, cor_feature

In [None]:
# Rank every field of dt (target excluded) by its correlation with the label.
features = dt.drop(columns=['Label'])
target = dt['Label']
cor_support, cor_feature = cor_selector(features, target)
print(f"{len(cor_feature)} selected features")

In [None]:
features = dt[cor_feature + ['Label']].copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
# Hold-out comparison of LazyClassifier's model suite on the correlation-selected fields.
subset = dt[cor_feature + ['Label']]
features = subset.drop(columns=['Label'])
target = subset['Label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

In [None]:
models

#### Chi-Square Features

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

features = dt.drop(columns=['Label'])
target = dt['Label']

# Rescale every field to [0, 1] before scoring (the chi2 scorer needs non-negative inputs).
X_norm = MinMaxScaler().fit_transform(features)

# Keep the 20 fields with the highest chi-square score against the label.
chi_selector = SelectKBest(chi2, k=20)
chi_selector.fit(X_norm, target)

# Boolean mask over the columns, then the matching column names.
chi_support = chi_selector.get_support()
chi_feature = features.columns[chi_support].tolist()
print(f"{len(chi_feature)} selected features")

In [None]:
features = dt[chi_feature + ['Label']].copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
# Hold-out comparison of LazyClassifier's model suite on the chi-square-selected fields.
subset = dt[chi_feature + ['Label']]
features = subset.drop(columns=['Label'])
target = subset['Label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

In [None]:
models

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

features = dt.drop(columns=['Label'])
target = dt['Label']

# Rescale every field to [0, 1] so the logistic-regression coefficients are comparable.
X_norm = MinMaxScaler().fit_transform(features)

# Recursive feature elimination down to 30 fields, discarding 1000 fields per iteration.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=30,
    step=1000,
    verbose=5,
)
rfe_selector.fit(X_norm, target)

In [None]:
# Boolean mask of the fields RFE kept, then the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = features.columns[rfe_support].tolist()
print(f"{len(rfe_feature)} selected features")

In [None]:
features = dt[rfe_feature + ['Label']].copy()
target = features.pop('Label')

# Split first, then fit the scaler on the training rows only. Standardising the whole
# table before splitting would let test-set statistics leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.3, random_state=random_state)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Logistic regression with library-default settings
model = LogisticRegression()

# Training the classifier
model.fit(x_train, y_train.values.ravel())

# Accuracy on the training and test sets
print(f"Training Set Score : {model.score(x_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(x_test, y_test) * 100} %")

# Per-class precision / recall / F1 on the test set
print(f"Model Classification Report : \n{classification_report(y_test, model.predict(x_test))}")

In [None]:
# Hold-out comparison of LazyClassifier's model suite on the RFE-selected fields.
subset = dt[rfe_feature + ['Label']]
features = subset.drop(columns=['Label'])
target = subset['Label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=random_state,
)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

In [None]:
models

In [None]:
def modelling_mulitple(x_train, y_train):
    """Cross-validate a fixed set of classifiers and chart their mean scores.

    Each classifier is scored with 5-fold stratified, shuffled cross-validation
    (seeded with the global `random_state`) on macro F1, TSS and HSS2. The
    `tss_scorer` and `hss2_scorer` objects are looked up as globals when the
    function runs; they are not defined in the part of the notebook shown here.

    Parameters
    ----------
    x_train : feature matrix handed to `cross_validate`.
    y_train : target values handed to `cross_validate`.

    Returns
    -------
    pandas DataFrame with one row per algorithm and the mean and standard
    deviation of each metric across folds.

    Side effects: prints each estimator before it is evaluated, overwrites the
    global matplotlib `figure.figsize` / `figure.autolayout` settings, and
    shows a three-panel bar chart.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Display name paired with its estimator, in the order they are evaluated and plotted.
    named_classifiers = [
        ('LogisticRegression', LogisticRegression(random_state=random_state)),
        ('DecisionTree', DecisionTreeClassifier(random_state=random_state)),
        ('SVM', SVC(random_state=random_state)),
        ('Naive Bayes', GaussianNB()),
        ('AdaBoost', AdaBoostClassifier(random_state=random_state)),
        ('RandomForest', RandomForestClassifier(n_estimators=20, random_state=random_state)),
        ('XGBoost', XGBClassifier()),
    ]

    scoring = {'f1_macro': 'f1_macro',
               'tss': tss_scorer,
               'hss2': hss2_scorer}

    # One summary record per classifier.
    rows = []
    for label, classifier in named_classifiers:
        print(classifier)
        scores = cross_validate(classifier, x_train, y_train, scoring=scoring, cv=cv)
        rows.append({
            'Algorithms': label,
            'F1 Mean Score': scores['test_f1_macro'].mean(),
            'F1 std': scores['test_f1_macro'].std(),
            'TSS Mean Score': scores['test_tss'].mean(),
            'TSS std': scores['test_tss'].std(),
            'HSS2 Mean Score': scores['test_hss2'].mean(),
            'HSS2 std': scores['test_hss2'].std(),
        })

    cv_res = pd.DataFrame(rows, columns=['Algorithms', 'F1 Mean Score', 'F1 std',
                                         'TSS Mean Score', 'TSS std',
                                         'HSS2 Mean Score', 'HSS2 std'])

    plt.rcParams["figure.figsize"] = [20.00, 6.00]
    plt.rcParams["figure.autolayout"] = True
    f, axes = plt.subplots(1, 3)

    # One horizontal bar chart per metric, left to right.
    for ax, score_column in zip(axes, ['F1 Mean Score', 'TSS Mean Score', 'HSS2 Mean Score']):
        sns.barplot(x=score_column, y='Algorithms', data=cv_res, palette="Set2", ax=ax).set(title='Cross Validation Scores')

    plt.show()

    return cv_res