# Import statements

In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import Perceptron
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# import skopt
# from hyperopt import hp
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
import warnings
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore")

# Result 

In [2]:
# After considering all five conventional classification models, we come to conclusion that Random forest is the best model.

## Load the model

In [3]:
RandomForest = pickle.load(open('base_model', 'rb'))

# Loading the Dataset

In [4]:
# Load Dataset
df = pd.read_csv('final_dataset.csv')
df.shape

(5856, 94)

# Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=['Label'], axis=1),
    df['Label'],
    test_size=0.2,
    random_state=23)

X_train.shape, X_test.shape

((4684, 93), (1172, 93))

# Cross Validation

In [6]:
def cross_validation(model, _X, _y, _cv=5):
      _scoring = ['f1']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      return results['test_f1'].mean()

In [7]:
def get_result(model):
    F1_score = []
    for i in range(5,95,5):
        test_dataframe = X_test.filter(items=model(i)) #filtering only required important features
        F1_score.append(cross_validation(RandomForest,test_dataframe,y_test,5))
    print("The highest F1_score is {} for {} features.".format(max(F1_score),(5 * F1_score.index(max(F1_score)) + 5)))

# Evalution Parameters

In [8]:
def Eval_par(train_DF,test_DF):
    model = RandomForestClassifier(criterion = 'gini',max_depth = None,
                                   max_features = None,max_leaf_nodes = None,
                                   min_samples_leaf = 3,min_samples_split = 4,
                                   min_weight_fraction_leaf = 0.0,n_estimators = 200)
    model.fit(train_DF,y_train)
    y_pred = model.predict(test_DF)
    _confusion_matrix = confusion_matrix(y_test,y_pred)
    
    true_positive = _confusion_matrix[1][1]
    true_negative = _confusion_matrix[0][0]
    false_positive = _confusion_matrix[0][1]
    false_negative = _confusion_matrix[1][0]

    # Calculate precision
    precision = true_positive / (true_positive + false_positive)

    # Calculate recall
    recall = true_positive / (true_positive + false_negative)

    # Calculate accuracy
    accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)

    print("Precision : ",precision)
    print("Recall : ",recall)
    print("Accuracy : ", accuracy)

# Correlation-based feature selection (CFS)

In [9]:
# X is your feature matrix, y is your target variable
def CFS(features):
    selector = SelectKBest(score_func=mutual_info_classif, k=features) # choose the number of features you want to keep
    CFS = selector.fit_transform(X_train, y_train) # features are selected
    colu = list(X_test.columns[selector.get_support()]) #generating name of columns consisting that feature 
    return colu

In [155]:
get_result(CFS)

The highest F1_score is 0.9220485950410214 for 55 features.


In [12]:
train_dataframe = X_train.filter(items=CFS(55)) #filtering only required important features
test_dataframe = X_test.filter(items=CFS(55)) #filtering only required important features

Eval_par(train_dataframe,test_dataframe)

Precision :  0.9279069767441861
Recall :  0.9236111111111112
Accuracy :  0.8907849829351536
Specificity :  0.7987012987012987


# L1 regularization

In [13]:
def L1_reg():
    l1 = SelectFromModel(LogisticRegression(C=1, penalty='l1',solver='liblinear'))
    l1.fit(X_train, y_train)
    colu = list(X_train.columns[l1.get_support()])
    return colu

In [20]:
colu = L1_reg()
train_dataframe = X_train.filter(items=colu) #filtering only required important features
test_dataframe = X_test.filter(items=colu) #filtering only required important features
print("F1_Score is {} for 65 features".format(cross_validation(RandomForest, test_dataframe, y_test, _cv=5)))

F1_Score is 0.9206455388654469 for 65 features


In [163]:
# F1_score = cross_validation(RandomForest,test_dataframe,y_test,5)
Eval_par(train_dataframe,test_dataframe)

Precision :  0.923963133640553
Recall :  0.9282407407407407
Accuracy :  0.8907849829351536
Specificity :  0.7857142857142857


# Recursive Feature Elimination (RFE)

In [156]:
def RFE_sel(features):
    sel = RFE(RandomForest, n_features_to_select = features)
    sel.fit(X_train, y_train)
    colu = list(X_test.columns[sel.get_support()])
    return colu

In [157]:
get_result(RFE_sel)


KeyboardInterrupt



# Principal component analysis (PCA)

In [128]:
def PCA_sel(features, DataFrame):
    pca_sel = PCA(n_components = features)
    pct = pca_sel.fit_transform(DataFrame)
    return pct

In [166]:
get_result(PCA_sel)

The highest F1_score is 0.9136775826109315 for 10 features.


In [165]:
train_dataframe =  PCA_sel(20,X_train)#filtering only required important features
test_dataframe = PCA_sel(20,X_test) #filtering only required important features
Eval_par(train_dataframe,test_dataframe)

Precision :  0.9052987598647125
Recall :  0.9293981481481481
Accuracy :  0.8762798634812287
Specificity :  0.7272727272727273


In [None]:
F1_score = []
for i in range(5,95,5):
    test_dataframe = PCA_sel(i,X_test) #filtering only required important features
    F1_score.append(cross_validation(RandomForest,test_dataframe,y_test,5))

print("The highest F1_score is {} for {} features.".format(max(F1_score),(5 * F1_score.index(max(F1_score)) + 5)))

# Random forest feature importance

In [141]:
def rff():
    rfc_ = SelectFromModel(RandomForest)
    rfc_.fit(X_train, y_train)
    colu =list(X_test.columns[rfc_.get_support()])
    return colu

In [167]:
colu = rff()
train_dataframe = X_train.filter(items = colu)
test_dataframe = X_test.filter(items = colu)

In [144]:
cross_validation(RandomForest,test_dataframe,y_test,5)

0.912603907266558

In [168]:
Eval_par(train_dataframe,test_dataframe)

Precision :  0.9143835616438356
Recall :  0.9270833333333334
Accuracy :  0.8822525597269625
Specificity :  0.7564935064935064


# Mutual information

In [135]:
mutual_info = mutual_info_classif(X_train, y_train)

In [136]:
def mut_info(features):
    k_best_features = SelectKBest(mutual_info_classif, k=features).fit(X_train, y_train)
    colu =list(X_test.columns[k_best_features.get_support()])
    return colu

In [105]:
# https://github.com/anujdutt9/Feature-Selection-for-Machine-Learning/blob/master/Filter%20Methods/Mutual-Information.ipynb

In [170]:
colu = mut_info(20)
train_dataframe = X_train.filter(items = colu)
test_dataframe = X_test.filter(items = colu)

In [137]:
get_result(mut_info)

The highest F1_score is 0.9213092225135131 for 20 features.


In [171]:
Eval_par(train_dataframe,test_dataframe)

Precision :  0.9212050984936269
Recall :  0.9201388888888888
Accuracy :  0.8831058020477816
Specificity :  0.7792207792207793


In [None]:
# https://github.com/krishnadulal/Feature-Selection-in-Machine-Learning-using-Python-All-Code

# Correlation-based Feature Selection gives the best Results

## Save and Load the best model from CFS Technique

In [13]:
train_dataframe = X_train.filter(items=CFS(55)) #filtering only required important features
test_dataframe = X_test.filter(items=CFS(55)) #filtering only required important features

In [12]:
best_model = RandomForestClassifier(criterion = 'gini',max_depth = None,
                                   max_features = None,max_leaf_nodes = None,
                                   min_samples_leaf = 3,min_samples_split = 4,
                                   min_weight_fraction_leaf = 0.0,n_estimators = 200)
best_model.fit(train_dataframe,y_train)

In [17]:
import pickle
# create an iterator object with write permission - model.pkl
with open('best_model', 'wb') as files:
    pickle.dump(best_model, files)

In [21]:
model_ = pickle.load(open('best_model', 'rb'))

In [22]:
model_.score(test_dataframe,y_test)

0.8523890784982935