In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.datasets import load_iris
from mlxtend.plotting import plot_learning_curves

In [2]:
#classification and confusion matrix
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('FinalDataset.txt')
df = df.drop(columns = 'Number of businesses per capita')
df = df.drop(columns = 'Number of Businesses')
#df.head()

In [4]:
# Remove all null value
df.dropna(inplace=True)

# drop the uninformatica column("Loan_ID")
df.drop(labels=['Constituency'],axis=1,inplace=True)
df.reset_index(drop=True,inplace=True)

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = df.columns.tolist()
for column in cols:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [6]:
X = df.iloc[:,1:]
y = df["Binary Value: NBC"]

In [7]:
X.head()

In [8]:
print(X.shape)
print(y.shape)
print(type(X))
print(type(y))

In [9]:
feature_names=tuple(X.columns)
feature_names

('Median House Price ( )',
 'Ratio of median house price to median salary',
 'Home Ownership (proportion of households)',
 'Unemployment Rate',
 'Share of LSOAs (small areas) in most deprived decile',
 'Standardised Weighted Overall Social Mobility Index',
 'School Funding Per Pupil (Real)',
 'Average Internet Speed (Mb/s)',
 '0-9',
 '19-Oct',
 '20-29',
 '30-39',
 '40-49',
 '50-59',
 '60-69',
 '70-79',
 '80+')

In [10]:
X.shape, y.shape

((532, 17), (532,))

In [11]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.20, random_state=0)
X_val, X_test, y_val, y_test= train_test_split(X_other, y_other, test_size=0.50, random_state=0)

In [12]:
model = LGR(max_iter=1000)
sfs_code = SFS(model,
           k_features='best', 
           forward=False, 
           floating=False, 
           verbose=0,
               scoring='accuracy',
               #scoring='precision',
           #scoring='recall',
            n_jobs=-1,
            cv=5)

sfs1 = sfs_code.fit(X_train, y_train,custom_feature_names=feature_names)

In [13]:
X_train_sele = sfs1.transform(X_train)
X_val_sele = sfs1.transform(X_val)
X_test_sele = sfs1.transform(X_test)

model.fit(X_train_sele, y_train)
print('Training accuracy:', np.mean(model.predict(X_train_sele) == y_train)*100)
print('Validation accuracy:', np.mean(model.predict(X_val_sele) == y_val)*100)
print('Test accuracy:', np.mean(model.predict(X_test_sele) == y_test)*100)

Training accuracy: 80.94117647058823
Validation accuracy: 75.47169811320755
Test accuracy: 83.33333333333334


In [14]:
# look at the selected feature indices at each step
sfs1.subsets_

In [15]:
sfs1.get_metric_dict()

In [16]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig1 = plot_sfs(sfs1.get_metric_dict(confidence_interval=0.95),ylabel = 'Accuracy', kind='std_err')
plt.grid()
plt.title('Sequential Backward Selection')

In [17]:
# the best features 
sfs1.k_feature_names_, sfs1.k_feature_idx_

In [18]:
df = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
df[["feature_idx","avg_score"]]

In [19]:
from scipy import stats
from sklearn.metrics import confusion_matrix, accuracy_score

class ModelSummary:
    """ This class extracts a summary of the model
    
    Methods
    -------
    get_se()
        computes standard error
    get_ci(SE_est)
        computes confidence intervals
    get_pvals()
        computes p-values
    get_summary(name=None)
        prints the summary of the model
    """
    
    def __init__(self, clf, X, y):
        """
        Parameters
        ----------
        clf: class
            the classifier object model
        X: pandas Dataframe
            matrix of predictors
        y: numpy array
            matrix of variable
        """
        self.clf = clf
        self.X = X
        self.y = y
        pass
    
    def get_se(self):
        # from here https://stats.stackexchange.com/questions/89484/how-to-compute-the-standard-errors-of-a-logistic-regressions-coefficients
        predProbs = self.clf.predict_proba(self.X)
        X_design = np.hstack([np.ones((self.X.shape[0], 1)), self.X])
        V = np.diagflat(np.product(predProbs, axis=1))
        covLogit = np.linalg.inv(np.dot(np.dot(X_design.T, V), X_design))
        return np.sqrt(np.diag(covLogit))

    def get_ci(self, SE_est):
        """
        Parameters
        ----------
        SE_est: numpy array
            matrix of standard error estimations
        """
        p = 0.975
        df = len(self.X) - 2
        crit_t_value = stats.t.ppf(p, df)
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        upper = coefs + (crit_t_value * SE_est)
        lower = coefs - (crit_t_value * SE_est)
        cis = np.zeros((len(coefs), 2))
        cis[:,0] = lower
        cis[:,1] = upper
        return cis
    
    def get_pvals(self):
        # from here https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
        p = self.clf.predict_proba(self.X)
        n = len(p)
        m = len(self.clf.coef_[0]) + 1
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        se = self.get_se()
        t =  coefs/se  
        p = (1 - stats.norm.cdf(abs(t))) * 2
        return p
    
    def get_summary(self, names=None):
        ses = self.get_se()
        cis = self.get_ci(ses)
        lower = cis[:, 0]
        upper = cis[:, 1]
        pvals = self.get_pvals()
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        data = []
        for i in range(len(coefs)):
            currlist = []
            currlist.append(np.round(coefs[i], 3))
            currlist.append(np.round(ses[i], 3))
            currlist.append(np.round(pvals[i], 3))
            currlist.append(np.round(lower[i], 3))
            currlist.append(np.round(upper[i], 3))
            data.append(currlist)
        cols = ['coefficient', 'std', 'p-value', '[0.025', '0.975]']
        sumdf = pd.DataFrame(columns=cols, data=data)
        if names is not None:
            new_names = ['intercept']*(len(names) + 1)
            new_names[1:] = [i for i in names]
            sumdf.index = new_names
        else:
            try:
                names = list(self.X.columns)
                new_names = ['intercept']*(len(names) + 1)
                new_names[1:] = [i for i in names]
                sumdf.index = new_names
            except:
                pass
        print(sumdf)
        acc = accuracy_score(self.y, self.clf.predict(self.X))
        confmat = confusion_matrix(self.y, self.clf.predict(self.X))
        print('-'*60)
        print('Confusion Matrix (total:{}) \t Accuracy: \t  {}'.format(len(self.X),np.round(acc, 3)))
        print('  TP: {} | FN: {}'.format(confmat[1][1],confmat[1][0]))
        print('  FP: {} | TN: {}'.format(confmat[0][1],confmat[0][0]))


In [20]:
#output code
modsummary = ModelSummary(model, X_train_sele, y_train)
modsummary.get_summary()

   coefficient     std  p-value  [0.025  0.975]
0       -5.199   2.429    0.032  -9.973  -0.424
1        0.638   0.089    0.000   0.463   0.813
2       -0.082   1.855    0.965  -3.728   3.564
3       -0.376  22.974    0.987 -45.533  44.781
4       -0.805   2.574    0.754  -5.865   4.255
5        0.006   0.004    0.111  -0.001   0.014
6       -0.002   0.001    0.170  -0.004   0.001
7       -0.297  10.779    0.978 -21.485  20.890
8       -0.045  17.557    0.998 -34.556  34.466
------------------------------------------------------------
Confusion Matrix (total:425) 	 Accuracy: 	  0.809
  TP: 125 | FN: 46
  FP: 35 | TN: 219
