# Binary classification task

In [453]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_classif
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## Dealing with missing data

The cell below will load in the training and testing datasets, then identify features that are entirely absent in the test dataset and remove these columns from both datasets (because they are not useable).

In [464]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

# Identify columns that are absent in the test data
missing_in_test = test_data.columns[test_data.isna().all()].tolist()

# #Remove those columns from both datasets
reduced_train_data = train_data.drop(columns=missing_in_test)
reduced_test_data = test_data.drop(columns=missing_in_test)

['aggregated_scores_1', 'aggregated_scores_43', 'aggregated_scores_44', 'aggregated_scores_45', 'aggregated_scores_46', 'aggregated_scores_47', 'aggregated_scores_48', 'aggregated_scores_49', 'aggregated_scores_50', 'aggregated_scores_51', 'aggregated_scores_52', 'aggregated_scores_53', 'aggregated_scores_54', 'aggregated_scores_55', 'aggregated_scores_56', 'aggregated_scores_57', 'aggregated_scores_58', 'aggregated_scores_59', 'aggregated_scores_60', 'aggregated_scores_61']


In [471]:
# Inspect features with missing values for some subejcts
def missing_values_table(df, folds=False):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * mis_val / len(df)
        mz_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[mz_table.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)
        if not folds==True:
            print ("The reduced training dataframe has " + str(df.shape[1]) + " feature columns and " + 
                   str(df.shape[0]) + " instance rows.\n"      
                "There are " + str(mz_table.shape[0]) +
                " feature columns that have missing values. Summary:")
        
        return mz_table

instances_missing = missing_values_table(reduced_train_data, folds=False)
instances_missing

The reduced training dataframe has 278 feature columns and 276 instance rows.
There are 3 feature columns that have missing values. Summary:


Unnamed: 0,Missing Values,% of Total Values,Data Type
demographic_field_4,35,12.7,float64
demographic_field_5,35,12.7,float64
demographic_field_2,22,8.0,float64


In [448]:
reduced_train_data[list(instances_missing.index)].describe()

Unnamed: 0,demographic_field_4,demographic_field_5,demographic_field_2
count,241.0,241.0,254.0
mean,0.946058,0.788382,13.208661
std,0.226373,0.409306,4.145544
min,0.0,0.0,0.0
25%,1.0,1.0,11.0
50%,1.0,1.0,12.0
75%,1.0,1.0,16.0
max,1.0,1.0,30.0


Missing values comprise a small proportion of the data (more than 25-30% might be problematic). Will aim to replace missing values with the mean of their column (for continuous feature) or the mode (for binary features).

In [469]:
training_subset = reduced_train_data 
training_ids = training_subset.iloc[:, [0]]
# draw_histograms(training_subset, training_subset.columns[1:21], 5, 4)

testing_subset = reduced_test_data 
testing_ids = testing_subset.iloc[:,[0]]

# Extract instance classification labels from the training dataset
classification_labels = training_subset.iloc[:, [-1]]
training_subset = training_subset.drop(columns=["id", "label"])
testing_subset = testing_subset.drop(columns="id")


### Include plotting function for data exploration

In [465]:
# Function to plot individual feature distributions  as histograms
colors = list(mcolors.CSS4_COLORS.keys())[10:]
def draw_histograms(dataframe, features, rows, cols):
    fig=plt.figure(figsize=(20,20))
    for i, feature in enumerate(features):
        ax=fig.add_subplot(rows,cols,i+1)
        dataframe[feature].hist(bins=20,ax=ax,facecolor=colors[i])
        ax.set_title(feature+" Histogram",color=colors[35])
#         ax.set_yscale('log')
    fig.tight_layout() 
    #plt.savefig('Histograms.png')
    plt.show()

    # Consider correlation of features
#     plt.figure(figsize = (38,16))
#     sns.heatmap(fold_train_selected.corr(), annot = True)
#     plt.show()

## Data preparation

Normalise feature data using min-max scaling - this will scale all feature values to the range 0-1. This will not affect the values of binary features. Standardization may be preferable for Guassian distributed continuous features, however, this dataset includes many binary features and some non-Gaussian continuous features.

Could consider normalisation for some features and standardisation for others (future

In [161]:
def normalise_train_test(training_subset, testing_subset):
    # fit scaler on training data
    norm = MinMaxScaler().fit(training_subset)

    # transform training data
    normalised_training_subset = training_subset.copy(deep=True)
    normalised_training_subset[normalised_training_subset.columns] = norm.transform(training_subset[training_subset.columns])

    # transform testing data
    # Testing/validation data is normalised using training data feature ranges 
    # -> min and max should not be exactly 0 and 1
    normalised_testing_subset = testing_subset.copy(deep=True)
    normalised_testing_subset[normalised_testing_subset.columns] = norm.transform(testing_subset[testing_subset.columns])
    
    return normalised_training_subset, normalised_testing_subset

# normalised_testing_subset.describe()

## Feature selection

Working with numerical input data (including some binary variables, assumed to be dummy variables in place of categorical data). As such, feature selection will be performed using a filter-based technique with the ANOVA correlation coefficient.
- Not sure how far to reduce the feature set - is k<1/10th the no of samples reasonable?
- This feature selection method will not take feature correlations into account because each feature is considered separately - need to include separate consideration of feature-feature correlations

In [437]:

def select_features(normalised_training_subset, training_labels, k_features):
    # define feature selection
    selector = SelectKBest(score_func=f_classif, k=k_features)

    # apply feature selection
    selector.fit(normalised_training_subset, np.squeeze(training_labels))
    cols = selector.get_support(indices=True)

    return cols

def trim_correlated(df_in, threshold):
    df_corr = df_in.corr(method='pearson')
    df_not_correlated = ~(df_corr.mask(np.tril(np.ones([len(df_corr)]*2, dtype=bool))).abs() > threshold).any()
    un_corr_idx = df_not_correlated.loc[df_not_correlated[df_not_correlated.index] == True].index
#     df_out = df_in[un_corr_idx]
    return un_corr_idx


## Model feature selection and training with 3-fold cross-validation

In [480]:
# prepare cross validation - 3 folds
kfold = KFold(n_splits=3, shuffle=True)

# separate splits
count = 0
LRC_validation_accuracy = []
svc_validation_accuracy = []
KNNC_validation_accuracy = []
for train, valid in kfold.split(training_subset):
    
    # separate data into train and test folds
    fold_train = training_subset.iloc[train].copy(deep=True)
    fold_train_labels = classification_labels.iloc[train].copy(deep=True)
    fold_valid = training_subset.iloc[valid].copy(deep=True)
    fold_valid_labels = classification_labels.iloc[valid].copy(deep=True)
    
    # Fill in missing data - determine if feature is binary or continuous
    # Fill binary using the mode, fill continuous using the mean
    fold_train_instances_missing = fold_train.columns[fold_train.isna().any()].tolist()
    for iterant in fold_train_instances_missing:
        if fold_train[iterant].max() == 1.000000:
            fold_train[iterant].fillna(value = fold_train[iterant].mode()[0], inplace=True)
        else:
            fold_train[iterant].fillna(value = fold_train[iterant].mean(), inplace=True)
            
    fold_valid_instances_missing = fold_valid.columns[fold_valid.isna().any()].tolist()
    for iterant in fold_valid_instances_missing:
        if fold_valid[iterant].max() == 1.000000:
            fold_valid[iterant].fillna(value = fold_valid[iterant].mode()[0], inplace=True)
        else:
            fold_valid[iterant].fillna(value = fold_valid[iterant].mean(), inplace=True)
    
    # Normalise features using MinMax Scaling
    fold_norm_train, fold_norm_valid = normalise_train_test(fold_train, fold_valid)
    
#     # Inspect remaining feature correlations
#     plt.figure(figsize = (38,16))
#     sns.heatmap(fold_norm_train.iloc[:,0:20].corr(), annot = True)
#     plt.show()
    
    # Remove features that are highly correlated with another selected feature
    un_corr_idx = trim_correlated(fold_norm_train, 0.9)
    print('Number of features removed due to correlation {0:2d}'. format((fold_norm_train.shape[1] - len(un_corr_idx))))
    fold_train_uncorr = fold_norm_train[un_corr_idx]
    fold_valid_uncorr = fold_norm_valid[un_corr_idx]
    
    # Run feature selection
    feature_aim = 15
    fold_cols_selected = select_features(fold_train_uncorr, fold_train_labels, feature_aim)
    fold_train_selected = fold_train_uncorr.iloc[:,fold_cols_selected]
    fold_valid_selected = fold_valid_uncorr.iloc[:,fold_cols_selected]
#     print(fold_train_selected)
#     draw_histograms(fold_train_selected,fold_train_selected.columns,8,4)
    
    ### Model options: this would be nicer in a function
    
    # Setup and fit SVM classifier
    svc=SVC(C=1.0) 
    # fit classifier to training set
    svc.fit(fold_train_selected, np.squeeze(fold_train_labels))
    # make predictions on fold validation set
    validation_predictions_svc=svc.predict(fold_valid_selected)
    svc_accuracy = accuracy_score(np.squeeze(fold_valid_labels), validation_predictions_svc)
    svc_validation_accuracy.append(svc_accuracy)
    
    # Setup and fit logistic regression classifier
    LRC = LogisticRegression()
    LRC.fit(fold_train_selected, np.squeeze(fold_train_labels))
    # make predictions on fold validation set
    validation_predictions_LRC=LRC.predict(fold_valid_selected)
    LRC_accuracy = accuracy_score(np.squeeze(fold_valid_labels), validation_predictions_LRC)
    LRC_validation_accuracy.append(LRC_accuracy)
    
    # Setup and fit K-nearest neighbour classifer
    KNNC = KNeighborsClassifier(n_neighbors = 12)
    KNNC.fit(fold_train_selected, np.squeeze(fold_train_labels))
    validation_predictions_KNNC=KNNC.predict(fold_valid_selected)
    KNNC_accuracy = accuracy_score(np.squeeze(fold_valid_labels), validation_predictions_KNNC)
    KNNC_validation_accuracy.append(KNNC_accuracy)

    count += 1
    # Print validation report for each fold
#     print('Fold {0:2d} prediction accuracy is: {1:0.4f}'. format(count, LRC_accuracy))
#     print(classification_report(np.squeeze(fold_valid_labels), validation_predictions_svc))
#     print(classification_report(np.squeeze(fold_valid_labels), validation_predictions_LRC))
#     print(classification_report(np.squeeze(fold_valid_labels), validation_predictions_KNNC))

print('Mean svc accuracy score over {0:2d} folds is: {1:0.4f}'. format(count, np.mean(np.asarray(svc_validation_accuracy))))
print('Mean LRC accuracy score over {0:2d} folds is: {1:0.4f}'. format(count, np.mean(np.asarray(LRC_validation_accuracy))))
print('Mean KNNC accuracy score over {0:2d} folds is: {1:0.4f}'. format(count, np.mean(np.asarray(KNNC_validation_accuracy))))
    


Number of features removed due to correlation 15
Number of features removed due to correlation 18
Number of features removed due to correlation 18
Mean svc accuracy score over  3 folds is: 0.7645
Mean LRC accuracy score over  3 folds is: 0.7790
Mean KNNC accuracy score over  3 folds is: 0.7609


The support vector classifier, logistic regression classifier and K-nearest neighbour classifiers all provide comparable prediction accuracy over 5-fold cross validation, although the KNN's performance is slightly worse (and the classifer's performance is very sensitive to K, and variable across folds).

Selecting the logistic regression classifier for testing on the unseen dataset.

## Training on full dataset and testing on unseen data

Now that model selection is complete, the full training dataset:
- will have missing values filled (with feature mean for continuous features and feature mode for binary features,
- will be normalised using MinMax scaling,
- and will have highly correlated features removed.

Filter based feature selection will then be performed to reduce the number of features to 15. 

Finally, a logistic regression classifier will be trained on the full training dataset and then used to predict classification labels for the full testing dataset.

Questionable assumptions of the logistic regression classifier are:
- that the observations are independent i.e. not repeat tests.
- that there is little multicollinearity (feature-feature correlation) - this has been mitigated to some extent


In [443]:
# Fill in missing data - determine if feature is binary or continuous
# Fill binary using the mode, fill continuous using the mean
train_instances_missing = training_subset.columns[training_subset.isna().any()].tolist()
for iterant in train_instances_missing:
    if training_subset[iterant].max() == 1.000000:
        training_subset[iterant].fillna(value = training_subset[iterant].mode()[0], inplace=True)
    else:
        training_subset[iterant].fillna(value = training_subset[iterant].mean(), inplace=True)
        

# Fill missing data as above for test dataset            
test_instances_missing = testing_subset.columns[testing_subset.isna().any()].tolist()
for iterant in test_instances_missing:
    if testing_subset[iterant].max() == 1.000000:
        testing_subset[iterant].fillna(value = testing_subset[iterant].mode()[0], inplace=True)
    else:
        testing_subset[iterant].fillna(value = testing_subset[iterant].mean(), inplace=True)
    
# Normalise features in training and test sets using MinMax Scaling
norm_train, norm_test = normalise_train_test(training_subset, testing_subset)
    
# Remove features that are highly correlated with another selected feature
un_corr_idx = trim_correlated(norm_train, 0.9)
print('Number of features removed due to correlation {0:2d}'. format((fold_norm_train.shape[1] - len(un_corr_idx))))
train_uncorr = norm_train[un_corr_idx]
test_uncorr = norm_test[un_corr_idx]
    
# Run feature selection
feature_aim = 15
cols_selected = select_features(train_uncorr, classification_labels, feature_aim)
train_selected = train_uncorr.iloc[:,cols_selected]
test_selected = test_uncorr.iloc[:,cols_selected]

# Setup and fit logistic regression classifier using full training dataset
LRC = LogisticRegression()
LRC.fit(train_selected, np.squeeze(classification_labels))

# make predictions on test set
testing_predictions_LRC=pd.DataFrame(data=LRC.predict(test_selected), columns=['label'])
result = testing_ids.join(testing_predictions_LRC)
# Save predictions to CSV file for sharing
result.to_csv('test_labels.csv', sep=',')

Number of features removed due to correlation 15
