In [1]:
###### -*- coding: utf-8 -*-
'''Sample script for Kaggle "Energy Connections" competition

Author:       Kyle Bradbury
Date:         September 27, 2018
Organization: Duke University Energy Initiative
'''

'''
Import the packages needed for classification
'''
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from os import path
plt.close()

'''
Set directory parameters
'''
# Set the directories for the data and the CSV files that contain ids/labels
base_dir = '/Users/Atsushi/Documents/BassConnections/KaggleData/Kaggle_First'
dir_train_images  = path.join(base_dir, 'train')
dir_test_images   = path.join(base_dir, 'test')
dir_train_labels  = path.join(base_dir, 'train.csv')
dir_test_ids      = path.join(base_dir, 'sample_submission.csv')

'''
Include the functions used for loading, preprocessing, features extraction, 
classification, and performance evaluation
'''

def load_data(dir_data, dir_labels, training=True):
    '''Read every image listed in a CSV file into a single in-memory array

    dir_data   = directory that contains the image files
    dir_labels = CSV file read with pandas; its "id" column supplies the
                 file name of each image inside dir_data
    training   = when True the CSV's "label" column is returned alongside
                 the images, otherwise the ids are returned instead

    All images are held in memory at once, which is only practical for a
    small dataset.

    Returns (data, labels) when training is True, else (data, ids)
    '''
    table = pd.read_csv(dir_labels)
    ids = table.id.values

    # Read the images in the same order as the rows of the CSV
    images = [mpl.image.imread(path.join(dir_data, name)) for name in ids]
    data = np.array(images)

    if not training:
        return data, ids
    return data, table.label.values
    

def preprocess_and_extract_features(data):
    '''Preprocess data and extract features

    Preprocess: average over axis 3 (the colour channels) to get a
    grayscale image per sample, then flatten each image to a vector
    Extract features: minimum, maximum, mean and standard deviation of the
    grayscale pixel values of each sample

    data = 4-D stack of images with samples on axis 0 and colour channels
           on axis 3

    Returns a 2-D array with one row per sample and the four columns
    [min, max, mean, std]
    '''
    # Make the images grayscale by averaging the colour channels
    ndata = np.mean(data, axis=3)

    # Vectorize the grayscale matrices: one row of pixel values per sample
    vectorized_data = ndata.reshape(ndata.shape[0], -1)

    # Extract the min, max, mean and standard deviation of each sample
    feature_min  = np.min(vectorized_data, axis=1)
    feature_max  = np.max(vectorized_data, axis=1)
    feature_mean = np.mean(vectorized_data, axis=1)
    feature_std  = np.std(vectorized_data, axis=1)

    # Combine the extracted features into a single feature matrix
    features = np.stack((feature_min, feature_max, feature_mean, feature_std),
                        axis=-1)
    return features

def set_classifier():
    '''Build the classifier shared by performance evaluation and testing

    Returns a new support vector classifier with a linear kernel, C=100,
    gamma=0.001 and probability estimates switched on. The rest of this
    script calls predict_proba on the returned object.
    '''
    settings = {
        'probability': True,
        'gamma': 0.001,
        'kernel': "linear",
        'C': 100,
    }
    return svm.SVC(**settings)

def cv_performance_assessment(X,y,k,clf,random_state=None):
    '''Cross validated performance assessment
    
    X   = training data
    y   = training labels
    k   = number of folds for cross validation
    clf = classifier to use
    random_state = optional seed for the fold shuffling; the default of
                   None leaves the shuffle unseeded, so fold membership
                   may differ from run to run
    
    Divide the training data into k stratified folds of training and
    validation data. For each fold the classifier will be trained on the
    training data and tested on the validation data. The classifier
    prediction scores are aggregated and output.
    
    Returns a float array with one entry per element of y: column 1 of
    predict_proba for that sample, produced by the fold in which the sample
    was held out for validation.
    '''
    # The scores are probabilities, so keep them in a numeric buffer rather
    # than an array of Python objects
    prediction_scores = np.empty(y.shape[0], dtype=float)

    # Establish the k folds
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf.split(X, y), start=1):
        # Extract the training and validation data for this fold
        X_train, X_val   = X[train_index], X[val_index]
        y_train          = y[train_index]
        
        # Train the classifier (the same object is fit again on every fold)
        X_train_features = preprocess_and_extract_features(X_train)
        clf              = clf.fit(X_train_features,y_train)
        
        # Test the classifier on the validation data for this fold
        X_val_features   = preprocess_and_extract_features(X_val)
        cpred            = clf.predict_proba(X_val_features)
        
        # Save the predictions for this fold at the validation positions
        prediction_scores[val_index] = cpred[:,1]
        print('Training Fold {} of {} Complete'.format(fold,k))
    return prediction_scores

def plot_roc(labels, prediction_scores):
    '''Plot the ROC curve for a set of prediction scores

    labels            = true labels; 1 is treated as the positive class
    prediction_scores = classifier scores, one per label

    The curve is drawn on the current matplotlib axes next to a dashed
    diagonal "Chance" line, and its legend entry reports the AUC.
    '''
    false_pos, true_pos, _ = metrics.roc_curve(
        labels, prediction_scores, pos_label=1)
    area = metrics.roc_auc_score(labels, prediction_scores)

    ax = plt.gca()

    # Diagonal reference line first, then the ROC curve itself
    ax.plot([0, 1], [0, 1], '--', color='gray', label='Chance')
    ax.plot(false_pos, true_pos, label='AUC = {:0.3f}'.format(area))

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.grid(True)
    ax.axis('square')
    ax.legend()
    plt.tight_layout()


'''
Sample script for running cross-validated performance assessment
'''
# Set parameters for the analysis
# (number of stratified folds passed to cv_performance_assessment as k)
num_training_folds = 40

# Load the training images and their labels into memory
data, labels = load_data(dir_train_images, dir_train_labels, training=True)
print('Data Loaded')

# Choose which classifier to use
clf = set_classifier()

# Perform cross validated performance assessment; the result holds one
# held-out prediction score per training sample
prediction_scores = cv_performance_assessment(data,labels,num_training_folds,clf)

# Compute and plot the ROC curves
plot_roc(labels, prediction_scores)


'''
Sample script for producing a Kaggle submission
'''

# Controls whether the Kaggle submission below is produced; set this to
# False to skip retraining on the full training set and writing the CSV
produce_submission = True 

if produce_submission:
    # Load data, extract features, and train the classifier on the training data
    training_data, training_labels = load_data(dir_train_images, dir_train_labels, training=True)
    training_features              = preprocess_and_extract_features(training_data)
    clf                            = set_classifier()
    clf.fit(training_features,training_labels)

    # Load the test data and test the classifier
    # (with training=False, load_data returns the ids instead of labels)
    test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
    test_features  = preprocess_and_extract_features(test_data)
    # Keep column 1 of predict_proba as the score for each test id
    test_scores    = clf.predict_proba(test_features)[:,1]

    # Save the predictions to a CSV file for upload to Kaggle
    # ('submission.csv' is a relative path, with one id/score row per test image)
    submission_file = pd.DataFrame({'id':    ids,
                                   'score':  test_scores})
    submission_file.to_csv('submission.csv',
                           columns=['id','score'],
                           index=False)


Data Loaded
Training Fold 1 of 40 Complete
Training Fold 2 of 40 Complete
Training Fold 3 of 40 Complete
Training Fold 4 of 40 Complete
Training Fold 5 of 40 Complete
Training Fold 6 of 40 Complete
Training Fold 7 of 40 Complete
Training Fold 8 of 40 Complete
Training Fold 9 of 40 Complete
Training Fold 10 of 40 Complete
Training Fold 11 of 40 Complete
Training Fold 12 of 40 Complete
Training Fold 13 of 40 Complete
Training Fold 14 of 40 Complete
Training Fold 15 of 40 Complete
Training Fold 16 of 40 Complete
Training Fold 17 of 40 Complete
Training Fold 18 of 40 Complete
Training Fold 19 of 40 Complete
Training Fold 20 of 40 Complete
Training Fold 21 of 40 Complete
Training Fold 22 of 40 Complete
Training Fold 23 of 40 Complete
Training Fold 24 of 40 Complete
Training Fold 25 of 40 Complete
Training Fold 26 of 40 Complete
Training Fold 27 of 40 Complete
Training Fold 28 of 40 Complete
Training Fold 29 of 40 Complete
Training Fold 30 of 40 Complete
Training Fold 31 of 40 Complete
Train