### Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np

# Add models here
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers

# Sklearn imports for processing and evaluation
from sklearn.decomposition import PCA
from sklearn.metrics import median_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Plotting
from matplotlib import pyplot as plt

# Misc
import time
from tqdm import tqdm

### Load Data

In [3]:
x_train = np.loadtxt("/content/drive/MyDrive/uci_har_dataset/train/X_train.txt")
y_train = np.loadtxt("/content/drive/MyDrive/uci_har_dataset/train/y_train.txt")

x_test = np.loadtxt("/content/drive/MyDrive/uci_har_dataset/test/X_test.txt")
y_test = np.loadtxt("/content/drive/MyDrive/uci_har_dataset/test/y_test.txt")

### Combine Data

In [4]:
combined_x = np.concatenate((x_train, x_test))
combined_y = np.concatenate((y_train, y_test))

assert combined_x.shape[0] == x_train.shape[0] + x_test.shape[0]
assert combined_y.shape[0] == y_train.shape[0] + y_test.shape[0]

### Baseline model

#### Neural Network

In [5]:
def do_kfoldNN(x_data, y_data, features='all', folds=10):
    start = time.time()
    # Do KFold
    kf = KFold(folds)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(x_data))):    
        
        # Do PCA
        features = x_data.shape[1] if(features == 'all') else features
        pca = PCA(n_components=features)
        scalar = StandardScaler().fit(x_data[train_idx])
        standardized = scalar.transform(x_data[train_idx])
        transformed = pca.fit_transform(standardized)
        
        # Fit and time model
        
        x = x_data[train_idx].astype('float32')
        n_features = x_data[train_idx].shape[1]
        k1 = transformed.shape[1]
        print(k1)
        model = Sequential()
        model.add(Dense(k1, activation='relu', kernel_initializer='he_normal', input_shape=(k1,)))
        model.add(Dense(384, activation='relu',  kernel_initializer='he_normal'))
        model.add(Dense(6,activation='softmax'))
        
        optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')
        
        model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        print(transformed.shape, y_data[train_idx.shape])
        #raise
        print(np.unique(y_data[train_idx]))
        model.fit(transformed, y_data[train_idx], batch_size = 30, epochs = 5)
        # Transform test data and evaluate
        test_transform = np.dot(scalar.transform(x_data[test_idx]), pca.components_.T)
        preds = model.predict(test_transform)
        #print(preds)
        #print(y_data[test_idx])
        loss, score = model.evaluate(test_transform, y_data[test_idx], verbose=0)
        #score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        fit_times.append(time.time() - start)
        
    print(r"{0} FEATURES PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(features, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [6]:
#for i in range (0,len(combined_y)):
#    combined_y[i] = combined_y[i]-1

combined_y_mod = combined_y - 1
pred_accuracyNN, fit_timesNN = do_kfoldNN(combined_x, combined_y_mod, features=561)

scores_nn = np.mean(pred_accuracyNN)
scores_nn_std = np.std(pred_accuracyNN)
time_nn = np.mean(fit_timesNN)

np.save("/content/drive/MyDrive/uci_har_dataset/results/nn_score.npy", pred_accuracyNN)
np.save("/content/drive/MyDrive/uci_har_dataset/results/nn_time.npy", fit_timesNN)

print(f"The accuracy is {np.mean(scores_nn)*100:1.3f}% with a standard deviation of {scores_nn_std:1.3f}")
print(f"The time taken to complete the model is {time_nn:1.2f}s")

0it [00:00, ?it/s]

561
(9269, 561) 3.0
[0. 1. 2. 3. 4. 5.]
Epoch 1/5
Epoch 2/5

0it [00:05, ?it/s]


KeyboardInterrupt: ignored

#### CV Code

In [None]:
def do_kfold(model, x_data, y_data, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    np.random.seed(seed)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 

        scalar = StandardScaler().fit(x_data[train_idx])
        standardized = scalar.transform(x_data[train_idx])

        # Fit and time model
        start = time.time()
        model.fit(standardized, y_data[train_idx])
        fit_times.append(time.time() - start)
        
        # Score model
        preds = model.predict(scalar.transform(x_data[test_idx]))
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} POINTS PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(len(y_data), np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

#### Logistic Regression

In [None]:
# prepare the cross-validation procedure
t0 = time.time()
cv = KFold(n_splits=10)
# create model
model = LogisticRegression(multi_class='ovr', solver='liblinear')
# evaluate model
scaler = StandardScaler()
pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
#scores = cross_val_score(pipeline, combined_x, combined_y, scoring='accuracy', cv=cv)

scores, times = do_kfold(model, combined_x, combined_y)
scores_lr = np.mean(scores)
scores_lr_std = np.std(scores)
#print(f"The accuracy is {np.mean(scores_lr)*100:1.3f}% with a standard deviation of {np.std(scores_lr):1.3f}")
#t1 = time.time()
#t1 = time.time()
#time_lg = t1 - t0
#print(f"The time taken to complete the model is {time_lg:1.2f}s")

np.save("/content/drive/MyDrive/uci_har_dataset/results/lr_score.npy", scores)
np.save("/content/drive/MyDrive/uci_har_dataset/results/lr_time.npy", times)

#### SVM Linear

In [None]:
tsvm_0 = time.time()
cv = KFold(n_splits=10)
# create model
model = svm.SVC(kernel='linear', C=1)
# evaluate model
#scaler = StandardScaler()
#pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
#scores = cross_val_score(pipeline, combined_x, combined_y, scoring='accuracy', cv=cv)
#scores_svm_linear = np.mean(scores)
#scores_svm_linear_std = np.std(scores)
scores, times = do_kfold(model, combined_x, combined_y)


#print(f"The accuracy is {np.mean(scores_svm_linear)*100:1.3f}% with a standard deviation of {scores_svm_linear_std:1.3f}")
#tsvm_1 = time.time()
#time_svm_linear = tsvm_1 - tsvm_0
#print(f"The time taken to complete the model is {time_svm_linear:1.2f}s")

np.save("/content/drive/MyDrive/uci_har_dataset/results/lin_score.npy", scores)
np.save("/content/drive/MyDrive/uci_har_dataset/results/lin_time.npy", times)

#### SVM Poly

In [None]:
tsvm_0 = time.time()
cv = KFold(n_splits=10)
# create model
model = svm.SVC(kernel='poly', C=1)
# evaluate model
#scaler = StandardScaler()
#pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
#scores = cross_val_score(pipeline, combined_x, combined_y, scoring='accuracy', cv=cv)
#scores_svm_linear = np.mean(scores)
#scores_svm_linear_std = np.std(scores)
scores, times = do_kfold(model, combined_x, combined_y)


#print(f"The accuracy is {np.mean(scores_svm_linear)*100:1.3f}% with a standard deviation of {scores_svm_linear_std:1.3f}")
#tsvm_1 = time.time()
#time_svm_linear = tsvm_1 - tsvm_0
#print(f"The time taken to complete the model is {time_svm_linear:1.2f}s")

np.save("/content/drive/MyDrive/uci_har_dataset/results/poly_score.npy", scores)
np.save("/content/drive/MyDrive/uci_har_dataset/results/poly_time.npy", times)

#### Random Forest

In [None]:
trf_0 = time.time()
cv = KFold(n_splits=5)
# create model
model = RandomForestClassifier()
# evaluate model
#scaler = StandardScaler()
#pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
#scores = cross_val_score(pipeline, combined_x, combined_y, scoring='accuracy', cv=cv)
#scores_svm_poly = np.mean(scores)
#scores_rf = np.mean(scores)
#scores_rf_std = np.std(scores)
scores, times = do_kfold(model, combined_x, combined_y)

#print(f"The accuracy is {scores_rf*100:1.3f}% with a standard deviation of {scores_rf_std:1.3f}")
#trf_1 = time.time()
#time_rf = trf_1 - trf_0
#print(f"The time taken to complete the model is {time_rf:1.2f}s")

np.save("/content/drive/MyDrive/uci_har_dataset/results/rf_score.npy", scores)
np.save("/content/drive/MyDrive/uci_har_dataset/results/rf_time.npy", times)

#### Gradient Boost

In [None]:
tgb_0 = time.time()
cv = KFold(n_splits=5)
# create model
model = GradientBoostingClassifier()
# evaluate model
scaler = StandardScaler()
pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
#scores = cross_val_score(pipeline, combined_x, combined_y, scoring='accuracy', cv=cv)
#scores_svm_poly = np.mean(scores)
#scores_gb = np.mean(scores)
#scores_gb_std = np.std(scores)

#print(f"The accuracy is {scores_gb*100:1.3f}% with a standard deviation of {scores_gb_std:1.3f}")
#tgb_1 = time.time()

#tgb_1 = time.time()
#time_gb = tgb_1 - tgb_0
#print(f"The time taken to complete the model is {time_gb:1.2f}s")

#### Plots comparing accuracy and time to run the alogrithms

In [None]:
#run_time = np.array([time_lg, time_svm_linear, time_svm_poly, time_rf, time_nn, time_gb])
#accuracies =  np.array([scores_lr, scores_svm_linear, scores_svm_poly, scores_rf, scores_nn, scores_gb])
#accuracy_std = np.array([scores_lr_std, scores_svm_linear_std, scores_svm_poly_std, scores_rf_std, scores_nn_std, scores_gb_std])
#name_models = ['Log Reg', 'Linear SVM', 'Poly SVM','Rand For', 'Neural Nets', 'Gradient Boost']

name_model_wgb = ['Log Reg', 'Linear SVM', 'Poly SVM','Rand For', 'Neural Nets']
accuracies_wgb =  np.array([scores_lr, scores_svm_linear, scores_svm_poly, scores_rf, scores_nn])
accuracy_std_wgb = np.array([scores_lr_std, scores_svm_linear_std, scores_svm_poly_std, scores_rf_std, scores_nn_std])
run_time_wgb = np.array([time_lg, time_svm_linear, time_svm_poly, time_rf, time_nn])

#plt.figure
#plt.bar(name_models, run_time)
#plt.title('Time taken to run alogrithm')
#plt.ylabel('Time (sec)')
#plt.show()

plt.figure
plt.bar(name_model_wgb, run_time_wgb)
plt.title('Time taken to run alogrithms without Gradient Boost')
plt.ylabel('Time (sec)')
plt.show()

#plt.figure
#plt.bar(name_models, accuracies, yerr=accuracy_std)
#plt.title('Baseline accuracies of each alogrithm')
#plt.ylabel('Accuracy')
#plt.show()

plt.figure
plt.bar(name_model_wgb, accuracies_wgb, yerr=accuracy_std_wgb)
plt.title('Baseline accuracies without Gradient Boost')
plt.ylabel('Accuracy')
plt.show()

plt.plot(time_lg,scores_lr, 'ro', label = 'Logistic Regression')
plt.plot(time_svm_linear,scores_svm_linear, 'bo', label = 'Linear SVM')
plt.plot(time_svm_poly,scores_svm_poly, 'go', label = 'Poly SVM')
plt.plot(time_rf,scores_rf, 'yo', label = 'Random Forest')
plt.plot(time_nn,scores_nn, 'ko', label = 'Neural Network')
#plt.plot(time_gb,scores_gb, 'mo', label = 'Gradient Boost')
plt.xlabel('Time ')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Part 2
### Comparing feature selection and feature extraction

### Feature selection using probabilistic mutual information

In [None]:
def do_kfold_mutual_info(model, x_data, y_data, features='all', folds=10):
     
    # Do KFold
    kf = KFold(folds)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(x_data))):    
        
        # Do Mutual Information Feature Extraction
        features = x_data.shape[1] if(features == 'all') else features
        fs = SelectKBest(score_func=mutual_info_classif, k=features)
        fs.fit(x_data[train_idx], y_data[train_idx])
        x_train_fs = fs.transform(x_data[train_idx])
        x_test_fs = fs.transform(x_data[test_idx])

        # Standardize data
        scaler = StandardScaler().fit(x_train_fs)
        transformed = scaler.transform(x_train_fs)
        
        # Fit and time model
        start = time.time()
        model.fit(transformed, y_data[train_idx])
        fit_times.append(time.time() - start)
        
        # Transform test data and evaluate

        preds = model.predict(scaler.transform(x_test_fs))
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} FEATURES PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(features, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

### Training for Different Number of Mutual information

In [None]:
# Number of PCA Components
num_features = np.arange(5, 500, 20)

results = []
times = []
for features in num_features:
    print(features)
    pred_accuracy, fit_times = do_kfold_mutual_info(svm.SVC(kernel='linear', C=1), combined_x, combined_y, features=features)
    results.append(pred_accuracy)
    times.append(fit_times)

# Get means
ml_acc_mean = np.mean(results, axis=1)
ml_acc_stds = np.std(results, axis=1)
ml_time_mean = np.mean(times, axis=1)
ml_time_stds = np.std(times, axis=1)

"/content/drive/MyDrive/uci_har_dataset/test/X_test.txt"
np.save("/content/drive/MyDrive/uci_har_dataset/svm_linear_results.npy", results)
np.save("/content/drive/MyDrive/uci_har_dataset/mi_results/svm_linear_time.npy", times)

In [None]:
!zip -r pca_results.zip pca_results/
!zip -r subsampled_results.zip subsampled_results/
!zip -r mi_results.zip mi_results/

### Feature extraction using PCA

In [None]:
def do_kfold_pca(model, x_data, y_data, features='all', folds=10):
     
    # Do KFold
    kf = KFold(folds)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(x_data))):    
        
        # Do PCA
        start = time.time()
        features = x_data.shape[1] if(features == 'all') else features
        pca = PCA(n_components=features)
        scalar = StandardScaler().fit(x_data[train_idx])
        standardized = scalar.transform(x_data[train_idx])
        transformed = pca.fit_transform(x_data[train_idx])
        
        # Fit and time model
        model.fit(transformed, y_data[train_idx])
        fit_times.append(time.time() - start)
        
        # Transform test data and evaluate
        test_transform = np.dot(scalar.transform(x_data[test_idx]), pca.components_.T)
        #test_transform = np.dot(x_data[test_idx], pca.components_.T)
        preds = model.predict(test_transform)
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} FEATURES PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(features, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
num_features = np.arange(5, 500, 20)
results = []
times = []
for features in num_features:
    pred_accuracy, fit_times = do_kfold_pca(svm.SVC(kernel='linear', C=1), combined_x, combined_y, features=features)
    results.append(pred_accuracy)
    times.append(fit_times)

# Get means
pca_acc_mean = np.mean(results, axis=1)
pca_acc_stds = np.std(results, axis=1)
pca_time_mean = np.mean(times, axis=1)
pca_time_stds = np.std(times, axis=1)

np.save("/content/drive/MyDrive/uci_har_dataset/results/lin_pre_pca_score.npy", results)
np.save("/content/drive/MyDrive/uci_har_dataset/results/lin_pre_pca_time.npy", times)

### Psuedo Data

In [None]:
def do_kfold_psudodata(model, x_data, y_data, num_points=500, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    
    np.random.seed(seed)
    
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 
        
        # Get mean and std for each label
        x_1 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 0)[:,0]]
        x_2 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 1)[:,0]]
        x_3 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 2)[:,0]]
        x_4 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 3)[:,0]]
        x_5 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 4)[:,0]]
        x_6 = combined_x[train_idx][np.argwhere(combined_y[train_idx] == 5)[:,0]]

        mean_1 = np.mean(x_1, axis=0)
        std_1 = np.std(x_1, axis=0)
        mean_2 = np.mean(x_2, axis=0)
        std_2 = np.std(x_2, axis=0)
        mean_3 = np.mean(x_3, axis=0)
        std_3 = np.std(x_3, axis=0)
        mean_4 = np.mean(x_4, axis=0)
        std_4 = np.std(x_4, axis=0)
        mean_5 = np.mean(x_5, axis=0)
        std_5 = np.std(x_5, axis=0)
        mean_6 = np.mean(x_6, axis=0)
        std_6 = np.std(x_6, axis=0)
    
        # Sample Pseudodata
        sampled_x1 = np.random.multivariate_normal(mean_1, std_1*np.eye(len(std_1)), num_points)
        sampled_x2 = np.random.multivariate_normal(mean_2, std_2*np.eye(len(std_2)), num_points)
        sampled_x3 = np.random.multivariate_normal(mean_3, std_3*np.eye(len(std_3)), num_points)
        sampled_x4 = np.random.multivariate_normal(mean_4, std_4*np.eye(len(std_4)), num_points)
        sampled_x5 = np.random.multivariate_normal(mean_5, std_5*np.eye(len(std_5)), num_points)
        sampled_x6 = np.random.multivariate_normal(mean_6, std_6*np.eye(len(std_6)), num_points)

        # Combine pseudodata
        sampled_combined = np.concatenate([sampled_x1, sampled_x2, sampled_x3, sampled_x4, sampled_x5, sampled_x6])

        # Get labels
        labels = np.ones(len(sampled_combined))
        for i in range(1,7):
            labels[(i-1)*num_points:(i)*num_points] *= i

        # Fit and time model
        start = time.time()
        model.fit(sampled_combined, labels)
        fit_times.append(time.time() - start)
        
        # Score model
        preds = model.predict(x_data[test_idx])
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} POINTS PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(num_points, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
# Number of Psuedo data points
num_points =np.arange(5, 500, 20)

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_psudodata(svm.SVC(kernel='linear', C=1), combined_x, combined_y, num_points=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
psd_acc_mean = np.mean(results, axis=1)
psd_acc_stds = np.std(results, axis=1)
psd_time_mean = np.mean(times, axis=1)
psd_time_stds = np.std(times, axis=1)

In [None]:
# Plot prediction accuracies
fig, ax = plt.subplots()
ax.plot(num_features, pca_acc_mean, label= 'PCA')
ax.plot(num_features, ml_acc_mean, label = 'Mutual Information')
# ax.plot(num_features, psd_acc_mean, label = 'Psuedo Data')
ax.fill_between(num_features, ml_acc_mean+ml_acc_stds, ml_acc_mean-ml_acc_stds, alpha=0.3)
ax.fill_between(num_features, pca_acc_mean+pca_acc_stds, pca_acc_mean-pca_acc_stds, alpha=0.3)
# ax.fill_between(num_features, psd_acc_mean+psd_acc_stds, psd_acc_mean-psd_acc_stds, alpha=0.3)
ax.set(title="Prediction Accuracies", xlabel="Number of Features", ylabel="Prediction Acciracy")
plt.legend()
plt.show()

# Plot timing
fig, ax = plt.subplots()
ax.plot(num_features, pca_time_mean, label= 'PCA')
ax.fill_between(num_features, pca_time_mean+pca_time_stds, pca_time_mean-pca_time_stds, alpha=0.3)
ax.plot(num_features, ml_time_mean, label = 'Mutual Information')
ax.fill_between(num_features, ml_time_mean+ml_time_stds, ml_time_mean-ml_time_stds, alpha=0.3)
ax.plot(num_features, psd_time_mean, label = 'Psuedo Data')
ax.fill_between(num_features, psd_time_mean+psd_time_stds, psd_time_mean-psd_time_stds, alpha=0.3)
ax.set(title="Training Time", xlabel="Number of Features", ylabel="Time (s)")
plt.legend()
plt.show()

### Subsampled Data

In [None]:
def do_kfold_subsampled(model, x_data, y_data, num_points=100, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    
    np.random.seed(seed)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 
        
        # Get idxs of each label
        idx_1 = np.argwhere(combined_y[train_idx] == 1)[:,0]
        idx_2 = np.argwhere(combined_y[train_idx] == 2)[:,0]
        idx_3 = np.argwhere(combined_y[train_idx] == 3)[:,0]
        idx_4 = np.argwhere(combined_y[train_idx] == 4)[:,0]
        idx_5 = np.argwhere(combined_y[train_idx] == 5)[:,0]
        idx_6 = np.argwhere(combined_y[train_idx] == 6)[:,0]
        
        # Randomly sample num_points from each index
        idx1 = np.random.choice(idx_1, num_points)
        idx2 = np.random.choice(idx_2, num_points)
        idx3 = np.random.choice(idx_3, num_points)
        idx4 = np.random.choice(idx_4, num_points)
        idx5 = np.random.choice(idx_5, num_points)
        idx6 = np.random.choice(idx_6, num_points)
    
        # Sample data
        sampled_x1 = x_data[train_idx][idx1]
        sampled_x2 = x_data[train_idx][idx2]
        sampled_x3 = x_data[train_idx][idx3]
        sampled_x4 = x_data[train_idx][idx4]
        sampled_x5 = x_data[train_idx][idx5]
        sampled_x6 = x_data[train_idx][idx6]

        # Combine subsampled data
        sampled_combined = np.concatenate([sampled_x1, sampled_x2, sampled_x3, sampled_x4, sampled_x5, sampled_x6])

        # Get labels
        labels = np.ones(len(sampled_combined))
        for i in range(1,7):
            labels[(i-1)*num_points:(i)*num_points] *= i

        # Fit and time model
        start = time.time()
        model.fit(sampled_combined, labels)
        fit_times.append(time.time() - start)
        
        # Score model
        preds = model.predict(x_data[test_idx])
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} POINTS PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(num_points, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
# Number of Psuedo data points
num_points =np.arange(5, 500, 20)

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_subsampled(svm.SVC(kernel='linear', C=1), combined_x, combined_y, num_points=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/svm_linear_accuracy.npy", results)
np.save("./subsampled_results/svm_linear_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

In [None]:
# Number of Psuedo data points
num_points =np.arange(5, 500, 20)

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_subsampled(svm.SVC(kernel='poly', C=1), combined_x, combined_y, num_points=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/svm_poly_accuracy.npy", results)
np.save("./subsampled_results/svm_poly_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

In [None]:
# Number of Psuedo data points
num_points =np.arange(5, 500, 20)

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_subsampled(LogisticRegression(multi_class='ovr', solver='liblinear'), combined_x, combined_y, num_points=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/lr_accuracy.npy", pred_accuracy)
np.save("./subsampled_results/lr_time.npy", fit_times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

In [None]:
# Number of Psuedo data points
num_points =np.arange(5, 500, 20)

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_subsampled(RandomForestClassifier(), combined_x, combined_y, num_points=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/rf_accuracy.npy", pred_accuracy)
np.save("./subsampled_results/rf_time.npy", fit_times)

### Fractional Subsample

In [None]:
def do_kfold_frac_subsampled(model, x_data, y_data, sample_frac=0.1, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    np.random.seed(seed)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 
        
        # Sample idx
        start = time.time()
        sample_idx = np.random.choice(range(len(train_idx)), size=int(len(train_idx)*sample_frac))

        sampled_combined = x_data[train_idx][sample_idx]
        labels = y_data[train_idx][sample_idx]

        scalar = StandardScaler().fit(sampled_combined)
        standardized = scalar.transform(sampled_combined)

        # Fit and time model
        model.fit(standardized, labels)
        fit_times.append(time.time() - start)
        
        # Score model
        preds = model.predict(scalar.transform(x_data[test_idx]))
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} FRAC  {1} POINTS PREDICTION ACCURACY: {2:.3f} $\pm$ {3:.3f}".format(sample_frac, len(sample_idx), np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

#### SVM Linear

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_frac_subsampled(svm.SVC(kernel='linear', C=1), combined_x, combined_y, sample_frac=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_linear_accuracy.npy", results)
np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_linear_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

#### SVM Poly

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_frac_subsampled(svm.SVC(kernel='poly', C=1), combined_x, combined_y, sample_frac=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/svm_poly_accuracy.npy", results)
np.save("./subsampled_results/svm_poly_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

#### Logistic Regression

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_frac_subsampled(LogisticRegression(multi_class='ovr', solver='liblinear'), combined_x, combined_y, sample_frac=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/lr_accuracy.npy", results)
np.save("./subsampled_results/lr_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.plot(num_points, sub_acc_mean)
ax.fill_between(num_points, sub_acc_mean+sub_acc_stds, sub_acc_mean-sub_acc_stds, alpha=0.5)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_points, sub_time_mean)
ax.fill_between(num_points, sub_time_mean+sub_time_stds, sub_time_mean-sub_time_stds, alpha=0.5)
plt.show()

#### Random Forest

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

results = []
times = []
for num in num_points:
    pred_accuracy, fit_times = do_kfold_frac_subsampled(RandomForestClassifier(), combined_x, combined_y, sample_frac=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/rf_accuracy.npy", results)
np.save("./subsampled_results/rf_time.npy", times)

#### NN

In [None]:
def do_kfold_nn_frac_subsampled(model, x_data, y_data, sample_frac=0.1, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    np.random.seed(seed)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 
        
        # Sample idx
        sample_idx = np.random.choice(range(len(train_idx)), size=int(len(train_idx)*sample_frac))

        sampled_combined = x_data[train_idx][sample_idx].astype('float32')
        labels = y_data[train_idx][sample_idx]

        # Fit and time model
        start = time.time()
        print(labels.shape)
        model.fit(sampled_combined, labels, batch_size = 30, epochs = 5)

        fit_times.append(time.time() - start)
        
        # Score model
        #preds = model.predict(x_data[test_idx])
        #score = accuracy_score(preds, y_data[test_idx])
        loss, score = model.evaluate(x_data[test_idx], y_data[test_idx], verbose=0)
        pred_accuracy.append(score)
        
    print(r"{0} FRAC  {1} POINTS PREDICTION ACCURACY: {2:.3f} $\pm$ {3:.3f}".format(sample_frac, len(sample_idx), np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

results = []
times = []
for num in num_points:
    model = Sequential()
    model.add(Dense(561, activation='relu', kernel_initializer='he_normal', input_shape=(561,)))
    model.add(Dense(384, activation='relu',  kernel_initializer='he_normal'))
    model.add(Dense(6,activation='softmax'))
        
    optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')
        
    model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    #model.fit(transformed, y_data[train_idx], batch_size = 30, epochs = 5)

    print(combined_x.shape)
    pred_accuracy, fit_times = do_kfold_nn_frac_subsampled(model, combined_x, combined_y-1, sample_frac=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
sub_acc_mean = np.mean(results, axis=1)
sub_acc_stds = np.std(results, axis=1)
sub_time_mean = np.mean(times, axis=1)
sub_time_stds = np.std(times, axis=1)

np.save("./subsampled_results/nn_accuracy.npy", results)
np.save("./subsampled_results/nn_time.npy", times)

In [None]:
svm_lin = np.load("./subsampled_results/svm_linear_accuracy.npy")
lin_mean = np.mean(svm_lin, axis=1)
lin_std = np.std(svm_lin, axis=1)

svm_poly = np.load("./subsampled_results/svm_poly_accuracy.npy")
poly_mean = np.mean(svm_poly, axis=1)
poly_std = np.std(svm_poly, axis=1)

lr = np.load("./subsampled_results/lr_accuracy.npy")
lr_mean = np.mean(lr, axis=1)
lr_std = np.std(lr, axis=1)

rf = np.load("./subsampled_results/rf_accuracy.npy")
rf_mean = np.mean(rf, axis=1)
rf_std = np.std(rf, axis=1)

nn = np.load("./subsampled_results/nn_accuracy.npy")
nn_mean = np.mean(nn, axis=1)
nn_std = np.std(nn, axis=1)


fig, ax = plt.subplots()
ax.plot(num_points, lin_mean, label="SVM Linear")
ax.fill_between(num_points, lin_mean+lin_std, lin_mean-lin_std, alpha=0.3)

ax.plot(num_points, poly_mean, label="SVM Poly")
ax.fill_between(num_points, poly_mean+poly_std, poly_mean-poly_std, alpha=0.3)

ax.plot(num_points, np.mean(lr, axis=1), label="Logistic Regression")
ax.fill_between(num_points, lr_mean+lr_std, lr_mean-lr_std, alpha=0.3)

ax.plot(num_points, np.mean(rf, axis=1), label="Random Forest")
ax.fill_between(num_points, rf_mean+rf_std, rf_mean-rf_std, alpha=0.3)

ax.plot(num_points, np.mean(nn, axis=1), label="Neural Network")
ax.fill_between(num_points, nn_mean+nn_std, nn_mean-nn_std, alpha=0.3)

ax.set(ylim=(0.9, 0.97))
ax.legend(loc='best')
ax.set(xlabel="Fraction of Data Used in Training", ylabel="Prediction Accuracy")
plt.show()

In [None]:
svm_lin = np.load("./subsampled_results/svm_linear_time.npy")
lin_mean = np.mean(svm_lin, axis=1)
lin_std = np.std(svm_lin, axis=1)

svm_poly = np.load("./subsampled_results/svm_poly_time.npy")
poly_mean = np.mean(svm_poly, axis=1)
poly_std = np.std(svm_poly, axis=1)

lr = np.load("./subsampled_results/lr_time.npy")
lr_mean = np.mean(lr, axis=1)
lr_std = np.std(lr, axis=1)

rf = np.load("./subsampled_results/rf_time.npy")
rf_mean = np.mean(rf, axis=1)
rf_std = np.std(rf, axis=1)

nn = np.load("./subsampled_results/nn_time.npy")
nn_mean = np.mean(nn, axis=1)
nn_std = np.std(nn, axis=1)


fig, ax = plt.subplots()
ax.plot(num_points, lin_mean, label="SVM Linear")
ax.fill_between(num_points, lin_mean+lin_std, lin_mean-lin_std, alpha=0.3)

ax.plot(num_points, poly_mean, label="SVM Poly")
ax.fill_between(num_points, poly_mean+poly_std, poly_mean-poly_std, alpha=0.3)

ax.plot(num_points, np.mean(lr, axis=1), label="Logistic Regression")
ax.fill_between(num_points, lr_mean+lr_std, lr_mean-lr_std, alpha=0.3)

ax.plot(num_points, np.mean(rf, axis=1), label="Random Forest")
ax.fill_between(num_points, rf_mean+rf_std, rf_mean-rf_std, alpha=0.3)

ax.plot(num_points, np.mean(nn, axis=1), label="Neural Network")
ax.fill_between(num_points, nn_mean+nn_std, nn_mean-nn_std, alpha=0.3)

ax.legend(loc='best')
ax.set(xlabel="Fraction of Data Used in Training", ylabel="Training Time")
plt.show()

#### Plot for 'Ideal' Parameters

In [None]:
svm_lin = np.load("./subsampled_results/svm_linear_accuracy.npy")
lin_mean = np.mean(svm_lin, axis=1)
lin_std = np.std(svm_lin, axis=1)

svm_poly = np.load("./subsampled_results/svm_poly_accuracy.npy")
poly_mean = np.mean(svm_poly, axis=1)
poly_std = np.std(svm_poly, axis=1)

lr = np.load("./subsampled_results/lr_accuracy.npy")
lr_mean = np.mean(lr, axis=1)
lr_std = np.std(lr, axis=1)

rf = np.load("./subsampled_results/rf_accuracy.npy")
rf_mean = np.mean(rf, axis=1)
rf_std = np.std(rf, axis=1)

nn = np.load("./subsampled_results/nn_accuracy.npy")
nn_mean = np.mean(nn, axis=1)
nn_std = np.std(nn, axis=1)

svm_lintime = np.load("./subsampled_results/svm_linear_time.npy")
lin_meantime = np.mean(svm_lintime, axis=1)
lin_stdtime = np.std(svm_lintime, axis=1)

svm_polytime = np.load("./subsampled_results/svm_poly_time.npy")
poly_meantime = np.mean(svm_polytime, axis=1)
poly_stdtime = np.std(svm_polytime, axis=1)

lrtime = np.load("./subsampled_results/lr_time.npy")
lr_meantime = np.mean(lrtime, axis=1)
lr_stdtime = np.std(lrtime, axis=1)

rftime = np.load("./subsampled_results/rf_time.npy")
rf_meantime = np.mean(rftime, axis=1)
rf_stdtime = np.std(rftime, axis=1)

nntime = np.load("./subsampled_results/nn_time.npy")
nn_meantime = np.mean(nntime, axis=1)
nn_stdtime = np.std(nntime, axis=1)

best_idx = np.where(np.array(num_points)==0.25)
fig, ax = plt.subplots()
ax.scatter(lin_meantime[best_idx], lin_mean[best_idx])
ax.errorbar(lin_meantime[best_idx], lin_mean[best_idx], xerr=lin_stdtime[best_idx], yerr=lin_std[best_idx], capsize=3, fmt="none")

ax.scatter(poly_meantime[best_idx], poly_mean[best_idx])
ax.errorbar(poly_meantime[best_idx], poly_mean[best_idx], xerr=poly_stdtime[best_idx], yerr=poly_std[best_idx], capsize=3, fmt="none")

ax.scatter(lr_meantime[best_idx], lr_mean[best_idx])
ax.errorbar(lr_meantime[best_idx], lr_mean[best_idx], xerr=lr_stdtime[best_idx], yerr=lr_std[best_idx], capsize=3, fmt="none")

ax.scatter(rf_meantime[best_idx], rf_mean[best_idx])
ax.errorbar(rf_meantime[best_idx], rf_mean[best_idx], xerr=rf_stdtime[best_idx], yerr=rf_std[best_idx], capsize=3, fmt="none")

ax.scatter(nn_meantime[best_idx], nn_mean[best_idx])
ax.errorbar(nn_meantime[best_idx], nn_mean[best_idx], xerr=nn_stdtime[best_idx], yerr=nn_std[best_idx], capsize=3, fmt="none")

### Subsampled with PCA

In [None]:
def do_kfold_frac_subsampled_pca(model, x_data, y_data, sample_frac=0.1, features=20, folds=10, seed=1):
         
    # Do KFold
    kf = KFold(folds)
    np.random.seed(seed)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(list(range(len(y_data)))))): 
        
        # Sample idx
        start = time.time()
        sample_idx = np.random.choice(range(len(train_idx)), size=int(len(train_idx)*sample_frac))

        sampled_combined = x_data[train_idx][sample_idx]
        labels = y_data[train_idx][sample_idx]

        features = x_data.shape[1] if(features == 'all') else features
        pca = PCA(n_components=features)
        scalar = StandardScaler().fit(sampled_combined)
        standardized = scalar.transform(sampled_combined)
        transformed = pca.fit_transform(standardized)
        
        # Fit and time model
        model.fit(transformed, labels)
        fit_times.append(time.time() - start)

        # Transform test data and evaluate
        test_transform = np.dot(scalar.transform(x_data[test_idx]), pca.components_.T)
        preds = model.predict(test_transform)
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)      
        
    print(r"{0} FRAC  {1} POINTS PREDICTION ACCURACY: {2:.3f} $\pm$ {3:.3f}".format(sample_frac, len(sample_idx), np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
# Number of Psuedo data points
num_points = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]
num_points = [0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]
num_features = np.arange(5, 500, 20)

results = []
times = []
for num in num_features:
    pred_accuracy, fit_times = do_kfold_frac_subsampled_pca(svm.SVC(kernel='linear', C=1), combined_x, combined_y, sample_frac=0.25, features=num)
    results.append(pred_accuracy)
    times.append(fit_times)
    
# Get means
nsub_acc_mean = np.mean(results, axis=1)
nsub_acc_stds = np.std(results, axis=1)
nsub_time_mean = np.mean(times, axis=1)
nsub_time_stds = np.std(times, axis=1)

np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_pca_linear_accuracy.npy", results)
np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_pca_linear_time.npy", times)

In [None]:
fig, ax = plt.subplots()
ax.errorbar(nsub_time_mean, nsub_acc_mean, xerr=nsub_time_stds, yerr=nsub_acc_stds, capsize=3, zorder=0)
ax.scatter(nsub_time_mean, nsub_acc_mean, marker='s', s=50, edgecolor='k', linewidth=2)
plt.show()

In [None]:
fig, ax = plt.subplots()

print(len(num_features))
ax.plot(num_features, pca_acc_mean)
ax.fill_between(num_features, pca_acc_mean+pca_acc_stds, pca_acc_mean-pca_acc_stds, alpha=0.5)
ax.plot(num_features, nsub_acc_mean)
ax.fill_between(num_features, nsub_acc_mean+nsub_acc_stds, nsub_acc_mean-nsub_acc_stds, alpha=0.5)
ax.plot(num_points, sub_acc_mean)
plt.show()

fig, ax = plt.subplots()
ax.plot(num_features, pca_time_mean)
ax.fill_between(num_features, pca_time_mean+pca_time_stds, pca_time_mean-pca_time_stds, alpha=0.5)
ax.plot(num_features, nsub_time_mean)
ax.fill_between(num_features, nsub_time_mean+nsub_time_stds, nsub_time_mean-nsub_time_stds, alpha=0.5)
ax.plot(num_points, sub_time_mean)
plt.show()

### Subsampled with MI

In [None]:
def do_kfold_sub_mutual_info(model, x_data, y_data, features='all', sample_frac=0.25, folds=10):
     
    # Do KFold
    kf = KFold(folds)
    
    # KF training
    pred_accuracy = []
    fit_times = []
    for idx, (train_idx, test_idx) in tqdm(enumerate(kf.split(x_data))): 

        # Sample idx
        start = time.time()
        sample_idx = np.random.choice(range(len(train_idx)), size=int(len(train_idx)*sample_frac))

        sampled_combined = x_data[train_idx][sample_idx]
        labels = y_data[train_idx][sample_idx]   
        
        # Do Mutual Information Feature Extraction
        features = x_data.shape[1] if(features == 'all') else features
        fs = SelectKBest(score_func=mutual_info_classif, k=features)
        fs.fit(sampled_combined, labels)
        x_train_fs = fs.transform(sampled_combined)
        x_test_fs = fs.transform(x_data[test_idx])

        # Standardize data
        scaler = StandardScaler().fit(x_train_fs)
        transformed = scaler.transform(x_train_fs)
        
        # Fit and time model
        model.fit(transformed, labels)
        fit_times.append(time.time() - start)
        
        # Transform test data and evaluate

        preds = model.predict(scaler.transform(x_test_fs))
        score = accuracy_score(preds, y_data[test_idx])
        pred_accuracy.append(score)
        
    print(r"{0} FEATURES PREDICTION ACCURACY: {1:.3f} $\pm$ {2:.3f}".format(features, np.mean(pred_accuracy), np.std(pred_accuracy)))
    return pred_accuracy, fit_times

In [None]:
# Number of PCA Components
num_features = np.arange(5, 500, 20)

results = []
times = []
for features in num_features:
    print(features)
    pred_accuracy, fit_times = do_kfold_sub_mutual_info(svm.SVC(kernel='linear', C=1), combined_x, combined_y, features=features, sample_frac=0.25)
    #pred_accuracy, fit_times = do_kfold_frac_subsampled_pca(svm.SVC(kernel='linear', C=1), combined_x, combined_y, sample_frac=0.25, features=num)

    results.append(pred_accuracy)
    times.append(fit_times)

# Get means
ml_acc_mean = np.mean(results, axis=1)
ml_acc_stds = np.std(results, axis=1)
ml_time_mean = np.mean(times, axis=1)
ml_time_stds = np.std(times, axis=1)

#"/content/drive/MyDrive/uci_har_dataset/test/X_test.txt"
np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_mi_linear_accuracy.npy", results)
np.save("/content/drive/MyDrive/uci_har_dataset/results/svm_pre_sub_mi_linear_time.npy", times)

## Part 3
- Identify the optimal number of features using feature engineering technique based on time and accuracy. 

In [None]:
err = 0.003
#For PCA
for i in range (0,len(num_features)):
    if pca_acc_mean[i+1] - pca_acc_mean[i] <= err:
        PCA_f = i
        print(f'For {num_features[i]}, the accuracy is {pca_acc_mean[i]}')
        break
        
# For MI
for j in range (0,len(num_features)):
    if ml_acc_mean[j+1] - ml_acc_mean[j] <= err:
        MI_f = j
        print(f'For {num_features[j]}, the accuracy is {ml_acc_mean[j]}')
        break

### From our results above, PCA generates the best accuracy for the same number of features. However, based on the time plot, MI takes less time to run than PCA.  

In [None]:
pred_accuracy, fit_times = do_kfold_pca(svm.SVC(kernel='linear', C=1), combined_x, combined_y, features= num_features[PCA_f])
pred_accuracy
fit_times