FBK-NILab/DecMeg2014

0df8a03 May 20, 2014
1 contributor

Users who have contributed to this file

126 lines (100 sloc) 3.63 KB
 """DecMeg2014 example code. Simple prediction of the class labels of the test set by: - pooling all the triaining trials of all subjects in one dataset. - Extracting the MEG data in the first 500ms from when the stimulus starts. - Using a linear classifier (logistic regression). Copyright Emanuele Olivetti 2014, BSD license, 3 clauses. """ import numpy as np from sklearn.linear_model import LogisticRegression from scipy.io import loadmat def create_features(XX, tmin, tmax, sfreq, tmin_original=-0.5): """Creation of the feature space: - restricting the time window of MEG data to [tmin, tmax]sec. - Concatenating the 306 timeseries of each trial in one long vector. - Normalizing each feature independently (z-scoring). """ print "Applying the desired time window." beginning = np.round((tmin - tmin_original) * sfreq).astype(np.int) end = np.round((tmax - tmin_original) * sfreq).astype(np.int) XX = XX[:, :, beginning:end].copy() print "2D Reshaping: concatenating all 306 timeseries." XX = XX.reshape(XX.shape[0], XX.shape[1] * XX.shape[2]) print "Features Normalization." XX -= XX.mean(0) XX = np.nan_to_num(XX / XX.std(0)) return XX if __name__ == '__main__': print "DecMeg2014: https://www.kaggle.com/c/decoding-the-human-brain" print subjects_train = range(1, 7) # use range(1, 17) for all subjects print "Training on subjects", subjects_train # We throw away all the MEG data outside the first 0.5sec from when # the visual stimulus start: tmin = 0.0 tmax = 0.500 print "Restricting MEG data to the interval [%s, %s]sec." % (tmin, tmax) X_train = [] y_train = [] X_test = [] ids_test = [] print print "Creating the trainset." for subject in subjects_train: filename = 'data/train_subject%02d.mat' % subject print "Loading", filename data = loadmat(filename, squeeze_me=True) XX = data['X'] yy = data['y'] sfreq = data['sfreq'] tmin_original = data['tmin'] print "Dataset summary:" print "XX:", XX.shape print "yy:", yy.shape print "sfreq:", sfreq XX = create_features(XX, tmin, tmax, sfreq) X_train.append(XX) y_train.append(yy) X_train = np.vstack(X_train) y_train = np.concatenate(y_train) print "Trainset:", X_train.shape print print "Creating the testset." subjects_test = range(17, 24) for subject in subjects_test: filename = 'data/test_subject%02d.mat' % subject print "Loading", filename data = loadmat(filename, squeeze_me=True) XX = data['X'] ids = data['Id'] sfreq = data['sfreq'] tmin_original = data['tmin'] print "Dataset summary:" print "XX:", XX.shape print "ids:", ids.shape print "sfreq:", sfreq XX = create_features(XX, tmin, tmax, sfreq) X_test.append(XX) ids_test.append(ids) X_test = np.vstack(X_test) ids_test = np.concatenate(ids_test) print "Testset:", X_test.shape print clf = LogisticRegression(random_state=0) # Beware! You need 10Gb RAM to train LogisticRegression on all 16 subjects! print "Classifier:" print clf print "Training." clf.fit(X_train, y_train) print "Predicting." y_pred = clf.predict(X_test) print filename_submission = "submission.csv" print "Creating submission file", filename_submission f = open(filename_submission, "w") print >> f, "Id,Prediction" for i in range(len(y_pred)): print >> f, str(ids_test[i]) + "," + str(y_pred[i]) f.close() print "Done."