In [1]:
# Import required modules
import glob
import scipy.io as sio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import sklearn
import analysisFunctions as af

from scipy.stats import zscore

In [2]:
# Load the framewise displacement (fd) averages for this dataset into fdAvgs,
# then build TS_path_names and indices2Keep from the fd threshold

# fd averages file (read without a header row) and the fd cut-off
filePath = '/Users/AV/Dropbox/COBRE/movementData/fdAvgs_COBRE.txt'
fdAvgs = pd.read_csv(filePath, header=None)
threshold_fd = 0.5

# Folder containing the subject data (.mat files) for this dataset
subPath = '/Users/AV/Dropbox/COBRE/cfgData/'

# Collect the subject file names and put them in alphabetical order
TS_path_names = glob.glob(subPath + '*.mat')
TS_path_names.sort()

# Keep only the subjects whose fd is below threshold_fd; the helper hands back
# the filtered path names together with the indices of the retained subjects
TS_path_names, indices2Keep = af.removePathNames(filePath, threshold_fd, TS_path_names)
indices2Keep = indices2Keep.tolist()

# Shift every index up by one to convert to MATLAB (1-based) indexing
indices2KeepMat = list(np.add(indices2Keep, 1))

# print(indices2KeepMat)

In [3]:
# Build the multi-level indexed feature table (tsData) and store some key variables

# Saved feature matrix (.txt file) and the text file containing the 22 feature names
element = 'element1_COBRE.txt'
PyFeatList = 'PythonFeatureList.txt'

# af.addIndices adds the multi-level index to the feature matrix (returned as tsData)
# and also returns the number of ROIs and subjects, plus feats and featList
(tsData, ROIs, subjects, feats, featList) = af.addIndices(
    element, subPath, PyFeatList
)

In [4]:
# Select a given feature from the dataframe.
# af.getFeatSlice is given tsData, the feature name and the indices to be kept
# (based on the threshold fd); its result, featSlice, should be a dataframe.

# Choose which feature to analyse (1-based position within featList)
feature = 1
featureName = featList[feature - 1]

featSlice = af.getFeatSlice(
    ROIs, subjects, tsData, featureName, indices2KeepMat
)

# featSlice

In [5]:
# Select a given ROI from the dataframe - replaces the analysis function 'getROISlice'.
# Uses tsData, the ROI and the indices to be kept (based on the threshold fd)
# to obtain the selected ROISlice as a dataframe.

# Select the first ROI
ROI = 1

ROISlice = tsData.loc[ROI, indices2KeepMat, :]

# ROISlice

# ROISlice.loc[:,featList[:2]] # Sub-selection

In [6]:
# Create the target column - built differently for each dataset

# Select which dataset is being used
dataset = 'COBRE'

if dataset == 'UCLA':

    # Creating the target column
    targetCol = af.getTargetCol(TS_path_names)

elif dataset == 'COBRE':

    # Creating the target column
    csvPath = '/Users/AV/Dropbox/COBRE/participants.csv'
    COBRE = pd.read_csv(csvPath, header=None)

    # The file is read with header=None, so the first row is skipped here
    # (presumably it holds the column names - confirm against the csv);
    # the labels are taken from the column at position 2
    targetCol = pd.DataFrame(data=COBRE.iloc[1:, 2].tolist(), columns=['target'])

    # Retain only the subjects that passed the fd threshold (0-based positions)
    targetCol = targetCol.iloc[indices2Keep, :]

    # np.int was only an alias of the builtin int and has been removed from
    # NumPy (1.24+), so the builtin is used directly
    targetCol = np.asarray(targetCol, dtype=int)

    # A '0' indicates a control subject and a '1' indicates a subject with SCZ
    targetColModified = np.where(targetCol == 1, 0, targetCol)  # First change the pre-existing 1s to 0s
    targetCol = np.where(targetCol == 2, 1, targetColModified)  # Then change the 2s to 1s

else:
    # Fail here rather than with a NameError on targetCol in a later cell
    raise ValueError("Unknown dataset %r: expected 'UCLA' or 'COBRE'" % (dataset,))

In [7]:
# Get the subject counts for the dataset from the target column and print them

Control, SCZ, Total, SCZ2Ctrl = af.giveMeSubjectNums(targetCol)

# One value per output line
print(
    'Control = ' + str(Control),
    'SCZ = ' + str(SCZ),
    'Total = ' + str(Total),
    'SCZ : Control = ' + str(SCZ2Ctrl) + ' : 1',
    sep='\n',
)

Control = 69
SCZ = 55
Total = 124
SCZ : Control = 0.80 : 1


In [8]:
# Choose the slice to classify and assign the data to variables
DataSlice = ROISlice  # featSlice or ROISlice

# z-score each column of the chosen slice; X refers to the z-scored dataframe
# NOTE(review): the z-scoring uses every subject, including those that later fall
# into cross-validation test folds - confirm this is intended
X = DataSlice_zscored = DataSlice.apply(zscore)

# Flatten the target column into a 1-D label vector
y = np.ravel(targetCol)

# X

In [9]:
# Perform 10-fold CV

def get10FoldCVScore(X, y, n_splits=10, class_weight=None):
    ''' Return the per-fold balanced accuracy of a linear SVM under stratified k-fold CV

    The folds come from StratifiedKFold (no shuffling) and each fold is scored with
    balanced_accuracy_score. The classifier itself is NOT class-weighted unless
    class_weight is given (e.g. class_weight='balanced').

    Parameters
    ----------
    X : pandas DataFrame
        One row per sample; rows are selected by position with .iloc.
    y : array-like
        One label per row of X. It is converted to a NumPy array so that the
        positional fold indices also work when y is a pandas Series.
    n_splits : int, optional
        Number of folds (default 10).
    class_weight : dict, 'balanced' or None, optional
        Passed straight through to SVC (default None, i.e. no re-weighting).

    Returns
    -------
    scores : NumPy array of length n_splits
        Balanced accuracy of each fold as a percentage, rounded to 2 decimal places.
    '''
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.svm import SVC

    # Positional indexing below must not depend on the index of a pandas object
    y = np.asarray(y)

    svclassifier = SVC(kernel='linear', class_weight=class_weight)
    skf = StratifiedKFold(n_splits=n_splits)

    scores = np.zeros(n_splits)

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):

        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # fit() discards any previous fit, so one classifier object can be reused
        svclassifier.fit(X_train, y_train)
        y_pred = svclassifier.predict(X_test)

        # Percentage to 2 decimal places; float() makes round() use the builtin
        # rounding, which matches the previous '{0:.2f}' string formatting
        scores[fold] = round(float(balanced_accuracy_score(y_test, y_pred) * 100), 2)

    return scores

# Run the cross-validation on the selected data
scores = get10FoldCVScore(X, y)

# Print the individual fold scores, followed by an empty line
print('10-fold CV scores as a percentage: ' + str(scores), '', sep='\n')

# Mean 10-fold CV score with an error of 1 std dev
print("Accuracy as a percentage: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

10-fold CV scores as a percentage: [53.57 36.9  44.05 44.05 55.95 58.57 58.57 51.43 58.57 26.67]

Accuracy as a percentage: 48.83 (+/- 10.22)


In [10]:
# af.getTPVals returns three outputs; the third one (sigPValInds) stores the indices
# of the five ROIs / features with the most significant p-values

(tpValDf, tpValDf_sorted, sigPValInds) = af.getTPVals(targetCol, DataSlice)

# tpValDf_sorted

In [11]:
# Show me the PCA figure

# af.showMePCAFig(DataSlice, targetCol)

In [12]:
# Show me the top five features / ROIs as violin plots in the ROI / feature being analysed

# af.showMeViolinPlts(targetCol, sigPValInds, DataSlice, 1, ROI) # When looking at ROISlices

# af.showMeViolinPlts(targetCol, sigPValInds, DataSlice, 0, feature) # When looking at featSlices

In [13]:
# Boolean switches which decide what the outputs are (all switched off here)
dispFigs = regAccOnly = featAccOnly = False

# Kept as the last expression of the cell so that its return value is displayed
af.showMeROIAccPlot(ROIs, tsData, indices2KeepMat, targetCol, dispFigs)

('50.66', '12.40')

In [14]:
# Call showMeFeatAccPlot (defined in analysisFunctions); kept as the last expression
# of the cell so that its return value is displayed

af.showMeFeatAccPlot(
    element, subPath, PyFeatList, indices2KeepMat, targetCol, dispFigs
)

('54.14', '13.94')

In [15]:
# Region by Region Analysis

# af.Reg_by_Reg_Anal(ROI, tsData, targetCol, ROIs, indices2KeepMat, regAccOnly, dispFigs)

In [16]:
# Feature by Feature Analysis

# af.Feat_by_Feat_Anal(feature, featureName, element, subPath, PyFeatList, indices2KeepMat, targetCol, featAccOnly, dispFigs)