# Google form analysis tests

Purpose: determine to what extent the current data can accurately describe correlations and the underlying factors that influence the score.
Especially concerning the answerTemporalities[0] groups: are there underlying groups explaining the discrepancies in score? Are those groups tied to certain questions?

## Table of Contents


[Sorted total answers to questions](#sortedtotalanswers)

[Cross-samples t-tests](#crossttests)

   - [biologists vs non-biologists](#biologistsvsnonbiologists)
   
   - [biologists vs non-biologists *before*](#biologistsvsnonbiologistsbefore)
   
[PCAs](#PCAs)
<br>
<br>
<br>
<br>

In [None]:
%run "../Functions/2. Google form analysis.ipynb"

## PCAs
<a id=PCAs />

Purpose: find out which questions have the most weight in the computation of the score.


Other leads: LDA, ANOVA.


Source for PCA: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html

In [None]:
# Select the sample (presumably matched pretest/posttest pairs, judging by the
# helper's name) and renumber its rows 0..n-1 so that labels equal positions.
sampledForm = getPerfectPretestPostestPairs(gform)
sampledForm.index = range(len(sampledForm))

In [None]:
# Binarized version of the sampled answers, re-indexed like sampledForm so that
# a row label designates the same answer in both tables.
binarized = getAllBinarized(_form=sampledForm)
binarized.index = sampledForm.index

In [None]:
# Display both shapes side by side (rows, columns) for a quick comparison.
sampledForm.shape, binarized.shape

In [None]:
# Score of each answer = sum of its binarized columns (dot product with ones).
# Must run before the non-numeric 'class' column is added to `binarized`.
score = np.dot(binarized, np.ones(binarized.shape[1]))

In [None]:
# Number of binarized columns, i.e. the dimensionality of the PCA input.
dimensions = len(binarized.columns)
dimensions

In [None]:
# Append a placeholder label column; it lands at position `dimensions`,
# right after the feature columns.
binarized['class'] = 'default'

In [None]:
# Split the table into the feature matrix X (first `dimensions` columns)
# and the class labels y (the column right after them).
X = binarized.iloc[:, :dimensions].values
y = binarized.iloc[:, dimensions].values

### Standardizing

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of X with scikit-learn's StandardScaler.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

## 1 - Eigendecomposition - Computing Eigenvectors and Eigenvalues

### Covariance Matrix

In [None]:
# Sample covariance computed by hand: centre the columns, then
# (centred^T . centred) / (n - 1).
mean_vec = X_std.mean(axis=0)
centered = X_std - mean_vec
cov_mat = np.dot(centered.T, centered) / (len(X_std) - 1)
print('Covariance matrix \n%s' % cov_mat)

In [None]:
# Same matrix obtained from NumPy, for comparison with the manual computation.
numpy_cov = np.cov(X_std, rowvar=False)
print('NumPy covariance matrix: \n%s' % numpy_cov)

#### eigendecomposition on the covariance matrix:

In [None]:
# Eigendecomposition of the covariance matrix of the standardized data
# (columns of X_std are the variables, hence rowvar=False).
cov_mat = np.cov(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

### Correlation Matrix

#### Eigendecomposition of the standardized data based on the correlation matrix:

In [None]:
# Eigendecomposition of the correlation matrix of the standardized data
# (columns of X_std are the variables, hence rowvar=False).
cor_mat1 = np.corrcoef(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

#### Eigendecomposition of the raw data based on the correlation matrix:

# Use the numeric feature matrix X rather than `binarized`: by this point
# `binarized` also holds the string-valued 'class' column, which cannot be
# fed to np.corrcoef.
cor_mat2 = np.corrcoef(X.T)
eig_vals, eig_vecs = np.linalg.eig(cor_mat2)

#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)

### Singular Value Decomposition

In [None]:
# SVD of the transposed standardized data. np.linalg.svd returns (u, s, vh):
# `s` holds the singular values and `v` receives the third returned matrix.
u,s,v = np.linalg.svd(X_std.T)

In [None]:
# Display the singular values.
s

## 2 - Selecting Principal Components

In [None]:
# np.linalg.eig returns the eigenvectors as the COLUMNS of eig_vecs
# (eig_vecs[:, i] pairs with eig_vals[i]), so iterate over the transpose
# to check that every eigenvector has unit length.
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')

In [None]:
# Make a list of (eigenvalue, eigenvector) tuples; eigenvector i is column i
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low.
# Sort on the eigenvalue only: a plain tuple sort falls back to comparing the
# eigenvector arrays when two eigenvalues are equal, which raises ValueError.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for eig_val, _ in eig_pairs:
    print(eig_val)

In [None]:
# Share (in %) of the total variance carried by each component, largest first,
# and the running total of those shares.
tot = sum(eig_vals)
var_exp = [100 * (eig_val / tot) for eig_val in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

componentPositions = range(dimensions)
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))

    # bars: individual shares; step line: cumulative share
    plt.bar(
        componentPositions,
        var_exp,
        alpha=0.5,
        align='center',
        label='individual explained variance',
    )
    plt.step(
        componentPositions,
        cum_var_exp,
        where='mid',
        label='cumulative explained variance',
    )
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()

In [None]:
# Explained variance (%) of the five strongest components.
var_exp[:5]

In [None]:
# Cumulative explained variance (%) after each of the five strongest components.
cum_var_exp[:5]

### Projection Matrix

In [None]:
# Projection matrix W (dimensions x 2): its columns are the eigenvectors of the
# two largest eigenvalues, taken from the sorted eig_pairs list.
topTwoEigenvectors = (eig_pairs[0][1], eig_pairs[1][1])
matrix_w = np.column_stack(topTwoEigenvectors)

print('Matrix W:\n', matrix_w)

## 3 - Projection Onto the New Feature Space

In [None]:
# List the available form columns (candidates for colouring the projection).
sampledForm.columns

In [None]:
# Fixed palette of named colours; its length is the number of classes that can
# be told apart without switching to a colormap.
colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
len(colors)

In [None]:
# Project the standardized answers onto the two selected components.
Y = np.dot(X_std, matrix_w)

In [None]:
# Plain scatter plot of the projection, without any class colouring.
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))
    ax = plt.subplot(111)
    ax.scatter(Y[:, 0], Y[:, 1])
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_title("base PCA")
    plt.show()

In [None]:
# creates a scatter plot using different colors for different classes
# answerIndices: index of 'sampledForm' and 'binarized' DataFrames
# Y: 2D position in PCA for answers
# classNames: list of class names
# classes: list of series of class-index indexed UserIds
# title: str
# rainbow: whether to use rainbow colors
# figsize: for matplotlib
def classifyAndPlot(answerIndices, Y, classNames, classes, title = '', rainbow = False, figsize = (12, 8)):
    %matplotlib nbagg
    defaultClassName = ''

    sampleSize = 0
    # sets the name of the default class
    for classIndex in range(0, len(classes)):
        sampleSize += len(classes[classIndex])
    if(sampleSize < len(answerIndices)):
        if(len(classNames) == len(classes) + 1):
            defaultClassName = classNames[-1]
        else:
            defaultClassName = 'other'
            classNames.append(defaultClassName)

    # y is the 'class' container
    y = pd.Series(index = answerIndices, data = defaultClassName)

    # set the class of each answer
    for classIndex in range(0, len(classes)):
        y[classes[classIndex].index] = classNames[classIndex]

    if (defaultClassName in y.values) and (not (defaultClassName in classNames)):
        print("unexpected error: check the exhaustiveness of the provided classes")

    with plt.style.context('seaborn-whitegrid'):
        thisFigure = plt.figure(figsize=figsize)
        ax = plt.subplot(111)

        colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
        if (rainbow or len(classNames) > len(colors)):
            colors = plt.cm.rainbow(np.linspace(1, 0, len(classNames)))
        colors = colors[:len(classNames)]

        plots = pd.Series()
        for lab, col in zip(classNames,colors):
            # y == lab is a selector:
            # Y[y==lab, 0] selects all Y.x of class lab
            # Y[y==lab, 0] selects all Y.y of class lab

            xvalues = Y[y==lab, 0]
            yvalues = Y[y==lab, 1]

            print("'" + str(lab) + "': " + str(len(xvalues)) + " values in " + str(col))

            plots.loc[lab] = plt.scatter( xvalues,
                                        yvalues,
                                        label=lab,
                                        c=col,
                                        alpha=0.2,
                                            )
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

        # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        if(len(title) > 0):
            plt.title(title)
        plt.show()

        # update function to control the alpha channel
        def updateAlpha(x):
            print(x)
            for lab in classNames:
                plots.loc[lab].set_alpha(x)
            plt.show()

        # creates the slider to control the alpha channel
        interact(updateAlpha, x=(0.0,1.0,0.01));

In [None]:
# Colour the answers whose binarized QDevicePbadRbsAraTer value is 1
# ('guessed') differently from all the others ('did not').
guessedMask = binarized[QDevicePbadRbsAraTer] == 1
answered = binarized[guessedMask]
#indices = answered.index.map(lambda label: int(label[len('corrections'):]))
indices = answered.index
surveys = sampledForm.iloc[indices][localplayerguidkey]
classifyAndPlot(
    sampledForm.index,
    Y,
    ['guessed', 'did not'],
    [surveys],
)

In [None]:
# Biologists against everybody else.
biologistSurveys = getSurveysOfBiologists(sampledForm, True)[localplayerguidkey]
classifyAndPlot(
    sampledForm.index,
    Y,
    ['biologist', 'other'],
    [biologistSurveys],
    title = 'biologists and non-biologists',
)

In [None]:
# Gamers against everybody else.
gamerSurveys = getSurveysOfGamers(sampledForm, True)[localplayerguidkey]
classifyAndPlot(
    sampledForm.index,
    Y,
    ['gamer', 'other'],
    [gamerSurveys],
    title = 'gamers and non-gamers',
)

In [None]:
# One class per distinct answer to QInterestBiology (most frequent first),
# plus a trailing 'other' name for answers belonging to no class.
biologyInterestAnswers = sampledForm[QInterestBiology].value_counts().index
classNames = list(biologyInterestAnswers) + ['other']
classes = [
    sampledForm[sampledForm[QInterestBiology] == answer][localplayerguidkey]
    for answer in biologyInterestAnswers
]
classifyAndPlot(sampledForm.index, Y, classNames, classes, rainbow = True, title = 'interest in biology')

### TODO: find a simple way to plot scores

In [None]:
#np.plot(score)

In [None]:
# One class per distinct score value; members are looked up by row position,
# which equals the row label because sampledForm is indexed 0..n-1.
uniqueScores = np.unique(score)
classNames = list(uniqueScores)
classes = [
    sampledForm.loc[np.flatnonzero(score == thisScore)][localplayerguidkey]
    for thisScore in uniqueScores
]
classifyAndPlot(sampledForm.index, Y, classNames, classes, rainbow = True, title = 'score')

In [None]:
# One class per distinct age found among the answerTemporalities[0] answers,
# in ascending order; the remaining answers fall into the default class.
question = QAge
pretests = sampledForm[sampledForm[QTemporality] == answerTemporalities[0]]

sortedAnswers = np.sort(pretests[question].unique())
classNames = list(sortedAnswers)
classes = [
    pretests[pretests[question] == answer][localplayerguidkey]
    for answer in sortedAnswers
]
classifyAndPlot(sampledForm.index, Y, classNames, classes, rainbow = True, title = 'age')

In [None]:
#np.sort(pretests[question].apply(int).unique())
#T2 = [list(map(int, x)) for x in T1]

In [None]:
# First five column names of the sampled form.
sampledForm.columns[:5]

In [None]:
# Number of columns in the sampled form, as a 1-tuple.
sampledForm.columns.shape

In [None]:
#questionRange = range(0,45)
#for questionIndex in questionRange:
#    question = sampledForm.columns[questionIndex]
#    print(str(questionIndex) + " " + question)

In [None]:
# Frequency of each answer to the question currently held in `question`
# (its value depends on which cell assigned it last).
sampledForm[question].value_counts()

In [None]:
# Build one class per distinct answer to the column at position questionIndex.
questionIndex = 5
question = sampledForm.columns[questionIndex]
print(str(questionIndex) + " : " + question)

questionAnswers = sampledForm[question].value_counts().index
classNames = [str(answer) for answer in questionAnswers]
classes = [
    sampledForm[sampledForm[question] == answer][localplayerguidkey]
    for answer in questionAnswers
]
classNames

In [None]:
# Print the size of every class built above.
for className, classMembers in zip(classNames, classes):
    print(f"{className} : {len(classMembers)}")

In [None]:
# Plot the projection coloured by the answers to the selected question.
classifyAndPlot(
    sampledForm.index,
    Y,
    classNames,
    classes,
    title = question,
    rainbow = False,
)

In [None]:
#%matplotlib nbagg
# questions to avoid:
#1 Timestamp
#7 Age
#43 Remarks
#44 UserId

from itertools import chain
# 1.52
#questionRange = chain(range(1,3), range(4,40), range(42,44))
# 1.52.2
#questionRange = chain(range(1,6), range(7,42), range(44,45))
# One plot per question in questionRange, each coloured by the answers given.
questionRange = range(1,6)
for questionIndex in questionRange:
    question = sampledForm.columns[questionIndex]
    questionAnswers = sampledForm[question].value_counts().index
    classNames = [str(answer) for answer in questionAnswers]
    classes = [
        sampledForm[sampledForm[question] == answer][localplayerguidkey]
        for answer in questionAnswers
    ]
    classifyAndPlot(sampledForm.index, Y, classNames, classes, title = question, rainbow = False)

In [None]:
# Count the labels held in the notebook-level `y`
# (presumably the 'class' column values extracted earlier; verify cell order).
pd.Series(y).value_counts()

In [None]:
# Frequency of each answer to the QPlayed question.
sampledForm[QPlayed].value_counts()

In [None]:
# Eigenvalues from the most recent decomposition, in the order NumPy returned them.
eig_vals

In [None]:
# First principal component: the eigenvector paired with the largest
# eigenvalue. eig_vecs[0] would be a ROW of the eigenvector matrix, whereas
# np.linalg.eig stores eigenvectors as columns and in no particular order.
eig_pairs[0][1]

In [None]:
# Question weighing the most in the first principal component.
# The component is the eigenvector paired with the largest eigenvalue
# (eig_pairs[0][1]); eig_vecs[0] is a row of the eigenvector matrix, not an
# eigenvector.
firstComponent = eig_pairs[0][1]
maxComponentIndex = np.argmax(np.abs(firstComponent))
binarized.columns[maxComponentIndex]

In [None]:
# Squared norm of the first principal component (expected to be ~1), printed
# explicitly because only the last expression of a cell is displayed.
# The component is eig_pairs[0][1]; eig_vecs[0] is a row, not an eigenvector.
firstComponent = eig_pairs[0][1]
print(np.sum(np.abs(firstComponent) ** 2))
firstComponent

In [None]:
# Questions ranked by the absolute weight they carry in the first principal
# component. The component is the eigenvector paired with the largest
# eigenvalue (eig_pairs[0][1]); eig_vecs[0] is a row of the eigenvector
# matrix, not an eigenvector.
componentWeights = np.abs(eig_pairs[0][1])
# argsort on the negated weights gives descending order in one pass and, unlike
# a value lookup with np.where, never repeats an index when weights are tied.
sortedIndices = np.argsort(-componentWeights, kind='stable')
descendingWeights = componentWeights[sortedIndices]
sortedQuestions0 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions0

In [None]:
def accessFirst(a):
    """Return the element at index 0 of `a`."""
    return a[0]
# How many distinct colours does each qualitative colormap yield when sampled
# at sortedQuestionsLastIndex + 1 evenly spaced positions? The first displayed
# value is the number of colours needed; the others follow the order below.
sortedQuestionsLastIndex = 10
array1 = np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.)
import matplotlib.cm as cm

qualitativeColormaps = ('Accent', 'Dark2', 'Paired', 'Pastel1', 'Pastel2', 'Set1', 'Set2', 'Set3')
# Count distinct RGBA rows (axis=0). Comparing only the first (red) channel
# would merge different colours that happen to share the same red value.
(sortedQuestionsLastIndex + 1,) + tuple(
    len(np.unique(getattr(cm, colormapName)(array1), axis=0))
    for colormapName in qualitativeColormaps
)

In [None]:
from matplotlib import cm

def displayQuestionsContributions(
        sortedQuestions,
        title = "Contributions of questions to component",
        sortedQuestionsLastIndex = 10
        ):
    """Draw a pie chart of the weights of the top questions in a component.

    sortedQuestions: DataFrame whose index holds the weights and whose values
        hold the matching question labels, strongest weight first
    title: str, title of the figure
    sortedQuestionsLastIndex: number of questions shown individually; the
        weights of all remaining questions are summed into an 'others' slice

    Side effect: sets plt.rcParams['patch.linewidth'] and
    plt.rcParams['text.color'] globally.
    """
    # cf https://matplotlib.org/users/customizing.html
    # Set before drawing so that the wedges and labels of this very figure pick
    # the values up; set after the pie, they only affected later figures.
    plt.rcParams['patch.linewidth'] = 0
    plt.rcParams['text.color'] = '#2b2b2b'

    # one colour per displayed question, plus one for 'others'
    colors = cm.Set3(np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.))

    topLabels = sortedQuestions.values.flatten()[:sortedQuestionsLastIndex]
    topWeights = sortedQuestions.index[:sortedQuestionsLastIndex]
    remainingWeight = sum(sortedQuestions.index[sortedQuestionsLastIndex:])

    sortedQuestionsLabelsArray = np.append(topLabels, 'others')
    sortedQuestionsValuesArray = np.append(topWeights, remainingWeight)

    fig1, ax1 = plt.subplots()

    ax1.pie(sortedQuestionsValuesArray, labels=sortedQuestionsLabelsArray, autopct='%1.1f%%', startangle=100, colors = colors)
    # equal aspect ratio, so the pie is drawn as a circle
    ax1.axis('equal')

    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
# Pie chart of the ten strongest question weights in sortedQuestions0.
displayQuestionsContributions(
    sortedQuestions0,
    title = 'Contributions of questions to component 1',
    sortedQuestionsLastIndex = 10,
)

In [None]:
# Sum of the squared weights stored in the index of sortedQuestions0
# (should be ~1 if they come from a unit-length vector).
sum(sortedQuestions0.index**2)

In [None]:
# Questions ranked by the absolute weight they carry in the second principal
# component. The component is the eigenvector paired with the second largest
# eigenvalue (eig_pairs[1][1]); eig_vecs[1] is a row of the eigenvector
# matrix, not an eigenvector.
componentWeights = np.abs(eig_pairs[1][1])
# argsort on the negated weights gives descending order in one pass and, unlike
# a value lookup with np.where, never repeats an index when weights are tied.
sortedIndices = np.argsort(-componentWeights, kind='stable')
descendingWeights = componentWeights[sortedIndices]
sortedQuestions1 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions1

In [None]:
# Pie chart of the ten strongest question weights in sortedQuestions1.
displayQuestionsContributions(
    sortedQuestions1,
    title = 'Contributions of questions to component 2',
    sortedQuestionsLastIndex = 10,
)

In [None]:
# Sum of the squared weights stored in the index of sortedQuestions1
# (should be ~1 if they come from a unit-length vector).
sum(sortedQuestions1.index**2)