# Google form analysis

Analysis of questionnaire results exported from Google Forms in CSV format.

## Table of Contents


[Preparation](#preparation)

[Constants](#constants)

[Functions](#functions)
   
   - [general purpose](#genpurpose)
   
   - [sessions and temporalities](#sessions)
   
   - [score](#score)
   
   - [checkpoint validation](#checkvalidation)
   
   - [p(answered question N | answered question P)](#condproba)
   
[Initialization of gform](#gforminit)



## Preparation
<a id=preparation />

In [None]:
%run "../Functions/2. Game sessions.ipynb"

# Constants
<a id=constants />

In [None]:
# Special user ids.
# Each value is a GUID wrapped in double quotes, i.e. the same shape that
# getRedMetricsGUIDFormat produces.
# NOTE(review): the names suggest hand-picked reference users (no answer,
# one/several answers, one/several scores, per form language) - confirm
# against the data before relying on them.
userIDThatDidNotAnswer = '"001c95c6-8207-43dc-a51b-adf0c6e005d7"'

# English form
userID1AnswerEN = '"00dbbdca-d86c-4bc9-803c-0602e0153f68"'
userIDAnswersEN = '"5977184a-1be2-4725-9b48-f2782dc03efb"'
userID1ScoreEN = '"6b5d392d-b737-49ef-99af-e8c445ff6379"'
userIDScoresEN = '"5ecf601d-4eac-433e-8056-3a5b9eda0555"'

# French form
userID1AnswerFR = '"2734a37d-4ba5-454f-bf85-1f7b767138f6"'
userIDAnswersFR = '"01e85778-2903-447b-bbab-dd750564ee2d"'
userID1ScoreFR = '"3d733347-0313-441a-b77c-3e4046042a53"'
userIDScoresFR = '"58d22690-8604-41cf-a5b7-d71fb3b9ad5b"'

# both forms
userIDAnswersENFR = '"a7936587-8b71-43b6-9c61-17b2c2b55de3"'

In [None]:
# Title of the gform column that holds the responder's GUID.
# Alternative column title (French wording), kept for reference:
#localplayerguidkey = 'Ne pas modifier - identifiant anonyme prérempli'
localplayerguidkey = 'Do not edit -  pre-filled anonymous ID'
# Integer position of that column in gform (gform is defined outside this section).
localplayerguidindex = gform.columns.get_loc(localplayerguidkey)
# Bare expression: shown as the notebook cell's output.
localplayerguidindex

In [None]:
# Title of a gform question column; getPercentCorrectPerColumn only evaluates
# columns located at or after this column's position.
firstEvaluationQuestionKey = 'In order to modify the abilities of the bacterium, you have to...'
# Integer position of that column in gform.
firstEvaluationQuestionIndex = gform.columns.get_loc(firstEvaluationQuestionKey)
# Bare expression: shown as the notebook cell's output.
firstEvaluationQuestionIndex

In [None]:
# Column-name prefixes used on per-user tables:
# getAnswers names its columns answersColumnNameStem + <form row label>,
# getCorrections adds correctionsColumnNameStem + <same row label>.
answersColumnNameStem = "answers"
correctionsColumnNameStem = "corrections"

# Functions
<a id=functions />

## general purpose
<a id=genpurpose />

In [None]:
# GForm
# Google form user ids are bare GUIDs, e.g. localplayerguid = '8d352896-a3f1-471c-8439-0f426df901c1'
# GUIDpattern (defined outside this section) is anchored at both ends so that
# the whole string has to match.
GFormGUIDpattern = '^' + GUIDpattern + '$'

def isGFormGUIDFormat( guid ):
    """Tell whether guid is written in the Google form GUID format.

    Returns the re match object (truthy) when guid matches GFormGUIDpattern,
    None otherwise.
    """
    _match = re.search(GFormGUIDpattern, guid)
    return _match

def unsafeGetRedMetricsGUIDFormat( guid ):
    """Wrap guid in double quotes without checking its format first."""
    _quote = '"'
    return _quote + guid + _quote

def getRedMetricsGUIDFormat( guid ):
    """Return guid in the RedMetrics format (GUID wrapped in double quotes).

    A guid already in RedMetrics format is returned unchanged; a guid in
    Google form format gets quoted. For anything else a message is printed
    and None is returned.
    """
    if isRedMetricsGUIDFormat(guid):
        return guid

    if isGFormGUIDFormat(guid):
        return '"' + guid + '"'

    print("incorrect GUID: check 1. Google form analysis' getRedMetricsGUIDFormat")

def unsafeGetGFormGUIDFormat( guid ):
    """Strip every double quote from guid without checking its format first."""
    _unquotedParts = guid.split('"')
    return ''.join(_unquotedParts)
        
def getGFormGUIDFormat( guid ):
    """Return guid in the Google form format (bare GUID, no double quotes).

    A guid in RedMetrics format has its quotes removed; a guid already in
    Google form format is returned unchanged. For anything else a message is
    printed and None is returned.
    """
    if isRedMetricsGUIDFormat(guid):
        return guid.replace('"','')

    if isGFormGUIDFormat(guid):
        return guid

    print("incorrect GUID: check 1. Google form analysis' getGFormGUIDFormat")

In [None]:
def getAllRespondersGFormGUID( _form = gform ):
    """Return the sorted unique values of the GUID column of _form.

    The values are returned as read from the form, i.e. in Google form format.
    """
    return np.unique(_form[localplayerguidkey].values)

def getAllResponders( _form = gform ):
    """Return the list of all responders of _form in RedMetrics format.

    Each unique GUID of the form is wrapped in double quotes.
    """
    return ['"' + _guid + '"' for _guid in getAllRespondersGFormGUID(_form)]

def getRandomGFormGUID():
    """Return a randomly chosen responder GUID that is in Google form format.

    Responder ids that do not match the Google form GUID format are never
    returned.

    Raises:
        ValueError: when no responder has a correctly formatted GUID. The
            previous retry loop never terminated in that situation.
    """
    # Filter first instead of drawing until a valid GUID shows up: the draw
    # stays uniform over valid GUIDs and the function always terminates.
    _validGuids = [
        _candidate for _candidate in getAllRespondersGFormGUID()
        if isinstance(_candidate, str) and isGFormGUIDFormat(_candidate)
    ]
    if not _validGuids:
        raise ValueError("no responder has a valid Google form GUID")

    # NOTE(review): same randint call shape as before; assumes an inclusive
    # upper bound (random.randint) - confirm which randint is in scope.
    return _validGuids[randint(0, len(_validGuids) - 1)]

def hasAnswered( _rmUserId, _form = gform ):
    """Tell whether the user appears in the GUID column of _form.

    Double quotes are stripped from _rmUserId before the lookup, so both
    RedMetrics-format and Google-form-format ids are accepted.
    """
    _gformGUID = _rmUserId.replace('"','')
    _knownGUIDs = _form[localplayerguidkey].values
    return _gformGUID in _knownGUIDs

def getAnswers( _userId, _form = gform ):
    """Return the answers of one user, one column per questionnaire filled.

    The rows of _form belonging to the user are transposed: the result is
    indexed by the form's column titles and has one column per matching form
    row, named answersColumnNameStem + <form row label>.
    When the user has no row in _form, a message is printed and the
    transposed empty selection (no columns) is returned.
    """
    _gformGUID = _userId.replace('"','')
    _userRows = _form[_form[localplayerguidkey] == _gformGUID]
    _columnAnswers = _userRows.T

    if len(_userRows) == 0:
        # user has never answered
        print("user " + str(_userId) + " has never answered")
        return _columnAnswers

    _columnAnswers.columns = [
        answersColumnNameStem + str(_rowLabel)
        for _rowLabel in _columnAnswers.columns
    ]
    return _columnAnswers

## sessions and temporalities
<a id=sessions />

In [None]:
def setAnswerTemporalities( _df ):
    """Fill the 'Temporality' column of _df in place; returns None.

    For each row, the temporality is computed by getTemporality from the
    row's 'Timestamp' and the first game event date of the row's user.

    Nothing is done when _df's 'Temporality' column already holds more than
    one distinct value, which is taken as a sign that temporalities have
    already been set. The check is made on _df itself (it used to read the
    global gform whatever the argument was), and a missing 'Temporality'
    column is treated as "not set yet".
    """
    # check: skip the computation when temporalities were already assigned
    _alreadySet = ('Temporality' in _df.columns) \
        and (_df['Temporality'].nunique(dropna=False) > 1)
    if _alreadySet:
        return

    for _index in _df.index:
        _firstEventDate = getFirstEventDate(
            getRedMetricsGUIDFormat(_df.loc[_index, localplayerguidkey]))
        _df.loc[_index, 'Temporality'] = \
            getTemporality(_df.loc[_index, 'Timestamp'], _firstEventDate)

def getTemporality( answerDate, gameEventDate ):
    """Classify when a questionnaire was answered relative to a game event.

    answerDate is assumed to be a string from gform, expressed in
    'Europe/Berlin' local time.
    gameEventDate is assumed to be a UTC pandas Timestamp from RedMetrics.

    Returns answerTemporalities[0] when the answer predates the event,
    answerTemporalities[1] when it follows it, and answerTemporalities[2]
    when both are equal or when gameEventDate is the maximal UTC timestamp.
    """
    _maximalDate = pd.Timestamp.max.tz_localize('utc')
    if gameEventDate == _maximalDate:
        return answerTemporalities[2]

    # a ' GMT...' suffix would make the Berlin localization be applied twice,
    # so only the part of the string before it is parsed
    _withoutGMT = answerDate.split(' GMT')[0]
    _berlinAnswerDate = pd.Timestamp(_withoutGMT, tz='Europe/Berlin')

    if _berlinAnswerDate < gameEventDate:
        _temporalityIndex = 0
    elif _berlinAnswerDate > gameEventDate:
        _temporalityIndex = 1
    else:
        _temporalityIndex = 2
    return answerTemporalities[_temporalityIndex]

## score
<a id=score />

In [None]:
def getCorrections( _userId, _source = correctAnswers, _form = gform ):
    """Return the user's answers together with their corrections.

    Starts from getAnswers(_userId, _form) and, for every answers column,
    adds a column named correctionsColumnNameStem + <same suffix>.
    For each question (row label), _source.loc[question] gives the accepted
    answers: when that collection is not empty the correction is True if the
    answer, as a string, starts with one of them, False otherwise. Questions
    without accepted answers are left untouched by the inner loop.
    When the user has no answers, a message is printed and the empty table
    from getAnswers is returned unchanged.
    """
    _columnAnswers = getAnswers( _userId, _form = _form )

    if 0 != len(_columnAnswers.columns):

        # number of rows of the transposed table, i.e. of form columns
        _questionsCount = len(_columnAnswers.values)

        for _columnName in _columnAnswers.columns:
            if answersColumnNameStem in _columnName:
                _answerNumber = _columnName.replace(answersColumnNameStem,"")
                newCorrectionsColumnName = correctionsColumnNameStem + _answerNumber

                # NOTE(review): the first assignment is immediately overwritten by the second one.
                # The replacement Series has a default integer index while the table is indexed
                # by question titles - presumably this aligns to an all-NaN column; confirm.
                _columnAnswers[newCorrectionsColumnName] = _columnAnswers[_columnName]
                _columnAnswers[newCorrectionsColumnName] = pd.Series(np.full(_questionsCount, np.nan))

                for question in _columnAnswers[_columnName].index:
                    _correctAnswers = _source.loc[question]
                    
                    if(len(_correctAnswers) > 0):
                        # wrong until one accepted answer is found to be a prefix of the given answer
                        _columnAnswers.loc[question,newCorrectionsColumnName] = False
                        for _correctAnswer in _correctAnswers:
                            if str(_columnAnswers.loc[question,_columnName])\
                            .startswith(str(_correctAnswer)):
                                _columnAnswers.loc[question,newCorrectionsColumnName] = True
                                break
                        

    else:
        # user has never answered
        print("can't give correct answers")
    return _columnAnswers

def getBinarizedCorrections( _corrections ):
    """Replace True by 1.0 and False by 0.0 in every cell of _corrections.

    The table is edited in place and also returned. Cells that compare equal
    to neither True nor False are left as they are.
    _corrections is expected to be a table as produced by getCorrections.
    """
    for _label in _corrections.columns:
        for _row in _corrections[_label].index:
            _cell = _corrections.loc[_row, _label]
            if _cell == True:
                _corrections.loc[_row, _label] = 1.0
            elif _cell == False:
                _corrections.loc[_row, _label] = 0.0
    return _corrections

def getAllBinarized(_source = correctAnswers, _form = gform ):
    """Return the binarized corrections of every responder of _form.

    The result has one row per corrected questionnaire (named after its
    corrections column) and one column per question of _source that has at
    least one accepted answer. Cells hold the values written by
    getBinarizedCorrections (1.0 for correct, 0.0 for incorrect).

    When _form has no responder, an empty table with the same question
    columns is returned (this used to fail with UnboundLocalError).
    """
    # questions that can actually be corrected
    _notEmptyIndexes = [
        _index for _index in _source.index
        if len(_source.loc[_index]) > 0
    ]

    # Collect each user's slice, then concatenate once rather than once per user.
    _slices = []
    for _userId in getAllResponders( _form = _form ):
        _corrections = getCorrections(_userId, _source=_source, _form = _form)
        _binarized = getBinarizedCorrections(_corrections)
        _correctionColumns = _binarized.columns[
            _binarized.columns.to_series().str.contains(correctionsColumnNameStem)
        ]
        _slices.append(_binarized.loc[_notEmptyIndexes][_correctionColumns])

    if not _slices:
        return pd.DataFrame(columns=_notEmptyIndexes)

    return pd.concat(_slices, axis=1).T

# correlation methods accepted by plotCorrelationMatrix; the first one is the default
methods = ['pearson', 'kendall', 'spearman']
def plotCorrelationMatrix( 
    _binarizedMatrix, 
    _method = methods[0], 
    _title='Questions\' Correlations', 
    _abs=False, 
    _clustered=False, 
    _questionNumbers=False,
    _annot = False,
    _figsize = (10,10)
):
    """Plot the correlation matrix of the columns of _binarizedMatrix.

    _binarizedMatrix: table cast to float before the correlation is computed.
    _method: one of `methods`; any other value falls back to methods[0].
    _title: title of the plot, only used when _clustered is False.
    _abs: plot absolute values of the correlations.
    _clustered: draw a seaborn clustermap instead of a heatmap.
    _questionNumbers: add '#<position>' numbering to the labels.
    _annot: annotate cells with getCrossCorrectAnswers(_binarizedMatrix) cast to int.
    _figsize: figure size passed to the plotting call.

    Displays a progress widget while working. Nothing is returned.
    """
    
    # one progress step per stage below
    _progress = FloatProgress(min=0, max=5)
    display(_progress)
    
    # False means "no annotation" for the seaborn calls
    _overlay = False
    
    # optional computation of overlay
    if(_annot):
        _overlay = getCrossCorrectAnswers(_binarizedMatrix).astype(int)
    _progress.value += 1
    
    # computation of correlation matrix
    _m = _method
    if(not (_method in methods)):
        _m = methods[0]
    _correlation = _binarizedMatrix.astype(float).corr(_m)
    _progress.value += 1
    if(_abs):
        _correlation = _correlation.abs()
    _progress.value += 1
    
    # preparation of plot labels
    if(_questionNumbers):
        # column labels get the number as a suffix
        _correlation.columns = pd.Series(_correlation.columns).apply(\
                lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
        if(_clustered):
            # NOTE(review): built from the already-suffixed column labels, so row labels
            # carry the number both as prefix and suffix - confirm this is intended
            _correlation.index = pd.Series(_correlation.columns).apply(\
                lambda x: '#' + str(_correlation.columns.get_loc(x) + 1) + ' ' + x)
        else:
            _correlation.index = _correlation.columns
    _progress.value += 1
    
    # plot
    if(_clustered):
        sns.clustermap(_correlation,cmap=plt.cm.jet,square=True,figsize=_figsize,annot=_overlay, fmt='d')
    else:
        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=_overlay, fmt='d')
    _progress.value += 1

def getCrossCorrectAnswers( _binarizedAnswers ):
    """Return the cross-correct-answers matrix CCA = tA.A.

    With A = _binarizedAnswers (one row per user, one column per question,
    1 for a correct answer and 0 otherwise):
    CCA[i,j] = Sum(A[u,i] * A[u,j], u in users)
    i.e. the number of users who correctly answered both question i and
    question j.
    """
    _questionsByUsers = _binarizedAnswers.T
    return _questionsByUsers.dot(_binarizedAnswers)

# function that returns the score from user id
# row label of the table returned by getScore
scoreLabel = 'score'
def getScore( _userId, _form = gform ):
    """Return the scores of a user, grouped by answer temporality.

    The result is a one-row table (row scoreLabel) with one column per entry
    of answerTemporalities; each cell is a list holding one score per
    questionnaire the user filled with that temporality. A score is the
    number of True values in the questionnaire's corrections column.
    When the user has not answered, a message is printed and all lists are
    left empty.
    """
    _score = pd.DataFrame({}, columns = answerTemporalities)
    # create the row first, then put an independent empty list in each cell
    _score.loc[scoreLabel,:] = np.nan
    for _column in _score.columns:
        _score.loc[scoreLabel, _column] = []

    if hasAnswered( _userId, _form = _form ):
        # corrections are computed against getCorrections' default _source
        _columnAnswers = getCorrections(_userId, _form = _form)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # the temporality is stored in the 'Temporality' row of the matching answers column
                _answerColumnName = _columnName.replace(correctionsColumnNameStem,\
                                                      answersColumnNameStem)
                _temporality = _columnAnswers.loc['Temporality',_answerColumnName]

                _counts = (_columnAnswers[_columnName]).value_counts()
                _thisScore = 0
                # value_counts has no True entry when nothing was answered correctly
                if(True in _counts):
                    _thisScore = _counts[True]
                # the list stored in the cell is mutated in place
                _score.loc[scoreLabel,_temporality].append(_thisScore)
    else:
        print("user " + str(_userId) + " has never answered")

    return _score

## checkpoint validation
<a id=checkvalidation />

In [None]:
# function that returns the list of checkpoints from user id
def getValidatedCheckpoints( userId, _form = gform ):
    """Return the checkpoints validated by a user, one entry per questionnaire.

    For every corrections column of getCorrections(userId), the checkpoints
    listed in checkpointQuestionMatching['checkpoint'] for the correctly
    answered questions are collected, deduplicated, stripped of empty
    strings, sorted and re-indexed from 0.
    The result is a pandas Series whose elements are those per-questionnaire
    Series. When the user has not answered, a message is printed and an empty
    Series is returned.
    """
    _validatedCheckpoints = []
    
    if hasAnswered( userId, _form = _form ):
        _columnAnswers = getCorrections( userId, _form = _form)
        
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:        
                # one slot per row of checkpointQuestionMatching
                _questionnaireValidatedCheckpointsPerQuestion = pd.Series(np.nan, index=range(len(checkpointQuestionMatching)))

                for _index in range(0, len(_questionnaireValidatedCheckpointsPerQuestion)):
                    # NOTE(review): integer lookup on a column indexed by question titles -
                    # presumably positional, which assumes checkpointQuestionMatching rows
                    # line up with the rows of the corrections table; confirm
                    if _columnAnswers[_columnName][_index]==True:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = checkpointQuestionMatching['checkpoint'][_index]
                    else:
                        # '' marks "nothing validated by this question"; filtered out below
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = ''

                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints!='']
                _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints)
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints.sort_values()
                _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))
                
                _validatedCheckpoints.append(_questionnaireValidatedCheckpoints) 
    else:
        print("user " + str(userId) + " has never answered")
    return pd.Series(_validatedCheckpoints)

def getValidatedCheckpointsCounts( _userId, _form = gform ):
    """Return, per questionnaire of the user, the number of validated checkpoints.

    The list has one integer per element returned by getValidatedCheckpoints.
    """
    return [
        len(_checkpoints)
        for _checkpoints in getValidatedCheckpoints(_userId, _form = _form)
    ]

def getNonValidated( checkpoints ):
    """Return the checkpoints missing from each list of validated checkpoints.

    checkpoints is a collection of Series of validated checkpoints, as
    returned by getValidatedCheckpoints. For each of them, the entries of
    validableCheckpoints that it does not contain are returned as a Series
    without empty strings, re-indexed from 0; the result is a Series of those
    Series. When checkpoints is empty, validableCheckpoints itself is returned.
    """
    if len(checkpoints) == 0:
        return validableCheckpoints

    _validationLists = []
    for _validated in checkpoints:
        _missing = np.setdiff1d(validableCheckpoints.values, _validated.values)
        _missing = pd.Series(_missing)
        _missing = _missing[_missing != '']
        _missing.index = range(0, len(_missing))
        _validationLists.append(_missing)
    return pd.Series(_validationLists)

def getNonValidatedCheckpoints( userId, _form = gform ):
    """Return the checkpoints the user did not validate, per questionnaire.

    Applies getNonValidated to the result of getValidatedCheckpoints.
    """
    return getNonValidated(getValidatedCheckpoints( userId, _form = _form ))

def getNonValidatedCheckpointsCounts( userId, _form = gform ):
    """Return the length of each element of getNonValidatedCheckpoints(userId).

    The list has one integer per element of that result.
    """
    return [
        len(_checkpoints)
        for _checkpoints in getNonValidatedCheckpoints(userId, _form = _form)
    ]

## p(answered question N | answered question P)
<a id=condproba />

In [None]:
def getAllAnswerRows(questionIndex, choice, _form = gform ):
    """Return the rows of _form whose answer to a question is among 'choice'.

    questionIndex is the integer position of the question's column in _form;
    choice is the collection of accepted values for that column.
    """
    _questionColumn = _form.iloc[:, questionIndex]
    _isChosen = _questionColumn.isin(choice)
    return _form[_isChosen]

def getPercentCorrectPerColumn(_df):
    """Return, per column of _df, the fraction of rows answered correctly.

    Only columns positioned from firstEvaluationQuestionIndex up to (but not
    including) the last three columns are evaluated; the others stay NaN.
    An answer counts as correct when, as a string, it starts with
    str(correctAnswers[<column position>]).
    The returned Series is indexed by _df's columns, plus a 'Count' entry
    holding the number of rows of _df.
    """
    _rowCount = len(_df)
    _columnCount = len(_df.columns)
    _percents = pd.Series(np.full(_columnCount, np.nan), index=_df.columns)

    for _rowIndex in _df.index:
        for _columnName in _df.columns:
            _position = _df.columns.get_loc(_columnName)
            if not (firstEvaluationQuestionIndex <= _position < _columnCount - 3):
                continue

            _given = str(_df[_columnName][_rowIndex])
            _isCorrect = _given.startswith(str(correctAnswers[_position]))

            _soFar = _percents[_columnName]
            if np.isnan(_soFar):
                # first evaluated row of this column: start the tally
                _percents[_columnName] = 1 if _isCorrect else 0
            elif _isCorrect:
                _percents[_columnName] = _soFar + 1

    _percents = _percents / _rowCount
    _percents['Count'] = _rowCount
    return _percents

def getPercentCorrectKnowingAnswer(questionIndex, choice, _form = gform):
    """Return the per-column fraction of correct answers among selected rows.

    The rows are those of _form whose answer to question number
    questionIndex is among 'choice' (see getAllAnswerRows); the statistics
    are computed by getPercentCorrectPerColumn.
    """
    return getPercentCorrectPerColumn(
        getAllAnswerRows(questionIndex, choice, _form = _form)
    )

# Initialization of gform
<a id=gforminit />

In [None]:
# Fill the 'Temporality' column of gform in place; the call returns nothing.
setAnswerTemporalities(gform)