# Question groups variation analysis

Analysis of how answers vary between the pretest and the posttest.

## Table of Contents

[Preparation](#Preparation)

[Functions](#Functions)

# Preparation

In [None]:
from pySankey import sankey

%run "../Functions/6. Time analysis.ipynb"

# Functions

## Per question analysis

### Interest variation

### Binary analysis

In [None]:
def analyseQuestion(allData, q):
    """Print summary statistics for one question and box-plot its variation.

    The rows of `allData` labelled "<prefix> <q>" are read for the two
    answer temporalities (answerTemporalities[0] and [1]) and for
    deltaPrefix. Mean and standard deviation of each are printed, then the
    deltaPrefix row is shown as a box plot.
    """
    def scoresFor(prefix):
        # rows of allData are labelled "<prefix> <question>"
        return allData.loc[prefix + " " + q, :]

    before = scoresFor(answerTemporalities[0])
    after = scoresFor(answerTemporalities[1])
    change = scoresFor(deltaPrefix)

    changeStats = (change.mean(), change.std())
    print("variation: %0.2f (+/- %0.2f)" % changeStats)

    levelStats = (before.mean(), before.std(), after.mean(), after.std())
    print("from %0.2f (+/- %0.2f) to %0.2f (+/- %0.2f)" % levelStats)

    plt.boxplot(change)
    plt.show()

In [None]:
# questionsCoding contains points attributed to each answer
def compareUsingCustomCorrection(gfdf, questions, questionsCoding):
    """Grade pretest and posttest answers with a custom coding and compute their difference.

    Parameters
    ----------
    gfdf : DataFrame
        Must contain the QTemporality and QUserId columns plus one column
        per entry of `questions`. Rows are split on QTemporality
        (answerTemporalities[0] vs answerTemporalities[1]) and re-indexed by
        QUserId.
    questions : list
        Names of the columns of `gfdf` to grade.
    questionsCoding : list of dict
        Parallel to `questions`; each dictionary maps an answer to the points
        attributed to it.

    Returns
    -------
    DataFrame
        Transposed result: one row per "<deltaPrefix> <question>",
        "<answerTemporalities[0]> <question>" and
        "<answerTemporalities[1]> <question>", one column per user id.
        The delta rows are posttest minus pretest.

    Raises
    ------
    KeyError
        If an answer is not a key of the corresponding grading dictionary.
    ValueError
        If one of the grading dictionaries is empty.

    Side effects: prints the lowest and highest total score reachable with
    `questionsCoding`.
    """
    minPotentialScore = sum(min(grading.values()) for grading in questionsCoding)
    maxPotentialScore = sum(max(grading.values()) for grading in questionsCoding)
    print("%s < score < %s" % (minPotentialScore, maxPotentialScore))

    def _gradedAnswers(temporality):
        # Explicit copies: the frames are modified below, and working on
        # independent copies avoids pandas chained-assignment warnings and
        # guarantees that gfdf itself is left untouched.
        answers = gfdf[gfdf[QTemporality] == temporality].copy()
        answers.index = answers[QUserId]

        # only keep relevant questions
        answers = answers.loc[:, questions].copy()

        # code the answers; `c=c` binds the current dictionary to the lambda
        for (q, c) in zip(questions, questionsCoding):
            answers[q] = answers[q].apply(lambda t, c=c: c[t])
        return answers

    gfdfPretest = _gradedAnswers(answerTemporalities[0])
    gfdfPostest = _gradedAnswers(answerTemporalities[1])

    # The subtraction aligns on user id: a user present in only one of the
    # two temporalities gets NaN.
    gfdfResult = gfdfPostest - gfdfPretest
    gfdfResult.columns = [deltaPrefix + " " + q for q in questions]
    gfdfResult[[answerTemporalities[0] + " " + q for q in questions]] = gfdfPretest
    gfdfResult[[answerTemporalities[1] + " " + q for q in questions]] = gfdfPostest

    return gfdfResult.T

In [None]:
def _buildFigureTitle(q, title, kindSuffix, suffix):
    # Default title quotes the question; a caller-supplied title replaces it
    # (in which case `suffix` is not appended).
    if not title:
        return '"' + q + '"' + kindSuffix + suffix
    return title + kindSuffix


def _histogramBinCount(scores):
    # One bin per unit of score between the observed extremes.
    return int(max(scores) - min(scores) + 1)


def _sankeyFlows(pretestScores, posttestScores):
    # Build the (left labels, right labels, weights) arrays describing every
    # observed pretest score -> posttest score transition.
    from collections import Counter

    pretestInts = [int(score) for score in pretestScores]
    posttestInts = [int(score) for score in posttestScores]

    # Count once instead of re-filtering the whole series for every user.
    pretestCounts = Counter(pretestInts)
    posttestCounts = Counter(posttestInts)
    flowCounts = Counter(zip(pretestInts, posttestInts))

    # Flows are ordered by their zero-padded "pre->post" key (string order).
    orderedFlows = sorted(
        flowCounts,
        key=lambda flow: "{0:0=2d}->{1:0=2d}".format(flow[0], flow[1]),
    )

    # label is "<score> (<number of users with that score>)"
    leftLabels = pd.Series(
        [str(pre) + " (" + str(pretestCounts[pre]) + ")" for pre, _ in orderedFlows],
        dtype=object,
    ).values
    rightLabels = pd.Series(
        [str(post) + " (" + str(posttestCounts[post]) + ")" for _, post in orderedFlows],
        dtype=object,
    ).values
    weights = pd.Series([flowCounts[flow] for flow in orderedFlows], dtype=float).values
    return leftLabels, rightLabels, weights


def plotPretestPosttestDeltaGfdf(allData, questions,
                                 plotGraphs = True, printData = True, saveFiles = False,
                                 title = "", suffix = ""):
    """Print statistics and draw figures comparing pretest and posttest scores.

    Parameters
    ----------
    allData : DataFrame
        Rows labelled "<deltaPrefix> <question>",
        "<answerTemporalities[0]> <question>" and
        "<answerTemporalities[1]> <question>"; one column per respondent
        (the number of columns is printed as the sample size).
    questions : iterable of str
        Questions to report on, one after the other.
    plotGraphs : bool
        When true, draws for each question a histogram of the score
        variation, overlaid pretest/posttest histograms and a Sankey diagram
        of the pretest -> posttest score transitions.
    printData : bool
        When true, prints for each question the mean and standard deviation
        of the three score rows and the result of ttest_ind.
    saveFiles : bool
        When true, the histograms are saved under their title with double
        quotes removed, and the same name is passed to sankey.sankey as
        `figureName`.
    title : str
        When non-empty, replaces the quoted question in every figure title;
        `suffix` is then ignored.
    suffix : str
        Appended to the automatically generated titles.

    Scores containing NaN make the bin computation and the integer
    conversions raise ValueError.
    """
    variationSuffix = ' - variation'
    pretestPosttestSuffix = ' - pretest posttest'
    sankeySuffix = ' - Sankey'

    # sample size
    print("n = " + str(len(allData.columns)))
    print()
    print()
    for q in questions:
        deltaScores    = allData.loc[deltaPrefix + " " + q, :]
        pretestScores  = allData.loc[answerTemporalities[0] + " " + q, :]
        posttestScores = allData.loc[answerTemporalities[1] + " " + q, :]

        if printData:
            print(q)
            print("variation: %0.2f (+/- %0.2f)" % (deltaScores.mean(), deltaScores.std()))
            print("from %0.2f (+/- %0.2f) to %0.2f (+/- %0.2f)" % \
                  (pretestScores.mean(),  pretestScores.std(),\
                   posttestScores.mean(), posttestScores.std(),))
            # NOTE(review): both samples come from the same respondents, so a
            # paired test may be more appropriate than an independent one -
            # confirm before relying on this value.
            print(ttest_ind(pretestScores, posttestScores))

        if plotGraphs:
            # --- histogram of the score variation ---
            fig = plt.figure()
            plt.subplot(111)
            plt.hist(deltaScores, bins=_histogramBinCount(deltaScores), figure = fig)
            _title = _buildFigureTitle(q, title, variationSuffix, suffix)
            plt.title(_title)
            plt.xlabel("score variation")
            plt.ylabel("count")
            # Save before showing: some backends release the figure on show().
            if saveFiles:
                fig.savefig(_title.replace('"', ""))
            plt.show()

            # --- overlaid pretest / posttest histograms ---
            fig = plt.figure()
            plt.subplot(111)
            plt.hist(pretestScores, bins=_histogramBinCount(pretestScores),
                     label='pretest', alpha=0.5, figure = fig)
            plt.hist(posttestScores, bins=_histogramBinCount(posttestScores),
                     label='posttest', alpha=0.5, figure = fig)
            plt.legend()
            _title = _buildFigureTitle(q, title, pretestPosttestSuffix, suffix)
            plt.title(_title)
            plt.xlabel("score")
            plt.ylabel("count")
            if saveFiles:
                fig.savefig(_title.replace('"', ""))
            plt.show()

            # --- Sankey diagram of pretest -> posttest transitions ---
            leftLabels, rightLabels, weight = _sankeyFlows(pretestScores, posttestScores)
            _title = _buildFigureTitle(q, title, sankeySuffix, suffix)
            if saveFiles:
                filename = _title.replace('"', "")
            else:
                filename = None
            # NOTE(review): the file is presumably written inside sankey.sankey,
            # i.e. before the title below is set - confirm whether the saved
            # image is expected to carry the title.
            sankey.sankey(
                left=leftLabels, right=rightLabels,
                rightWeight=weight, leftWeight=weight, aspect=20, fontsize=20,
                figureName=filename
            )
            plt.title(_title)

        if printData:
            print()
            print()
            print()