# Hero.Coli Data Analysis Summary

List of noteworthy results from the Hero.Coli data analysis.

## Table of Contents

[Preparation](#preparation)
1. [Google form analysis](#gform)
2. [Game sessions](#sessions)
3. [Per session and per user analysis](#peruser)
4. [User comparison](#usercomp)
5. [Game map](#map)
    1. [List of questions](#qlist)
    2. [English](#enform)
    3. [French](#frform)
    4. [Language selection](#langsel)
6. [Basic operations](#basicops)
7. [Checkpoint / Question matching](#checkquestmatch)

# Preparation
<a id="preparation"></a>

In [None]:
%run "../Functions/1. Google form analysis.ipynb"
%run "../Functions/4. User comparison.ipynb"
%run "../Utilities/Plot.ipynb"

# 1. Google form analysis
<a id="gform"></a>

### 1.1 answers to scientific questions

In [None]:
setAnswerTemporalities(gform)

In [None]:
allSciBinarized = getAllBinarized()

In [None]:
# Correlation matrix of the binarized answers to the scientific questions:
# absolute values (_abs), no clustering, question numbers as labels,
# every cell annotated, on a 20x20 figure.
plotCorrelationMatrix(
    allSciBinarized,
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _title='Correlations on game questions',
)

In [None]:
# Same matrix as above, this time clustered and without cell annotations.
# No _title is passed, so the helper's default title is used.
plotCorrelationMatrix(
    allSciBinarized,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=False,
    _figsize=(20, 20),
)

### 1.2 answers to all questions

In [None]:
setAnswerTemporalities(gform)

In [None]:
allBinarized = getAllBinarized( _source = correctAnswers + demographicAnswers)

In [None]:
# Correlation matrix over all binarized answers (scientific + demographic):
# absolute values (_abs), no clustering, question numbers as labels,
# every cell annotated, on a 20x20 figure.
plotCorrelationMatrix(
    allBinarized,
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _title='Correlation of all answers',
)

In [None]:
# Clustered variant of the all-answers correlation matrix, without cell
# annotations. No _title is passed, so the helper's default title is used.
plotCorrelationMatrix(
    allBinarized,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=False,
    _figsize=(20, 20),
)

### 1.3 answers to all questions, only before having played

In [None]:
setAnswerTemporalities(gform)

In [None]:
# Keep only the form rows whose 'Temporality' is 'before'; the explicit copy
# detaches the subset from gform.
befores = gform[gform['Temporality'] == 'before'].copy()
print(len(befores))

# Binarize the scientific and demographic answers of that subset only.
allBeforesBinarized = getAllBinarized(
    _source=correctAnswers + demographicAnswers,
    _form=befores,
)

In [None]:
# Correlation matrix of all binarized answers given before playing:
# absolute values (_abs), no clustering, question numbers as labels,
# every cell annotated, on a 20x20 figure.
plotCorrelationMatrix(
    allBeforesBinarized,
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _title='Correlation of all answers - before playing',
)

### 1.4 answers to all questions, only after having played

In [None]:
setAnswerTemporalities(gform)

In [None]:
# Keep only the form rows whose 'Temporality' is 'after'; the explicit copy
# detaches the subset from gform.
afters = gform[gform['Temporality'] == 'after'].copy()
print(len(afters))

# Binarize the scientific and demographic answers of that subset only.
allAftersBinarized = getAllBinarized(
    _source=correctAnswers + demographicAnswers,
    _form=afters,
)

In [None]:
# Correlation matrix of all binarized answers given after playing:
# absolute values (_abs), no clustering, question numbers as labels,
# every cell annotated, on a 20x20 figure.
plotCorrelationMatrix(
    allAftersBinarized,
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _title='Correlation of all answers - after playing',
)

# 2. Game sessions
<a id="sessions"></a>

# 3. Per session and per user analysis
<a id="peruser"></a>

# 4. User comparison
<a id="usercomp"></a>

In [None]:
import pandas as pd

In [None]:
scoresBefore = pd.Series()

In [None]:
#getScore(befores.loc[userIndex,localplayerguidkey])['before'][0][0]

In [None]:
# Store one score per 'before' form row, keyed by the stringified row index.
# The player is identified by the row's localplayerguidkey column.
# NOTE(review): assumes getScore(...)['before'][0][0] is a single numeric
# score for that player - confirm against getScore.
for userIndex in befores.index:
    scoresBefore[str(userIndex)] = getScore(befores.loc[userIndex,localplayerguidkey])['before'][0][0]

In [None]:
scoresBefore.mean()

In [None]:
scoresBefore.std()

In [None]:
# Accumulator for the 'after' scores, one entry per form row.
# The dtype is explicit because an empty Series created without one is
# float64 (with a warning) or object depending on the pandas version; the
# scores are averaged later, so they must be stored as floats.
scoresAfter = pd.Series(dtype=float)

# Store one score per 'after' form row, keyed by the stringified row index.
# NOTE(review): assumes getScore(...)['after'][0][0] is a single numeric
# score for that player - confirm against getScore.
for userIndex in afters.index:
    scoresAfter[str(userIndex)] = getScore(afters.loc[userIndex, localplayerguidkey])['after'][0][0]

In [None]:
scoresAfter.mean()

In [None]:
scoresAfter.std()

In [None]:
# score on each question

In [None]:
allSciBinarized.shape

In [None]:
# Column sums of the binarized matrix, computed as ones(rows) . matrix:
# one total per question (presumably the number of correct answers).
rowCount = allSciBinarized.shape[0]
totalPerQuestion = np.dot(np.ones(rowCount), allSciBinarized)

# The same totals as a one-column table labelled by question.
totalPerQuestionDF = pd.DataFrame(data=totalPerQuestion, index=allSciBinarized.columns)

In [None]:
totalPerQuestionDF

In [None]:
percentagePerQuestion = totalPerQuestionDF*100 / allSciBinarized.shape[0]
percentagePerQuestion

In [None]:
# Heatmap of the per-question percentages, truncated to integers so they can
# be annotated with the 'd' format.
_fig, _ax = plt.subplots(figsize=(20, 20))
_ax.set_title('percentage correct per question')
sns.heatmap(
    percentagePerQuestion.astype(int),
    ax=_ax,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    fmt='d',
)

In [None]:
# Cross-correct table scaled to a percentage of the number of rows in
# allSciBinarized.
# NOTE(review): presumably cell (x, y) counts the respondents who were correct
# on both x and y - confirm against getCrossCorrectAnswers.
percentagesCrossCorrect = getCrossCorrectAnswers(allSciBinarized).astype(int)*100 / allSciBinarized.shape[0]
percentagesCrossCorrect

In [None]:
# Heatmap of the cross-correct percentages, truncated to integers so they can
# be annotated with the 'd' format.
_fig, _ax = plt.subplots(figsize=(20, 20))
_ax.set_title('percentage correct')
sns.heatmap(
    percentagesCrossCorrect.astype(int),
    ax=_ax,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    fmt='d',
)

In [None]:
# Cross-correct table divided by the per-question totals rather than by the
# number of rows. totalPerQuestion is a 1-D array, which pandas broadcasts
# across the columns: column j is divided by totalPerQuestion[j].
# NOTE(review): a question whose total is 0 produces NaN/inf in its column.
percentagesConditionalCrossCorrect = getCrossCorrectAnswers(allSciBinarized).astype(int)*100 / totalPerQuestion
percentagesConditionalCrossCorrect

In [None]:
# Heatmap of the conditional percentages.
_fig = plt.figure(figsize=(20,20))
_ax = plt.subplot(111)
_ax.set_title('percentage correct, conditionnally: p(y | x)')

# Missing values must be filled BEFORE the integer cast: astype(int) raises
# on NaN, so a fillna placed after it can never take effect.
_conditionalPercentages = percentagesConditionalCrossCorrect.fillna(0).astype(int)
sns.heatmap(_conditionalPercentages,ax=_ax,cmap=plt.cm.jet,square=True,annot=True,fmt='d')

In [None]:
# small sample
#allData = getAllUserVectorData( getAllUsers( df152 )[:10] )

# complete set
#allData = getAllUserVectorData( getAllUsers( df152 ) )

# subjects who answered the gform
allData = getAllUserVectorData( getAllResponders() )

# 10 subjects who answered the gform
#allData = getAllUserVectorData( getAllResponders()[:10] )

In [None]:
len(allData.index)

In [None]:
allBinarized

In [None]:
%run "../Functions/1. Google form analysis.ipynb"
%run "../Functions/4. User comparison.ipynb"
%run "../Utilities/Plot.ipynb"

In [None]:
plotAllUserVectorDataCorrelationMatrix(allData.T, _abs=True)

# 5. Game map
<a id="map"></a>

# Player filtering

In [None]:
#players = df152.loc[:, playerFilteringColumns]
players = safeGetNormalizedRedMetricsCSV( df152 )
players.head(1)

In [None]:
#players = players.dropna(how='any')
#players.head(1)
#df152.head(1)

In [None]:
players.shape[0]

In [None]:
# Drop every row whose userId is listed in excludedIDs, then show how many
# rows remain.
isExcluded = players['userId'].isin(excludedIDs)
players = players[~isExcluded]
players.shape[0]

## Sessions (filtered)

In [None]:
sessionscount = players["sessionId"].nunique()
sessionscount

## Sessions of dev IDs

## Unique players

In [None]:
# Distinct user ids among the filtered rows, and how many there are.
uniqueplayers = players['userId'].unique()
len(uniqueplayers)

In [None]:
#uniqueplayers

## Unique platforms

In [None]:
uniqueplatforms = players['customData.platform'].unique()
uniqueplatforms

## Checkpoints passed / furthest checkpoint (unfiltered)

In [None]:
# Greatest tutorial section reached in each session:
#   1. keep the checkpoint-relevant columns of the unfiltered events,
#   2. keep 'reach' events and only their section / sessionId columns,
#   3. keep sections whose name starts with 'tutorial' (missing names drop out),
#   4. take the maximum section per session.
checkpoints = (
    df152.loc[:, checkpointsRelevantColumns]
    .loc[lambda events: events['type'] == 'reach', ['section', 'sessionId']]
    .loc[lambda reached: reached['section'].str.startswith('tutorial', na=False)]
    .groupby("sessionId")
    .max()
)
checkpoints.head()

In [None]:
# Count how many sessions ended at each furthest checkpoint.
maxCheckpointTable = pd.DataFrame({"maxCheckpoint" : checkpoints.values.flatten()})
maxCheckpointCounts = maxCheckpointTable["maxCheckpoint"].value_counts()
# Placeholder entry for sessions that reached no checkpoint; its value is
# filled in later, in the "Session starts" section.
# NOTE(review): storing None in a Series of counts changes its dtype.
maxCheckpointCounts['Start'] = None
# Sort by label: 'Start' comes before the 'tutorial...' sections because
# uppercase letters sort before lowercase ones.
maxCheckpointCounts = maxCheckpointCounts.sort_index()
print('\nmaxCheckpointCounts=\n{0}'.format(str(maxCheckpointCounts)))

In [None]:
# Per-checkpoint session counts as a one-column table (labels dropped).
maxCheckpointCountsTable = pd.DataFrame({"maxCheckpoint" : maxCheckpointCounts.values})

# Total number of sessions that reached at least one checkpoint.
# sum(0) returns a Series indexed by column name, so the single value is read
# by position with .iloc; plain [0] relies on the deprecated positional
# fallback for label-indexed Series.
maxCheckpointCountsTableCount = maxCheckpointCountsTable.sum(0).iloc[0]
maxCheckpointCountsTableCount

In [None]:
checkpoints.count()

In [None]:
maxCheckpointCountsTable.head()

In [None]:
maxCheckpointCountsTable.describe()

In [None]:
genericTreatment( maxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, maxCheckpointCountsTableCount, False, True )

## Session starts

In [None]:
#starts = df152.loc[:, checkpointsRelevantColumns]
#starts = checkpoints[checkpoints['type']=='start'].loc[:,['playerId']]
#starts = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
#starts = checkpoints.groupby("playerId")
#starts = checkpoints.max()
#starts.head()

In [None]:
# Every counted session is treated as having started the first tutorial.
startTutorial1Count = sessionscount

# Sessions that started but never reached a checkpoint.
# NOTE(review): sessionscount comes from the filtered `players` table while
# the checkpoint counts come from the unfiltered `df152`; confirm that the two
# populations are meant to be subtracted.
neverReachedGameSessionCount = startTutorial1Count - maxCheckpointCountsTableCount

# Work on a copy: a plain assignment would alias maxCheckpointCounts, so
# writing the 'Start' entry would silently modify the original counts too.
fullMaxCheckpointCounts = maxCheckpointCounts.copy()
fullMaxCheckpointCounts['Start'] = neverReachedGameSessionCount
fullMaxCheckpointCountsTable = pd.DataFrame({"fullMaxCheckpoint" : fullMaxCheckpointCounts.values})

genericTreatment( fullMaxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, startTutorial1Count, False, True )

print('\nfullMaxCheckpointCountsTable=\n{0}'.format(fullMaxCheckpointCountsTable))
fullMaxCheckpointCountsTable.describe()

## Duration

Duration of playing sessions

In [None]:
# First and last server timestamp of every session.
# The aggregations are given by name: with np.min / np.max the generated
# column labels come from the functions' __name__, which is "amin"/"amax" or
# "min"/"max" depending on the installed numpy version.
durations = players.groupby("sessionId").agg({ "serverTime": [ "min", "max" ] })
# Restore the historical "amin"/"amax" labels so code written against them
# keeps working.
durations = durations.rename(columns={ "min": "amin", "max": "amax" }, level=1)

# Session duration = last timestamp - first timestamp, truncated to seconds.
durations["duration"] = pd.to_datetime(durations["serverTime"]["amax"]) - pd.to_datetime(durations["serverTime"]["amin"])
durations["duration"] = durations["duration"].map(lambda x: np.timedelta64(x, 's'))

# Longest sessions first.
durations = durations.sort_values(by=['duration'], ascending=[False])
durations.head()

Duration plot

In [None]:
# Skip the first four rows before plotting; after the descending sort above
# these are the four longest sessions.
# NOTE(review): presumably outliers - confirm. Re-running this cell drops four
# more rows each time.
# The copy makes the subset an independent frame, so the columns added below
# do not trigger chained-assignment warnings.
durations = durations.iloc[4:].copy()

# Full duration in seconds. Timedelta.seconds only holds the part below one
# day, so sessions lasting more than 24 hours would be under-reported;
# total_seconds() includes the days.
durations["duration_seconds"] = pd.to_timedelta(durations["duration"]).dt.total_seconds()
maxDuration = np.max(durations["duration_seconds"])

# Rank 1 = longest remaining session.
durations["duration_rank"] = durations["duration_seconds"].rank(ascending=False)

durations.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
plt.legend('')
plt.xlim(0, sessionscount)
plt.ylim(0, maxDuration)

# Printed explicitly: only the last expression of a cell is displayed.
print(durations["duration_seconds"].describe())
durations.head()