# Preparation

Imports libraries

Loads RedMetrics data

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import json

from random import randint
from ipywidgets import FloatProgress,IntProgress,IntText
from IPython.display import display

In [None]:
pd.__version__

In [None]:
np.__version__

In [None]:
# A dataframe has to be (re)built only when it is not already present in the
# kernel's global namespace.
processRMDF152 = 'rmdf152' not in globals()
processRMDFTest = 'rmrdftest' not in globals()

### common variables

In [None]:
# Stem (an export date) used to build the names of the data files.
dataFilesNamesStem = '2018-03-23'
# Folder containing the data files, relative to this notebook.
dataFolderPath = '../../data/'
# Encoding used when writing csv files.
csvEncoding = 'utf-8'

# Stem of the checkpoint names.
checkpointsStem = 'tutorial1.Checkpoint'

# Version 1.52

In [None]:
if processRMDF152 or processRMDFTest:
    def dateparse(x):
        """Convert date string(s) to UTC-aware pandas timestamps.

        RedMetrics timestamps are always UTC according to its documentation:
        https://github.com/CyberCRI/RedMetrics/blob/master/API.md
        """
        return pd.to_datetime(x, utc=True)

if processRMDF152:
    # Locations of the cached artefacts: the pre-processed 1.52 dataframe and
    # the list of test users that was computed alongside it.
    processedRMDF152Path = dataFolderPath + dataFilesNamesStem + ".1.52-processed.csv"
    testUsers152Path = dataFolderPath + dataFilesNamesStem + ".1.52-testUsers.csv"

    # Optimistic default: the flag is switched back on only if a cache file is missing.
    processRMDF152 = False
    try:
        rmdf152 = pd.read_csv(
            processedRMDF152Path,
            dtype=str,
            parse_dates=['serverTime', 'userTime'],
            date_parser=dateparse,
        )
        testUsers = pd.read_csv(testUsers152Path, dtype=str).loc[:, ['userId']]
    except FileNotFoundError:
        # No usable cache: later cells load, process and save the data.
        print("rmdf152 read_csv failed: now loading, processing, saving rmdf152")
        processRMDF152 = True
    else:
        print("rmdf152 read_csv success")

### columns

In [None]:
if processRMDF152 or processRMDFTest:
    # In RedMetrics data, 'playerId' is actually a session ID.
    # Permanent player IDs are stored as 'localplayerguid' in 'customdata' attached to 'start' events.

    # Column that must exist before a raw frame can be normalized.
    minimalInitializationColumns = [
        'customData.localplayerguid',
    ]
    initializationColumns = [
        'customData.localplayerguid',
        'playerId',
        'type',
        'serverTime',
        'customData.platform',
    ]
    relevantColumns = [
        'sessionId',
        'serverTime',
        'userId',
        'customData.platform',
    ]
    playerFilteringColumns = [
        'sessionId',
        'userId',
        'customData.platform',
        'serverTime',
    ]
    checkpointsRelevantColumns = [
        'sessionId',
        'userId',
        'type',
        'section',
    ]

### dataframes for the different game versions

In [None]:
if processRMDF152 or processRMDFTest:
    def getNormalizedRedMetricsCSV( df ):
        """Return a normalized copy of a raw RedMetrics dataframe.

        The result holds the union of df's columns and
        minimalInitializationColumns, sorted alphabetically (np.unique sorts
        its output); columns absent from df are created and filled with NaN.
        The frame is then renamed by safeGetNormalizedRedMetricsCSV.

        df: raw RedMetrics dataframe, as read from a csv export.
        """
        newColumns = np.unique(np.concatenate((minimalInitializationColumns, df.columns.values)))
        # reindex() creates the missing columns; selecting them with
        # df.loc[:, newColumns] raises KeyError on missing labels since pandas 1.0.
        return safeGetNormalizedRedMetricsCSV(df.reindex(columns=newColumns))

    def safeGetNormalizedRedMetricsCSV( df ):
        """Return a copy of df with RedMetrics names replaced by notebook names.

        'customData.localplayerguid' becomes 'userId' and 'playerId' becomes
        'sessionId'; index labels are converted to strings. Columns that are
        not present are simply ignored by the renaming.
        """
        columnRenaming = {
            'customData.localplayerguid': 'userId',
            'playerId': 'sessionId',
        }
        renamed = df.rename(index=str, columns=columnRenaming)
        return renamed

    # Columns of interest, first under their raw RedMetrics names, then under
    # the names they carry once the frame has been normalized.
    relevantColumns = [
        'customData.localplayerguid',
        'playerId',
        'type',
    ]
    renamedRelevantColumns = [
        'userId',
        'sessionId',
        'type',
    ]

# Old versions

rdf100 = pd.read_csv("../../data/1.0.csv")
rdf131 = pd.read_csv("../../data/1.31.csv")
rdf132 = pd.read_csv("../../data/1.32.csv")
rdf133 = pd.read_csv("../../data/1.33.csv")
rdf140 = pd.read_csv("../../data/1.40.csv")
rdf150 = pd.read_csv("../../data/1.50.csv")
rdf151 = pd.read_csv("../../data/1.51.csv")

part100 = rdf100.loc[:,relevantColumns]
part131 = rdf131.loc[:,relevantColumns]
part132 = rdf132.loc[:,relevantColumns]
part133 = rdf133.loc[:,relevantColumns]
part140 = rdf140.loc[:,relevantColumns]
part150 = rdf150.loc[:,relevantColumns]
part151 = rdf151.loc[:,relevantColumns]

# Tests

rdftest = pd.read_csv("../../data/2017-10-11.test.csv")
dftest = getNormalizedRedMetricsCSV(rdftest)

## Filtering

In [None]:
if processRMDF152:
    
    def getAllSessions( _rmDF, dropna ):
        """Return the user/session association carried by 'start' events.

        _rmDF: normalized RedMetrics dataframe; must contain the columns
            listed in renamedRelevantColumns.
        dropna: when truthy, rows with any missing value are removed.

        Returns a dataframe made of the renamedRelevantColumns columns minus
        'type', restricted to the rows whose 'type' is 'start'.
        """
        _result = _rmDF.loc[:, renamedRelevantColumns]
        _result = _result[_result['type']=='start']
        # The axis must be passed by keyword: drop('type', 1) is rejected since pandas 2.0.
        _result = _result.drop('type', axis=1)
        if dropna:
            _result = _result.dropna(how='any')
        return _result

    def getTestSessions(_rmDF, _rmTestDF, includeAndroid = True, includeEditor = True, includeTest = True):
        """Return the set of session ids directly identified as test sessions.

        A session qualifies when:
          - its platform is '"android"' (if includeAndroid);
          - its platform ends with 'editor"' (if includeEditor);
          - it appears in _rmTestDF, the RedMetrics test channel (if includeTest).

        _rmDF needs 'customData.platform' and 'sessionId' columns (only read
        when a platform filter is enabled); _rmTestDF needs 'sessionId'.
        """
        def _isEditorPlatform(platform):
            # str() so that missing (non-string) platforms are handled too
            return str(platform).endswith('editor"')

        platformSessions = set()
        if includeAndroid:
            androidRows = _rmDF[_rmDF['customData.platform'].isin(['"android"'])]
            platformSessions.update(androidRows['sessionId'])
        if includeEditor:
            editorRows = _rmDF[_rmDF['customData.platform'].apply(_isEditorPlatform)]
            platformSessions.update(editorRows['sessionId'])

        channelSessions = set(_rmTestDF['sessionId']) if includeTest else set()

        return platformSessions | channelSessions

    # gets sessions which either:
    #  - have 'android' or '...editor' as platform
    #  - are in the RedMetrics test channel
    #  - belong to a user who has a session of the type above
    def getTestUsersSessions(_rmDF, _rmTestDF, includeAndroid = True, includeEditor = True, includeTest = True):
        """Return (testUsers, allTestSessions) as two sets.

        _rmDF: normalized dataframe of the game version being filtered.
        _rmTestDF: normalized dataframe of the RedMetrics test channel; every
            non-empty userId found in it is a test user, whatever the flags.
        includeAndroid, includeEditor, includeTest: forwarded to getTestSessions.

        testUsers: ids of the users owning at least one test session. The
            empty string, which stands for "no user id", is never included.
        allTestSessions: the sessions returned by getTestSessions, plus every
            session of _rmDF belonging to one of the test users.
        """
        # table of association of user-sessions
        rmDFUsersSessions = getAllSessions(_rmDF, False)

        testSessions = getTestSessions(_rmDF, _rmTestDF,
                                       includeAndroid = includeAndroid, includeEditor = includeEditor, includeTest = includeTest)

        # all the users
        rmDFTestUsers = set(rmDFUsersSessions[rmDFUsersSessions['sessionId'].isin(testSessions)]['userId'].dropna())
        rmTestDFTestUsers = set(_rmTestDF['userId'].dropna())
        testUsers = rmDFTestUsers | rmTestDFTestUsers
        # '' means "no user id": keeping it would turn every anonymous session
        # into a test session. discard() does not raise when '' is absent.
        testUsers.discard('')

        # all the sessions which belong to these users
        allTestSessions = set(rmDFUsersSessions[rmDFUsersSessions['userId'].isin(testUsers)]['sessionId'].dropna())
        # ...plus the sessions detected directly, which may have no user id at all
        allTestSessions |= {sessionId for sessionId in testSessions if pd.notnull(sessionId)}

        return (testUsers,allTestSessions)

TODO: get rid of warning
    
    DtypeWarning: Columns (18,22,28,32,38) have mixed types. Specify dtype option on import or set low_memory=False.
    interactivity=interactivity, compiler=compiler, result=result)

using https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options

## Loading
### Data format fixes

In [None]:
if processRMDF152 or processRMDFTest:
    
    def userIdConverter(uId):
        """Clean a raw 'customData.localplayerguid' value.

        uId: raw cell value; it is converted with str() first.

        Returns '' when the value reads exactly 'nan' or 'null' (missing id),
        otherwise the string with every double quote removed.
        """
        sUID = str(uId)
        # Exact comparison: a prefix test on 'n' would also blank out
        # legitimate ids that merely start with that letter.
        if sUID in ('nan', 'null'):
            return ''
        return sUID.replace('"','')

    # Columns selected, in this order, from the raw RedMetrics csv exports.
    loadColumnNames = [
        # event description
        'id', 'serverTime', 'userTime',
        'playerId', 'playerCustomData',
        'type', 'coordinates', 'section',
        # custom data
        'customData.biobrick', 'customData.devices',
        'customData.slot', 'customData.sound',
        'customData', 'customData.duration',
        'customData.nanobot', 'customData.language',
        'customData.controls', 'customData.chapter',
        'customData.life', 'customData.source',
        'customData.platform', 'customData.localplayerguid',
        'customData.sametab', 'customData.device',
        'customData.energy', 'customData.option',
        'customData.newtab', 'customData.dnabit',
        'customData.count', 'customData.plasmid',
        'customData.total', 'customData.message',
        'customData.graphics',
    ]

### RMDFTest loading

In [None]:
# necessary variables for RMDFTest loading:
# dataFolderPath
# dataFilesNamesStem
# dateparse
# userIdConverter
# loadColumnNames
# getNormalizedRedMetricsCSV

In [None]:
if processRMDFTest:
    # Raw export of the RedMetrics test channel; user ids are cleaned on the
    # fly by userIdConverter.
    rmrdftest = pd.read_csv(
        dataFolderPath + dataFilesNamesStem + ".test.csv",
        dtype=str,
        parse_dates=['serverTime','userTime'],
        date_parser=dateparse,
        converters={'customData.localplayerguid':userIdConverter},
    )
    # reindex() keeps the columns of interest and creates, filled with NaN,
    # those absent from the export; .loc[:, loadColumnNames] raises KeyError
    # on missing labels since pandas 1.0.
    rmrdftest = rmrdftest.reindex(columns=loadColumnNames)
    normalizedRMDFTest = getNormalizedRedMetricsCSV(rmrdftest)


if(processRMDF152):

    # Raw export of version 1.52; user ids are cleaned on the fly by userIdConverter.
    rmrdf152  = pd.read_csv(
        dataFolderPath + dataFilesNamesStem + ".1.52.csv",
        dtype=str,
        parse_dates=['serverTime','userTime'],
        date_parser=dateparse,
        converters={'customData.localplayerguid':userIdConverter},
    )
    # reindex() keeps the columns of interest and creates, filled with NaN,
    # those absent from the export; .loc[:, loadColumnNames] raises KeyError
    # on missing labels since pandas 1.0.
    rmrdf152 = rmrdf152.reindex(columns=loadColumnNames)
    part152 = rmrdf152.loc[:,relevantColumns]
    normalizedRMDF152  = getNormalizedRedMetricsCSV(rmrdf152)

    # Remove every session identified as belonging to a tester.
    (testUsers, allTestSessions) = getTestUsersSessions(_rmDF = normalizedRMDF152, _rmTestDF = normalizedRMDFTest)
    # .copy(): rmdf152 is modified below, so it must own its data instead of
    # being a slice of normalizedRMDF152 (avoids SettingWithCopyWarning).
    rmdf152 = normalizedRMDF152[~normalizedRMDF152['sessionId'].isin(allTestSessions)].copy()

    testUsers = pd.DataFrame(data=list(testUsers), columns=['userId'])
    testUsers.to_csv(testUsers152Path, encoding=csvEncoding)

    # Rows that carry a user id, with the session they belong to.
    userSessions = rmdf152[rmdf152['userId']!=''].loc[:,['userId','sessionId']]

    # Propagate the user id to every event of the same session.
    # When a session carries several ids, the last one wins, as it did with
    # the row-by-row loop this replaces (which rescanned the whole frame for
    # each row, hence the progress widgets it needed).
    sessionUserIds = userSessions\
        .dropna(subset=['sessionId'])\
        .drop_duplicates(subset='sessionId', keep='last')\
        .set_index('sessionId')['userId']
    propagatedUserIds = rmdf152['sessionId'].map(sessionUserIds)
    # sessions without any known user keep their current value
    rmdf152['userId'] = propagatedUserIds.fillna(rmdf152['userId'])

    # Sanity-check values: user counts and rows found on one side only.
    # NOTE(review): nested in the `if`, this tuple is evaluated but not rendered.
    rmdf152['userId'].nunique(),userSessions['userId'].nunique(),\
    rmdf152[~rmdf152['userId'].isin(userSessions['userId'].unique())],\
    userSessions[~userSessions['userId'].isin(rmdf152['userId'].unique())]

#### Saving to csv

In [None]:
# Cache the freshly processed dataframe so that later runs can read it back
# instead of processing the raw export again.
if processRMDF152:
    rmdf152.to_csv(
        processedRMDF152Path,
        encoding=csvEncoding,
    )

# All versions

rdf = pd.concat([part100, 
                      part131, part132, part133, 
                      part140, 
                      part150, part151, part152])

df = getNormalizedRedMetricsCSV(rdf)