# User comparison

# Table of Contents

1.  [Preparation](#preparation)

2.  [Functions](#functions)

3.  [Tests](#tests)

# Preparation
<a id="preparation"></a>

In [None]:
%run "../Functions/3. Per session and per user analysis.ipynb"

# Functions
<a id="functions"></a>

In [None]:
def getAllUsers( dataframe ):
    """Return the distinct, non-missing values of the 'userId' column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'userId' column.

    Returns
    -------
    list
        Unique user ids in order of first appearance, without missing
        values (NaN / None / NA) and without the textual placeholders
        'nan' and 'null'.
    """
    # Textual stand-ins for a missing id that must be discarded as well.
    placeholders = ('nan', 'null')

    # pd.isnull is used instead of a membership test against np.nan:
    # NaN never compares equal to itself, so `x in [np.nan]` only matches
    # when x is the very same object as np.nan and lets any other NaN
    # (e.g. a value read from a float column) through.
    # The missing-value test comes first so that the placeholder comparison
    # is never evaluated on a missing value.
    return [
        userId
        for userId in dataframe['userId'].unique()
        if not (pd.isnull(userId) or userId in placeholders)
    ]

In [None]:
# _source is used as correction source, if we want to include answers to these questions
def getAllUserVectorData( userIds, _rmDF, _gfDF, _source = correctAnswers):
    """Build the data vector of every user and join them column-wise.

    A progress bar is displayed and advanced once per user.

    Parameters
    ----------
    userIds : sized iterable
        Ids of the users to process.
    _rmDF, _gfDF :
        Forwarded unchanged to getUserDataVector.
    _source :
        Correction source, forwarded unchanged to getUserDataVector.

    Returns
    -------
    An empty list when userIds is empty, the single vector returned by
    getUserDataVector when there is exactly one user, otherwise the
    column-wise (axis=1) concatenation of all vectors.
    """
    f = FloatProgress(min=0, max=len(userIds))
    display(f)

    # Vectors are collected first and concatenated once at the end:
    # calling pd.concat inside the loop re-copies the accumulated frame
    # for every user, which is quadratic in the number of users.
    dataVectors = []
    for userId in userIds:
        f.value += 1
        dataVectors.append(
            getUserDataVector(userId, _rmDF = _rmDF, _gfDF = _gfDF, _source = _source)
        )

    if len(dataVectors) == 0:
        # No user: keep returning an empty list, as callers may rely on it.
        return []
    if len(dataVectors) == 1:
        # A single vector is handed back as-is, without going through concat.
        return dataVectors[0]
    return pd.concat(dataVectors, axis=1)

In [None]:
def getAllUserVectorDataCustom(_rmDF, _gfDF, before, after, gfMode = False, rmMode = True, sessionCount = 1):
    """Return the data vectors of a selected group of survey respondents.

    The group is chosen from _gfDF according to the flags:
      - before and after : getSurveysOfUsersWhoAnsweredBoth (gfMode / rmMode forwarded)
      - before only      : getRMBefores if rmMode else getGFBefores
      - after only       : getRMAfters if rmMode else getGFormAfters
      - neither          : nobody is selected

    Only the users whose 'sessionsCount' entry equals sessionCount are kept.

    Returns
    -------
    The filtered per-user vectors, one column per user (the frame is
    transposed for filtering, then transposed back), or an empty list,
    after printing "no matching user", when the selection is empty.
    """
    if before and after:
        selection = getSurveysOfUsersWhoAnsweredBoth(_gfDF, gfMode = gfMode, rmMode = rmMode)
    elif before:
        selection = getRMBefores(_gfDF) if rmMode else getGFBefores(_gfDF)
    elif after:
        selection = getRMAfters(_gfDF) if rmMode else getGFormAfters(_gfDF)
    else:
        selection = []

    # Guard clause: nothing selected, nothing to compute.
    if len(selection) == 0:
        print("no matching user")
        return []

    selectedIds = selection[localplayerguidkey]
    # Transposed so that 'sessionsCount' can be addressed as a column.
    perUser = getAllUserVectorData(selectedIds, _rmDF = _rmDF, _gfDF = _gfDF).T
    matching = perUser[perUser['sessionsCount'] == sessionCount]
    return matching.T

In [None]:
methods = ['pearson', 'kendall', 'spearman']
def plotAllUserVectorDataCorrelationMatrix(
    _allUserVectorData,
    _method = methods[0],
    _title='RedMetrics Correlations',
    _abs=False,
    _clustered=False,
    _figsize = (20,20),
    columnSubset=None
):
    """Plot the correlation matrix of the columns of _allUserVectorData.

    Parameters
    ----------
    _allUserVectorData : pandas.DataFrame
        Data whose columns are correlated; values are cast to float.
    _method : str
        One of `methods`; any other value silently falls back to 'pearson'.
    _title : str
        Title of the heatmap. It is not used when _clustered is True.
    _abs : bool
        Plot absolute correlations; the color scale then starts at 0
        instead of -1.
    _clustered : bool
        Draw a seaborn clustermap instead of a plain heatmap. Rows and
        columns whose correlations are all NaN are dropped first, as they
        cannot be clustered.
    _figsize : tuple
        Figure size passed to matplotlib / seaborn.
    columnSubset : list-like, optional
        Columns to restrict the analysis to. It is ignored when empty or
        when at least one of its entries is not a column of the data.

    Returns
    -------
    None. The plot is drawn as a side effect, and a 4-step progress bar is
    displayed and advanced along the way.
    """
    _progress = FloatProgress(min=0, max=4)
    display(_progress)

    # computation of subset
    if (
        columnSubset is not None
        and len(columnSubset) > 0
        and pd.Series(columnSubset).isin(_allUserVectorData.columns).all()
    ):
        _allUserVectorData = _allUserVectorData.loc[:,columnSubset]

    # computation of correlation matrix
    _m = _method if _method in methods else methods[0]
    _correlation = _allUserVectorData.astype(float).corr(_m)
    _progress.value += 1
    if _abs:
        _correlation = _correlation.abs()
    _progress.value += 1

    vmin = 0 if _abs else -1
    vmax = 1

    # plot
    if _clustered:
        # removing NaNs
        # can't cluster NaN lines in _correlation
        # copied/pasted from '2. Google form analysis.ipynb' plotCorrelationMatrix
        #
        # `not` is used rather than `~`: on a plain Python bool, `~True`
        # evaluates to -2, which is truthy, so the filter would keep
        # every row.
        _notNaNsIndices = [
            index
            for index in _correlation.index
            if not pd.isnull(_correlation.loc[index,:]).all()
        ]

        _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
        _progress.value += 1
        sns.clustermap(
            _correlation,
            cmap=plt.cm.jet,
            square=True,
            figsize=_figsize,
            vmin=vmin,
            vmax=vmax,
        )
    else:
        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        _progress.value += 1
        sns.heatmap(
            _correlation,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            vmin=vmin,
            vmax=vmax,
        )
    _progress.value += 1