## This codebook was used to calculate the attention score for each session

Input: aggregated data with metadata of conversations from the system

Output: dataset with attention score

@author: Yucan Xu (chicoxu@connect.hku.hk)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

## Merge the two datasets covering 2020 to 2021

In [None]:
## load the first of the two CSV parts
data1 = pd.read_csv(filepath_or_buffer='datapart1.csv')

In [None]:
## load the second of the two CSV parts
data2 = pd.read_csv(filepath_or_buffer='datapart2.csv')

In [None]:
## stack the two parts row-wise; DataFrame.append was removed in pandas 2.0, and
## pd.concat likewise keeps each part's original row labels
data = pd.concat([data1, data2])

In [None]:
## show how many distinct conversationId values the merged data contains
data['conversationId'].unique().shape

In [None]:
## replace the row labels with a fresh 0..n-1 range so that every row label is unique
data.index = pd.RangeIndex(len(data))

In [None]:
## check outliers: list the rows whose 'datetime' is greater than 'endDateTime'
## NOTE(review): these columns are only passed through pd.to_datetime in a later cell, so if
## read_csv returned them as text this is a string comparison -- confirm the timestamp
## format orders correctly as text
data.loc[data['datetime'].gt(data['endDateTime'])]

## Calculate concurrent group for every message

In [None]:
## order rows by conversation start, then conversation id, then message time, so that the
## per-conversation shift/fill steps below see each conversation's rows in this order
data = data.sort_values(
    by=['startDateTime', 'conversationId', 'datetime'],
    axis=0,
    ascending=True,
)

In [None]:
## the input column 'sum' is referred to as 'concurrent_group' from here on
data = data.rename({'sum': 'concurrent_group'}, axis='columns')

In [None]:
## previous row's concurrent_group within the same conversation
## (NaN on each conversation's first row)
data['concurrent_group_shift'] = (
    data.groupby(by='conversationId')['concurrent_group']
        .shift(periods=1)
)

In [None]:
## back-fill the gap on each conversation's first row from the next row of that conversation;
## a conversation with a single row has nothing to fill from and stays NaN
data['concurrent_group_shift'] = (
    data.groupby('conversationId')['concurrent_group_shift'].bfill()
)

In [None]:
## 'no_inc' where concurrent_group equals the shifted value, 'inc' everywhere else
## (a NaN shifted value never compares equal, so those rows are 'inc' as well)
data['inc'] = np.where(
    data['concurrent_group'].eq(data['concurrent_group_shift']),
    'no_inc',
    'inc',
)

In [None]:
## number the 'inc' rows 1, 2, 3, ... within each conversation and attach that counter to
## every row by index; the counter is unnamed, so it arrives as column 0, and rows that are
## not 'inc' get NaN
data_join = data.join(
    data.loc[data['inc'].eq('inc')]
        .groupby('conversationId')
        .cumcount()
        .add(1)
        .to_frame()
)

In [None]:
## give the unnamed counter column (label 0) its proper name
data_join = data_join.rename(columns={0: 'concurrent_group_pp'})

In [None]:
## carry each counter value forward within its conversation until the next 'inc' row
data_join['concurrent_group_pp'] = (
    data_join.groupby('conversationId')['concurrent_group_pp'].ffill()
)

In [None]:
## rows that come before a conversation's first 'inc' row have no counter yet: label them 0.
## The result is assigned back instead of calling fillna(inplace=True) on the selected
## column, because under pandas copy-on-write that call acts on a temporary object and
## leaves data_join unchanged
data_join['concurrent_group_pp'] = data_join['concurrent_group_pp'].fillna(0)

## Calculate session duration

In [None]:
## parse every column whose name contains 'time' (case-insensitive) as datetimes; among the
## columns this notebook uses, that covers 'datetime', 'startDateTime', 'endDateTime',
## 'ending_time' and 'first_counselor_message_time'
time_columns = [name for name in data_join.columns if 'time' in name.lower()]
for name in time_columns:
    data_join[name] = pd.to_datetime(data_join[name])

In [None]:
## first row of every (conversationId, concurrent_group_pp) pair
sstart = data_join.loc[
    ~data_join.duplicated(subset=['conversationId', 'concurrent_group_pp'], keep='first')
]

In [None]:
## last row of every (conversationId, concurrent_group_pp) pair
send = data_join.loc[
    ~data_join.duplicated(subset=['conversationId', 'concurrent_group_pp'], keep='last')
]

In [None]:
## attach to every row the 'datetime' of the first row of its
## (conversationId, concurrent_group_pp) pair, under the name session_starttime
data_join = data_join.merge(
    sstart[['datetime', 'concurrent_group_pp', 'conversationId']].rename(
        columns={'datetime': 'session_starttime'}),
    on=['concurrent_group_pp', 'conversationId'],
    how='inner',
)

In [None]:
## attach to every row the 'datetime' of the last row of its
## (conversationId, concurrent_group_pp) pair, under the name session_endtime
data_join = data_join.merge(
    send[['datetime', 'concurrent_group_pp', 'conversationId']].rename(
        columns={'datetime': 'session_endtime'}),
    on=['concurrent_group_pp', 'conversationId'],
    how='inner',
)

In [None]:
## time elapsed between the first and the last row of the session
data_join['session_duration'] = data_join['session_endtime'].sub(
    data_join['session_starttime'])

In [None]:
#data_join['session_duration'].min()

In [None]:
#data_join['session_duration'].max()

In [None]:
## keep one row (the first) per (conversationId, concurrent_group_pp) pair, as an independent copy
session_group = data_join.loc[
    ~data_join.duplicated(subset=['conversationId', 'concurrent_group_pp'])
].copy()

In [None]:
## preview the table that keeps one row per (conversationId, concurrent_group_pp) pair
session_group

In [None]:
## span from 'first_counselor_message_time' to 'ending_time', both taken from the input data;
## this column is not read by any later cell shown in this notebook
session_group['chat_time'] = session_group['ending_time'].sub(
    session_group['first_counselor_message_time'])

In [None]:
#session_group['chat_time'].min()

In [None]:
#session_group['chat_time'].max()

In [None]:
## total session_duration for each (conversationId, concurrent_group) combination,
## with the group keys kept as ordinary columns
session_agg = session_group.groupby(
    ['conversationId', 'concurrent_group'], as_index=False)['session_duration'].sum()

In [None]:
## total session_duration per conversation, with conversationId kept as an ordinary column
chat_time = session_agg.groupby('conversationId', as_index=False)['session_duration'].sum()

In [None]:
## the per-conversation total is called chat_duration from here on
chat_time = chat_time.rename({'session_duration': 'chat_duration'}, axis='columns')

In [None]:
## put each conversation's chat_duration next to its per-concurrent_group durations;
## session_agg carries a plain 0..n-1 index, so the left merge keeps the same rows and labels
session_agg2 = session_agg.merge(chat_time, on='conversationId', how='left')

In [None]:
#session_agg2

## Calculate attention score for each session

In [None]:
## reciprocal of concurrent_group, computed element by element; the values are read from
## session_agg and land on the matching rows of session_agg2 through the shared index
session_agg2['concurrent_group_reverse'] = session_agg['concurrent_group'].map(
    lambda group_size: 1 / group_size)

In [None]:
## duration weighted by the reciprocal of concurrent_group
session_agg2['sharing'] = session_agg2['session_duration'].mul(
    session_agg2['concurrent_group_reverse'])

In [None]:
## total weighted duration per conversation, with conversationId kept as an ordinary column
session_agg3 = session_agg2.groupby('conversationId', as_index=False)['sharing'].sum()

In [None]:
## bring each conversation's chat_duration (one row per conversationId) alongside its summed
## 'sharing'; session_agg3 carries a plain 0..n-1 index, so the left merge keeps rows and labels
session_agg4 = session_agg3.merge(
    session_agg2[['conversationId', 'chat_duration']].drop_duplicates(
        subset=['conversationId']),
    on='conversationId',
    how='left',
)

In [None]:
## attention score = weighted duration as a fraction of the conversation's total duration
session_agg4['attention_score'] = session_agg4['sharing'].div(
    session_agg4['chat_duration'])

In [None]:
## round the ratio to the nearest whole number (half-to-even, as the built-in round() did).
## Series.round() lets NaN through, whereas round(nan) raises ValueError; NaN can occur when
## a conversation's chat_duration is zero (0/0). The nullable 'Int64' dtype keeps the output
## integer-valued and marks those rows as <NA>.
## NOTE(review): if concurrent_group is always >= 1 the ratio lies in [0, 1], so whole-number
## rounding leaves only 0 or 1 -- confirm this precision is intended.
session_agg4['attention_score'] = session_agg4['attention_score'].round().astype('Int64')

In [None]:
#session_agg4

In [None]:
## Join the tables and export a CSV with the attention score

In [None]:
## NOTE(review): `max_pp` is not assigned in any cell of this notebook as shown, so this
## cell raises NameError when the notebook is run top to bottom -- restore the cell that
## builds it (presumably a per-conversation table indexed by conversationId; confirm)
session_agg4 = session_agg4.join(max_pp,on='conversationId',how='left')

In [None]:
## one row (the first) per conversation, taken from `data` as an independent copy
data_info = data.loc[~data.duplicated(subset=['conversationId'])].copy()

In [None]:
## add each conversation's startDateTime, looked up by conversationId
session_agg4 = session_agg4.join(
    data_info.set_index('conversationId')['startDateTime'],
    on='conversationId',
    how='left',
)

In [None]:
## write the per-conversation result to disk; the row index is written as the first,
## unnamed column because to_csv defaults to index=True
session_agg4.to_csv(path_or_buf='attention_score.csv')