## This codebook serves multiple purposes

- Join each counselor's role and login time to every session, and calculate the counselor's cumulative hours on duty at the time that session was handled
- Exclude sessions with a missing login time, and on-duty outliers
- Regression analysis of downstream outcomes on attention score, counselor's role, counselor's cumulative hours on duty, and users' visiting hours.

@author: Yucan Xu (chicoxu@connect.hku.hk)

In [None]:
import pandas as pd
import numpy as np
import glob,os,re,string
import pytz
import datetime
import pymongo
import re
from datetime import timedelta
from pingouin import ttest
import researchpy as rp
from scipy.stats import pearsonr
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Read and join counselors' info and login time from database

In [None]:
# NOTE: this connection cell is commented out and MongoClient() is shown
# without arguments (credentials are not part of this file). The next cell
# reads from `db`, so a connection must be opened and `db` defined before
# running it.
# NOTE(review): if this block is re-enabled as written, `client` is unbound in
# the `except` branch when MongoClient(...) itself raises, and the bare
# `except:` hides the real error — confirm before use.
# try:
#     # password authentication to login to MongoDB
#     client = pymongo.MongoClient(
#     )
#     # access Openup Database and authenticate
#     db = client['OpenupDB']
#     print(f'Collections: {db.list_collection_names()}')

# except:
#     print('Cannot access OpenUp database.')
#     if client:
#         client.close()

In [None]:
log_records = pd.DataFrame(list(db['OauthToken'].find()))

In [None]:
log_records

In [None]:
d=timedelta(hours=8)

In [None]:
## check na

In [None]:
log_records[log_records['loginDateTime'].isna()]

In [None]:
log_records_drop = log_records[(log_records['loginDateTime'].notna()) & 
                               (log_records['logoutDateTime'].notna())].copy()

In [None]:
log_records_drop['shift'] = log_records_drop['logoutDateTime'] - log_records_drop['loginDateTime']

In [None]:
log_records_drop2 = log_records_drop[log_records_drop['shift']<timedelta(hours=24)].copy()
log_records_drop2.shape

In [None]:
log_records_drop2['loginDateTime'] = log_records_drop2['loginDateTime'] + d

In [None]:
pred = pd.read_csv('premature_depature_detected_20-21.csv')

In [None]:
login_times = []
counselors = []
for i in pred.index:
    counselor_id = pred.loc[i]['counselorId1']
    start_datetime = pred.loc[i]['startDateTime']
    counselor_records = log_records_drop2[log_records_drop2['systemId'] == counselor_id].copy()
    login_time = counselor_records[counselor_records['loginDateTime']<=pred.loc[i]['startDateTime']].drop_duplicates(
        subset=['systemId'],keep='last')['loginDateTime']
    login_times.append(login_time)
    counselor_role = counselor_records[counselor_records['loginDateTime']<=pred.loc[i]['startDateTime']].drop_duplicates(
        subset=['systemId'],keep='last')['type']
    counselors.append(counselor_role)

In [None]:
# Attach the per-session lookup results to the session table. Each element is
# a pandas Series holding zero or one value (the matched login time).
pred['login_time'] = login_times

In [None]:
# Same for the counselor type taken from the matched login record.
pred['counselor_role'] = counselors

In [None]:
# Replace each Series by its underlying array so its length can be checked.
pred['login_time'] = pred['login_time'].apply(lambda x:x.values)

In [None]:
pred['counselor_role'] = pred['counselor_role'].apply(lambda x:x.values)

In [None]:
# Peek: number of matched logins for the session with index label 0.
len(pred['login_time'][0])

## Exclude sessions with login time error

In [None]:
pd_drop = pred[pred['login_time'].map(len)==1].copy()

In [None]:
pd_drop['login_time'] = pd_drop['login_time'].apply(lambda x:x[0])

In [None]:
pd_drop.columns

In [None]:
pd_drop.login_time = pd.to_datetime(pd_drop.login_time)

In [None]:
pd_drop.startDateTime = pd.to_datetime(pd_drop.startDateTime)

In [None]:
pd_drop['counselor_onduty_hours'] = pd_drop['startDateTime'] - pd_drop['login_time']

In [None]:
pd_drop['startDateTime'][0]

## Exclude outliers whose counselor_onduty_hours >= 1 day

In [None]:
pd_drop['counselor_onduty_hours'].min()

In [None]:
concurrent = pd.read_csv('concurrent_master_join.csv')

In [None]:
concurrent_join = concurrent.join(
    pd_drop[['conversationId','counselor_onduty_hours']].set_index(
    'conversationId'),on='conversationId',how='left')

In [None]:
concurrent_drop = concurrent_join[concurrent_join['counselor_onduty_hours']<=timedelta(hours=24)].copy()

In [None]:
concurrent_drop['hours_onduty'] = concurrent_drop['counselor_onduty_hours'].dt.seconds/3600

In [None]:
concurrent_drop['hours_onduty'].max()

In [None]:
concurrent_drop.columns

In [None]:
concurrent_drop = concurrent_drop.rename(columns={
    '1. 比起對話前，你現在的感覺如何?.1':'Q1',
    '3. 你認為本對話服務對你有多大幫助？[1至5選一]':'Q3',
    '7. 如果你的朋友有需要，你會向他推薦我們的服務嗎？':'Q7'
})

In [None]:
concurrent_drop

In [None]:
concurrent_drop2 = concurrent_drop.join(pd.get_dummies(concurrent_drop[['role','visting_period']]))

In [None]:
concurrent_drop2.columns

In [None]:
concurrent_drop2 = concurrent_drop2.drop(columns=['role_Volunteer','visting_period_1-8'])

## Regression on Q1

In [None]:
Q1 = concurrent_drop2[concurrent_drop2['Q1'].notna()].copy()

In [None]:
Q1.shape

In [None]:
x = sm.add_constant(Q1[['attention_score','hours_onduty',
                                   'role_Paid_staff','visting_period_17-24','visting_period_9-16']], prepend=False)

In [None]:
y = Q1['Q1']

In [None]:
mod = sm.OLS(y, x)

In [None]:
res = mod.fit()

In [None]:
print(res.summary2())

## Regression on Q3

In [None]:
Q3 = concurrent_drop2[concurrent_drop2['Q3'].notna()].copy()

In [None]:
Q3.shape

In [None]:
x = sm.add_constant(Q3[['attention_score','hours_onduty',
                                   'role_Paid_staff','visting_period_17-24','visting_period_9-16']], prepend=False)

In [None]:
y = Q3['Q3']

In [None]:
mod = sm.OLS(y, x)

In [None]:
res = mod.fit()

In [None]:
print(res.summary2())

## Regression on Q7

In [None]:
Q7 = concurrent_drop2[concurrent_drop2['Q7'].notna()].copy()

In [None]:
Q7.shape

In [None]:
x = sm.add_constant(Q7[['attention_score','hours_onduty',
                                   'role_Paid_staff','visting_period_17-24','visting_period_9-16']], prepend=False)

In [None]:
logistic_model = sm.GLM(Q7[['Q7']], x,
                        family=sm.families.Binomial())

In [None]:
results = logistic_model.fit()

In [None]:
print(results.summary())

In [None]:
params = results.params
conf = results.conf_int()
conf['Odds Ratio'] = params
conf.columns = ['5%', '95%', 'Odds Ratio']
np.exp(conf)

## Regression on Premature departure

In [None]:
concurrent_drop2.columns

In [None]:
concurrent_drop2.shape

In [None]:
x = sm.add_constant(concurrent_drop2[['attention_score','hours_onduty',
                                   'role_Paid_staff','visting_period_17-24','visting_period_9-16']], prepend=False)

In [None]:
logistic_model = sm.GLM(concurrent_drop2[['premature_departure_predicted']], x,
                        family=sm.families.Binomial())

In [None]:
results = logistic_model.fit()

In [None]:
print(results.summary())

In [None]:
params = results.params
conf = results.conf_int()
conf['Odds Ratio'] = params
conf.columns = ['5%', '95%', 'Odds Ratio']
np.exp(conf)