## Interaction Analysis

### Preprocess

In [1]:
import pandas as pd
import numpy as np

In [108]:
# Load every input table used by the analysis.
# skiprows=[1, 2] drops the two lines that directly follow the header line of the survey export.
survey_res = pd.read_csv('data/survey_responses.csv', skiprows=[1, 2]) # survey responses

logs1 = pd.read_csv('data/interactions.csv')     # interaction logs on rating page
logs2 = pd.read_csv('data/vis_interactions.csv') # interaction logs on visualization page

rating_history = pd.read_csv('data/rating_history.csv')   # rating history
ratings = pd.read_csv('data/ratings.csv')                 # ratings

applications = pd.read_csv('data/applications.csv')       # application information


# user ids of those who completed the survey
user_ids = survey_res['user_id'].tolist()

# Keep only the data of users who completed the survey (and, for the rating-page
# logs, only rows with a positive applicant_id). Each result is an explicit copy
# so later cells can add columns without triggering SettingWithCopyWarning.
logs1 = logs1[logs1['user_id'].isin(user_ids) & (logs1['applicant_id'] > 0)].copy()
logs2 = logs2[logs2['user_id'].isin(user_ids)].copy()
rating_history = rating_history[rating_history['user_id'].isin(user_ids)].copy()
ratings = ratings[ratings['user_id'].isin(user_ids)].copy()

# save filtered data
# NOTE(review): these writes replace the very files that were read above, so the
# unfiltered exports are gone after the first run. Keep a backup of the raw files
# (or write to a separate location) if the raw data must stay recoverable.
ratings.to_csv('data/ratings.csv', index=False)
rating_history.to_csv('data/rating_history.csv', index=False)
logs1.to_csv('data/interactions.csv', index=False)
logs2.to_csv('data/vis_interactions.csv', index=False)

# Attach applicant demographics. pd.merge defaults to an inner join, so rows whose
# applicant_id is absent from `applications` are dropped from the merged frames.
applications = applications.rename(columns={'id': 'applicant_id'})
rating_history = pd.merge(rating_history, applications[['applicant_id', 'GENDER', 'RACE']], on='applicant_id')
logs2_merged = pd.merge(logs2, applications[['applicant_id', 'GENDER', 'RACE']], on='applicant_id')

# For each user, the timestamp of their first logged 'tourFinished' event
# (first in file order) on the visualization page.
tour_events = logs2[logs2['interaction_type'] == 'tourFinished']
first_tour_finish = (
    tour_events
    .drop_duplicates(subset='user_id', keep='first')
    .set_index('user_id')['timestamp']
)

# Fail with an explicit message instead of an opaque IndexError when a surveyed
# user never finished the tour; every later cell needs a cutoff for every user.
users_without_tour = [user for user in user_ids if user not in first_tour_finish.index]
if users_without_tour:
    raise ValueError(f"No 'tourFinished' event in data/vis_interactions.csv for users: {users_without_tour}")

tour_finished_time_stamps = {user: first_tour_finish.loc[user] for user in user_ids}

#### Rating Changes

In [109]:
# Per-row tour-finish cutoff for rating_history. The stored tour timestamp is divided
# by 1000 before the comparison (presumably milliseconds vs. seconds — TODO confirm
# against the logging code).
history_cutoff = rating_history['user_id'].map(tour_finished_time_stamps) / 1000

# rating-history rows recorded after / before each user's tour-finish timestamp
ratings_after_tour = rating_history[rating_history['timestamp'] > history_cutoff]
history_before_tour = rating_history[rating_history['timestamp'] < history_cutoff]

# The stored rating becomes rating_after; rating_before starts out equal to it and
# is overwritten below for every (user, applicant) pair that was re-rated after the tour.
ratings = ratings.rename(columns={'rating': 'rating_after'})
ratings['rating_before'] = ratings['rating_after']

# rating change caused by the intervention (0 = unchanged)
ratings['rating_change'] = 0

# Walk through every post-tour rating event and compare it with the last pre-tour rating.
# NOTE(review): "last" means last in rating_history row order; this is the most recent
# rating only if rating_history is stored chronologically — TODO confirm.
post_tour_events = ratings_after_tour[['user_id', 'applicant_id', 'rating']].itertuples(index=False, name=None)
for user_id, applicant_id, new_rating in post_tour_events:
    same_pair_before = (
        (history_before_tour['user_id'] == user_id) &
        (history_before_tour['applicant_id'] == applicant_id)
    )
    earlier_ratings = history_before_tour.loc[same_pair_before, 'rating']
    if earlier_ratings.empty:
        # The applicant was first rated only after the tour, so there is no baseline
        # to compare against; keep the defaults (rating_before = stored rating, change = 0)
        # instead of crashing on an empty selection.
        continue

    target_rows = (ratings['user_id'] == user_id) & (ratings['applicant_id'] == applicant_id)
    if not target_rows.any():
        # no final rating stored for this pair; nothing to update
        continue

    baseline = earlier_ratings.iloc[-1]
    ratings.loc[target_rows, 'rating_before'] = baseline
    ratings.loc[target_rows, 'rating_change'] = new_rating - baseline

# save to csv without the id and add_timestamp columns
ratings = ratings.drop(columns=['id', 'add_timestamp'])
ratings.to_csv('data/ratings_with_change.csv', index=False)

#### Time Spent Changes

In [110]:
# Calculate the focus time before intervention
from itertools import product

# one row per (user, applicant) pair; totals are in minutes
focus_time = pd.DataFrame(list(product(user_ids, applications['applicant_id'].tolist())), columns=['user_id', 'applicant_id'])
# Float defaults: the values written below are fractional minutes, and writing a
# float into an integer column is deprecated in recent pandas versions.
focus_time['total_before'] = 0.0
focus_time['total_after'] = 0.0
focus_time['total_change'] = 0.0

# get the logs before the summary page tour finished timestamp -- these interactions happened before the intervention
tour_cutoff = logs1['user_id'].map(tour_finished_time_stamps)
logs1_before_tour = logs1[logs1['timestamp'] < tour_cutoff]

# event types after which the user is no longer on the individual rating page
away_events = ('hidden', 'leave', 'close')

for user_id, logs in logs1_before_tour.groupby('user_id'):
    intervals = {}   # applicant_id -> list of interval durations in seconds
    durations = []   # all of this user's interval durations in seconds

    # An interval is the gap between two consecutive log rows.
    # NOTE(review): rows are paired in the order they appear in logs1; this presumes
    # the log is chronological per user — TODO confirm.
    events = list(logs[['interaction_type', 'applicant_id', 'timestamp']].itertuples(index=False, name=None))
    for (event_type, applicant_id, started), (next_type, _, ended) in zip(events, events[1:]):
        # Leave individual rating page, ignore the interval
        if event_type in away_events:
            continue
        # The next event is 'visible': normally preceded by a 'hidden' event (handled
        # above); this extra test covers the case where the 'hidden' event was not logged.
        if next_type == 'visible':
            continue

        duration = (ended - started) / 1000.0  # in seconds

        if duration > 0:
            durations.append(duration)
            intervals.setdefault(applicant_id, []).append(duration)

    if not durations:
        # No usable interval for this user: nothing to write, and the quantiles
        # below are undefined for an empty sample.
        continue

    # Outlier fence on the log scale: Q3 + 1.5 * IQR of this user's log-durations.
    # NOTE(review): the after-intervention cell does not apply such a filter — confirm
    # that the asymmetry is intended.
    log_durations = np.log(durations)
    Q1 = np.quantile(log_durations, .25)
    Q3 = np.quantile(log_durations, .75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR

    for applicant_id, applicant_durations in intervals.items():
        if applicant_id == 0:
            continue

        # Only values strictly above the fence are outliers. Using <= keeps values that
        # sit exactly on it, which matters when IQR == 0 (e.g. a single interval): a
        # strict < would then discard all of the user's time.
        total = 0
        for seconds in applicant_durations:
            if np.log(seconds) <= upper_bound:
                total += seconds / 60.0  # minutes

        # Mask-based write: a no-op (instead of an IndexError) when the applicant is
        # not listed in `applications` and therefore has no row in focus_time.
        pair_rows = (focus_time['user_id'] == user_id) & (focus_time['applicant_id'] == applicant_id)
        focus_time.loc[pair_rows, 'total_before'] = total

In [111]:
# Calculate the focus time after intervention

# The totals written below are fractional minutes; make sure the target column is
# float so the writes do not rely on (deprecated) silent upcasting of an int column.
focus_time['total_change'] = focus_time['total_change'].astype(float)

# get the logs after the summary page tour finished timestamp -- these interactions happened after the intervention
tour_cutoff = logs1['user_id'].map(tour_finished_time_stamps)
logs1_after_tour = logs1[logs1['timestamp'] > tour_cutoff]

# event types after which the user is no longer on the individual rating page
away_events = ('hidden', 'leave', 'close')

for user_id, logs in logs1_after_tour.groupby('user_id'):
    intervals = {}   # applicant_id -> list of interval durations in seconds

    # An interval is the gap between two consecutive log rows.
    # NOTE(review): rows are paired in the order they appear in logs1; this presumes
    # the log is chronological per user — TODO confirm.
    events = list(logs[['interaction_type', 'applicant_id', 'timestamp']].itertuples(index=False, name=None))
    for (event_type, applicant_id, started), (next_type, _, ended) in zip(events, events[1:]):
        # Leave individual rating page, ignore the interval
        if event_type in away_events:
            continue
        # The next event is 'visible': normally preceded by a 'hidden' event (handled
        # above); this extra test covers the case where the 'hidden' event was not logged.
        if next_type == 'visible':
            continue

        duration = (ended - started) / 1000.0  # in seconds

        if duration > 0:
            intervals.setdefault(applicant_id, []).append(duration)

    # NOTE(review): unlike the before-intervention cell, no outlier filter is applied
    # here — confirm that the asymmetry is intended.
    for applicant_id, applicant_durations in intervals.items():
        total_time = sum(applicant_durations) / 60.0  # minutes

        # Mask-based write: a no-op (instead of an IndexError) when the applicant is
        # not listed in `applications` and therefore has no row in focus_time.
        pair_rows = (focus_time['user_id'] == user_id) & (focus_time['applicant_id'] == applicant_id)
        focus_time.loc[pair_rows, 'total_change'] = total_time

In [118]:
# total_after = minutes spent before the tour plus the minutes accumulated after it
minutes_before = focus_time['total_before']
minutes_added = focus_time['total_change']
focus_time['total_after'] = minutes_before.add(minutes_added)

# persist the per (user, applicant) focus times
focus_time.to_csv('data/focus_time_with_change.csv', index=False)

### Interaction Analysis

In [55]:
# split the merged visualization-page logs into click and hover interactions
is_click = logs2_merged['interaction_type'] == 'click'
is_hover = logs2_merged['interaction_type'] == 'hover'
logs2_click = logs2_merged[is_click]
logs2_hover = logs2_merged[is_hover]

# (label, interactions, element names that belong to the component)
component_counts = [
    ("\nCLick interaction count on time spent plots: ", logs2_click, ['raceTimePoints', 'genderTimePoints']),
    ("Hover interaction count on time spent plots: ", logs2_hover, ['time_race', 'time_gender']),
    ("\nCLick interaction count on competitivenss rating plots: ", logs2_click, ['raceRatingPoints', 'genderRatingPoints']),
    ("Hover interaction count on competitivenss rating plots: ", logs2_hover, ['rating_race', 'rating_gender']),
    ("\nCLick interaction count on applicant list: ", logs2_click, ['applicantList']),
    ("Hover interaction count on applicant list: ", logs2_hover, ['applicantList']),
]

# report how many interactions landed on each component
for label, interactions, elements in component_counts:
    on_component = interactions['element'].isin(elements)
    print(label, interactions[on_component].shape[0])


CLick interaction count on time spent plots:  14
Hover interaction count on time spent plots:  418

CLick interaction count on competitivenss rating plots:  11
Hover interaction count on competitivenss rating plots:  208

CLick interaction count on applicant list:  54
Hover interaction count on applicant list:  1441


In [40]:
# clicks on applicants: every click except those on the comment / recommendation elements
on_other_elements = logs2_click['element'].isin(['comment', 'recommendation'])
revisits = logs2_click[~on_other_elements]

# total clicks (revisits) on the applicants
print("Total revisits: ", len(revisits))

# number of users who revisited the applicants
revisiting_users = revisits['user_id'].unique()
print("Number of users who revisited the applicants: ", len(revisiting_users))

# distinct applicants revisited per user, computed once and summarised
applicants_per_user = revisits.groupby('user_id')['applicant_id'].nunique()
print("Min, max, average number of applicants revisted by each user: ", applicants_per_user.min(),
      applicants_per_user.max(),
      round(applicants_per_user.mean()))

Total revisits:  79
Number of users who revisited the applicants:  46
Min, max, average number of applicants revisted by each user:  1 7 2


In [41]:
# Hover and click interaction counts per plot, broken down by the demographic
# attribute of the applicant the interaction refers to.
# Each entry: (heading, interactions, plot element name, demographic column)
plot_breakdowns = [
    # Time Spent / Competitiveness Rating by Gender plots
    ("Hover interaction count on the Time Spent by Gender plot: \n", logs2_hover, 'time_gender', 'GENDER'),
    ("\nCLick interaction count on the Time Spent by Gender plot: \n", revisits, 'genderTimePoints', 'GENDER'),
    ("\nHover interaction count on the Competitiveness Rating by Gender plot: \n", logs2_hover, 'rating_gender', 'GENDER'),
    ("\nCLick interaction count on the Competitiveness Rating by Gender plot: \n", revisits, 'genderRatingPoints', 'GENDER'),
    # Time Spent / Competitiveness Rating by Race plots
    ("\nHover interaction count on the Time Spent by Race plot: \n", logs2_hover, 'time_race', 'RACE'),
    ("\nCLick interaction count on the Time Spent by Race plot: \n", revisits, 'raceTimePoints', 'RACE'),
    ("\nHover interaction count on the Competitiveness Rating by Race plot: \n", logs2_hover, 'rating_race', 'RACE'),
    ("\nCLick interaction count on the Competitiveness Rating by Race plot: \n", revisits, 'raceRatingPoints', 'RACE'),
]

for heading, interactions, element, demographic in plot_breakdowns:
    on_plot = interactions[interactions['element'] == element]
    print(heading,
          on_plot.groupby([demographic])['applicant_id'].count()
    )



Hover interaction count on the Time Spent by Gender plot: 
 GENDER
Female        118
Male           73
Non-binary     92
Name: applicant_id, dtype: int64

CLick interaction count on the Time Spent by Gender plot: 
 GENDER
Female        3
Male          3
Non-binary    5
Name: applicant_id, dtype: int64

Hover interaction count on the Competitiveness Rating by Gender plot: 
 GENDER
Female        45
Male          42
Non-binary    38
Name: applicant_id, dtype: int64

CLick interaction count on the Competitiveness Rating by Gender plot: 
 GENDER
Female        1
Male          5
Non-binary    2
Name: applicant_id, dtype: int64

Hover interaction count on the Time Spent by Race plot: 
 RACE
Black    79
White    56
Name: applicant_id, dtype: int64

CLick interaction count on the Time Spent by Race plot: 
 RACE
Black    3
Name: applicant_id, dtype: int64

Hover interaction count on the Competitiveness Rating by Race plot: 
 RACE
Black    45
White    38
Name: applicant_id, dtype: int64

CLick int

In [97]:
# Ratings and time spent for the applicants who were hovered / revisited

# Per-user baselines taken from the pre-intervention values.
mean_time_by_user = focus_time.groupby('user_id')['total_before'].mean()
# NOTE(review): the column is named mean_rating but holds the per-user *median* of
# rating_before, while mean_time is a true mean — confirm this is intended.
median_rating_by_user = ratings.groupby('user_id')['rating_before'].median()

# Attach the baselines on new frames via assign() instead of writing into slices of
# logs2_merged; this removes the need to disable pandas' chained-assignment warning
# globally and keeps the cell safe to re-run.
logs2_hover = logs2_hover.assign(
    mean_time=logs2_hover['user_id'].map(mean_time_by_user),
    mean_rating=logs2_hover['user_id'].map(median_rating_by_user),
)

revisits = revisits.assign(
    mean_time=revisits['user_id'].map(mean_time_by_user),
    mean_rating=revisits['user_id'].map(median_rating_by_user),
)

# interactions on the competitiveness-rating plots
click_rating = revisits[revisits['element'].isin(['genderRatingPoints', 'raceRatingPoints'])]
hover_rating = logs2_hover[logs2_hover['element'].isin(['rating_race', 'rating_gender'])]

# interactions on the time-spent plots
click_time = revisits[revisits['element'].isin(['genderTimePoints', 'raceTimePoints'])]
hover_time = logs2_hover[logs2_hover['element'].isin(['time_race', 'time_gender'])]

In [98]:
# Count interactions whose logged value lies below the acting user's own baseline.
# NOTE(review): the 'rating' and 'focus_time' columns are not created in this part of
# the notebook; presumably they come from the visualization-page log — TODO confirm.
# Each entry: (label, interactions, value column, per-user baseline column)
below_baseline_reports = [
    ("Number of applicants who were hovered that were rated lower than average: ", hover_rating, 'rating', 'mean_rating'),
    ("Number of applicants who were revisited that were rated lower than average: ", click_rating, 'rating', 'mean_rating'),
    ("Number of applicants who were hovered that were spent less time than average: ", hover_time, 'focus_time', 'mean_time'),
    ("Number of applicants who were revisited that were spent less time than average: ", click_time, 'focus_time', 'mean_time'),
]

for label, interactions, value_col, baseline_col in below_baseline_reports:
    is_below = interactions[value_col] < interactions[baseline_col]
    print(label,
           len(interactions[is_below]))


Number of applicants who were hovered that were rated lower than average:  145
Number of applicants who were revisited that were rated lower than average:  9
Number of applicants who were hovered that were spent less time than average:  247
Number of applicants who were revisited that were spent less time than average:  8


In [50]:
# Rating changes: rows whose rating differs before vs. after the intervention
was_changed = ratings['rating_change'] != 0
rating_changes = ratings[was_changed]

upgrades = rating_changes[rating_changes['rating_change'] > 0]
downgrades = rating_changes[rating_changes['rating_change'] < 0]
changing_users = rating_changes['user_id'].unique()

print("Total number of rating changes: ", rating_changes.shape[0])

print("Number of upgrades: ", len(upgrades))

print("Number of downgrades: ", len(downgrades))

print("Number of users who changed their ratings: ", len(changing_users))



Total number of rating changes:  26
Number of upgrades:  19
Number of downgrades:  7
Number of users who changed their ratings:  9


In [128]:
# Rating and time spent changes of P69 and P40 (user ids 69 and 40)
demographics = applications[['applicant_id', 'GENDER', 'RACE']]
ratings_merged = ratings.merge(demographics, on='applicant_id')
time_merged = focus_time.merge(demographics, on='applicant_id')

# per-participant slices
rating_changes_p69 = ratings_merged[ratings_merged['user_id'] == 69]
time_changes_p69 = time_merged[time_merged['user_id'] == 69]
rating_changes_p40 = ratings_merged[ratings_merged['user_id'] == 40]


def _mean_by(frame, group_col, value_col):
    """Return the mean of `value_col` per `group_col` value as a one-column DataFrame."""
    return frame[[group_col, value_col]].groupby(group_col).mean()


print("P69 time spent by race before and after: \n",
      _mean_by(time_changes_p69, 'RACE', 'total_before'),
      "\n",
      _mean_by(time_changes_p69, 'RACE', 'total_after')
      )

print("\nP69 rating by race before and after: \n",
      _mean_by(rating_changes_p69, 'RACE', 'rating_before'),
      "\n",
      _mean_by(rating_changes_p69, 'RACE', 'rating_after')
      )

print("\nP40 rating by gender before and after: \n",
      _mean_by(rating_changes_p40, 'GENDER', 'rating_before'),
      "\n",
      _mean_by(rating_changes_p40, 'GENDER', 'rating_after')
      )

P69 time spent by race before and after: 
        total_before
RACE               
Black      1.843147
White      2.245006 
        total_after
RACE              
Black     2.292156
White     2.407522

P69 rating by race before and after: 
        rating_before
RACE                
Black      72.333333
White      78.000000 
        rating_after
RACE               
Black     74.166667
White     77.666667

P40 rating by gender before and after: 
             rating_before
GENDER                   
Female              64.75
Male                83.75
Non-binary          67.25 
             rating_after
GENDER                  
Female             67.25
Male               64.00
Non-binary         70.25
