# Setup

In [None]:
import pandas as pd
import itertools
import os 

# Reference dates for the analysis, all timezone-aware (UTC).
# `utc` is a boolean flag of pandas.to_datetime; the previous string value
# 'UTC' only worked because a non-empty string is truthy.
CutOffDate = pd.to_datetime('2021-08-01', utc=True)
Recent1YearDate = pd.to_datetime('2020-08-01', utc=True)
Recent2YearDate = pd.to_datetime('2019-08-01', utc=True)
Recent3YearDate = pd.to_datetime('2018-08-01', utc=True)


# Maps "number of recent years" to the first date that counts as recent.
cutoffDates = {
    1: Recent1YearDate,
    2: Recent2YearDate,
    3: Recent3YearDate,
}

# Settings

In [None]:
# Text-length setting (presumably the minimum review text length applied
# upstream -- confirm). In this notebook it selects the input folder
# 'Output/MinText<N>' and is embedded in the report file names.
MIN_TEXT_LEN = 1

# Get Website Data

In [None]:
def get_website2rvw(Folder, suffix=''):
    """Load the pickled table of every rating website stored in ``Folder``.

    Parameters
    ----------
    Folder : str
        Directory holding one ``<website><suffix>.p`` pickle per website.
    suffix : str, optional
        Text inserted between the website name and the ``.p`` extension.
        Defaults to ``''``, which reproduces the original file names.

    Returns
    -------
    dict
        Maps website name (in the fixed order healthgrades, vitals, ratemds,
        yelp, zocdoc) to the object returned by ``pandas.read_pickle``.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files is missing.
    """
    websites = ['healthgrades', 'vitals', 'ratemds', 'yelp', 'zocdoc']

    # NOTE: unpickling can execute arbitrary code; only point this at
    # folders whose contents are trusted.
    return {
        name: pd.read_pickle(os.path.join(Folder, '{}{}.p'.format(name, suffix)))
        for name in websites
    }
    
# The input folder is chosen by the MIN_TEXT_LEN setting; load every
# website's table from it into a {website: table} mapping.
Folder = 'Output/MinText{}'.format(MIN_TEXT_LEN)
website2data = get_website2rvw(Folder)

# NPPES Profile Score

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import itertools


# Load the NPPES table from its pickle and keep only the NPI identifier and
# the two name columns; the printed shape is a quick sanity check.
NPPES = pd.read_pickle('Data/NPPES/NPI2InfoMDDO.p')
nppes_cols = ['NPI', 'FirstName', 'LastName']
NPPES = NPPES[nppes_cols]
print(NPPES.shape)


def get_list_sum(x, finalreview_cols):
    """Concatenate the list-valued cells of one row.

    Parameters
    ----------
    x : pandas.Series
        A row; ``x[finalreview_cols]`` is read.
    finalreview_cols : list of str
        Labels of the cells to combine, in concatenation order.

    Returns
    -------
    list or float
        The lists found in the selected cells joined into one flat list, or
        ``numpy.nan`` when none of the selected cells holds a list.
    """
    # Non-list cells (e.g. NaN left by a left merge) are skipped.
    review_lists = [
        cell for cell in x[finalreview_cols].to_list() if isinstance(cell, list)
    ]
    if not review_lists:
        return np.nan
    return list(itertools.chain.from_iterable(review_lists))
    
def get_prfscore_avg(x, finalpflscore_cols):
    """Average the non-null profile scores of one row.

    Parameters
    ----------
    x : pandas.Series
        A row; ``x[finalpflscore_cols]`` is read.
    finalpflscore_cols : list of str
        Labels of the score cells to average.

    Returns
    -------
    float
        Mean of the non-null scores, or ``numpy.nan`` when every selected
        cell is null.
    """
    # Coerce with float() so scores stored as strings are averaged too; the
    # rest of the notebook applies the same coercion to profile scores.
    scores = [
        float(cell)
        for cell in x[finalpflscore_cols].to_list()
        if not pd.isnull(cell)
    ]
    if not scores:
        return np.nan
    return np.mean(scores)
    
def get_NPPESReview(NPPES, website2data):
    """Left-join every website's reviews and profile score onto ``NPPES``.

    For each ``website`` in ``website2data`` the columns ``final_review`` and
    ``reported_profile_score`` are merged on ``NPI`` as ``review_<website>``
    and ``pflscore_<website>``.  Two aggregates are then added: ``*_all``
    (websites whose name does not contain 'zocdoc') and ``*_all(zocdoc)``
    (every website); reviews are concatenated and profile scores averaged.

    Returns the merged DataFrame.
    """
    merged = NPPES
    for website, df in website2data.items():
        site_cols = df[['NPI', 'final_review', 'reported_profile_score']].rename(
            columns={
                'final_review': 'review_{}'.format(website),
                'reported_profile_score': 'pflscore_{}'.format(website),
            }
        )
        merged = pd.merge(merged, site_cols, on='NPI', how='left')

    every_site = list(website2data)
    without_zocdoc = [site for site in every_site if 'zocdoc' not in site]

    # 'all' leaves zocdoc out; 'all(zocdoc)' includes it.
    for tag, sites in (('all', without_zocdoc), ('all(zocdoc)', every_site)):
        review_cols = ['review_{}'.format(site) for site in sites]
        merged['review_' + tag] = merged.apply(
            get_list_sum, axis=1, args=(review_cols,))
        score_cols = ['pflscore_{}'.format(site) for site in sites]
        merged['pflscore_' + tag] = merged.apply(
            get_prfscore_avg, axis=1, args=(score_cols,))
    return merged


# Attach every website's reviews and profile scores to the NPPES rows.
NPPESReview = get_NPPESReview(NPPES, website2data)
# Bare expression: displays the merged table as the notebook cell output.
NPPESReview

# Recent vs Past

In [None]:
def get_recent_past_reviews(x, CutOffDate):
    """Split reviews into those on/after ``CutOffDate`` and those before it.

    ``x`` is an iterable of mappings with a ``'ReviewDate'`` entry.  Returns
    the tuple ``(recent, past)``; each list keeps the input order.
    """
    recent, past = [], []
    for review in x:
        # Anything that does not compare as >= the cutoff counts as past.
        bucket = recent if review['ReviewDate'] >= CutOffDate else past
        bucket.append(review)
    return recent, past

In [None]:



def get_recent_past_review_score(col, NPPESReview, RecentYear, PAST_MIN_NUM):
    """Summarise recent versus past review scores for one review column.

    Physicians are narrowed step by step: rows with a non-null ``col``, rows
    with at least one review, rows whose latest review is on or after
    ``cutoffDates[RecentYear]``, and finally rows with at least
    ``PAST_MIN_NUM`` reviews before that date.  The remaining rows (T) are
    split by ``recent mean - past mean`` into decreased (D, <= -0.5),
    unchanged (M, strictly between -0.5 and 0.5) and increased (I, >= 0.5).

    Parameters
    ----------
    col : str
        Column of ``NPPESReview`` named ``review_<platform>``; each non-null
        cell is a list of dicts with ``ReviewDate`` and ``ReviewScore``.
    NPPESReview : pandas.DataFrame
        Table containing ``NPI`` and ``col``.
    RecentYear : int
        Key into the module-level ``cutoffDates`` mapping.
    PAST_MIN_NUM : int
        Minimum number of past reviews a physician needs to be kept.

    Returns
    -------
    dict
        Platform, cutoff label, the row count after each filter, and for each
        of T, D, M, I a ``"count (percent%)"`` string plus ``"mean (std)"``
        strings for the recent and past scores.  Percentages are ``nan`` when
        no physician passes the filters.
    """
    def _pct(part, whole):
        # Guard the empty case instead of raising ZeroDivisionError.
        return 100 * part / whole if whole else float('nan')

    def _mean_sd(series):
        return '{:.2f} ({:.2f})'.format(series.mean(), series.std())

    cutoff = cutoffDates[RecentYear]
    info = {
        'Platform': col.split('_')[1],
        'Recent': 'Rct{}Y ({})'.format(RecentYear, str(cutoff.date())),
    }

    # Physicians that have a profile on this platform.
    df = NPPESReview[['NPI', col]]
    df = df[df[col].notna()].reset_index(drop=True)
    df['LatestReviewDate'] = df[col].apply(
        lambda x: np.max([i['ReviewDate'] for i in x]) if len(x) > 0 else np.nan)
    info['WebPhyNum'] = len(df)

    # ... that have at least one review.
    df = df[df['LatestReviewDate'].notna()].reset_index(drop=True)
    info['HasRvwNum'] = len(df)

    # ... whose newest review falls on or after the cutoff.
    # astype(bool) keeps the mask usable even when the frame is empty.
    is_recent = df['LatestReviewDate'].apply(lambda d: d >= cutoff).astype(bool)
    df = df[is_recent].reset_index(drop=True)
    info['HasRctNum'] = len(df)

    # ... and that also have at least PAST_MIN_NUM reviews before the cutoff.
    split = df[col].apply(lambda x: get_recent_past_reviews(x, cutoff))
    df['recent'] = [pair[0] for pair in split.values]
    df['past'] = [pair[1] for pair in split.values]
    for when in ('recent', 'past'):
        df[when + '_scores'] = df[when].apply(
            lambda reviews: [float(r['ReviewScore']) for r in reviews])
    df['past_num'] = df['past'].apply(len)
    df = df[df['past_num'] >= PAST_MIN_NUM].reset_index(drop=True)
    info['HasRctPst{}Num'.format(PAST_MIN_NUM)] = len(df)

    # Per-physician mean scores; astype(float) keeps empty results numeric.
    for when in ('recent', 'past'):
        df[when + '_mean_score'] = df[when + '_scores'].apply(
            lambda scores: np.mean(scores)).astype(float)

    # T(otal), D(ecreased), M(aintained), I(ncreased), in report order.
    diff = df['recent_mean_score'] - df['past_mean_score']
    groups = [
        ('T', df),
        ('D', df[diff <= -0.5]),
        ('M', df[(diff > -0.5) & (diff < 0.5)]),
        ('I', df[diff >= 0.5]),
    ]
    total = len(df)
    for label, sub in groups:
        info[label] = '{} ({:.2f}%)'.format(len(sub), _pct(len(sub), total))
        info[label + '_recentscore'] = _mean_sd(sub['recent_mean_score'])
        info[label + '_pastscore'] = _mean_sd(sub['past_mean_score'])

    return info


In [None]:
folder = 'Output/RecentVSPast'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(folder, exist_ok=True)

for PAST_MIN_NUM in [5, 1]:
    for RecentYear in [1, 3]:
        # One report row per review column (each website plus the aggregates).
        L = []
        for col in [i for i in NPPESReview.columns if 'review' in i]:
            info = get_recent_past_review_score(col, NPPESReview, RecentYear, PAST_MIN_NUM)
            L.append(info)

        Recent_Past_Report = pd.DataFrame(L)
        path = '{}/MinTxt{}_RctY{}_PastRvw{}_Report.csv'.format(folder, MIN_TEXT_LEN, RecentYear, PAST_MIN_NUM)
        print('save to: {}'.format(path))
        Recent_Past_Report.to_csv(path, index = False)

# Displays only the report from the last (PAST_MIN_NUM, RecentYear) pass.
Recent_Past_Report

# Recent vs Profile

In [None]:

def get_recent_profile_review_score(col, NPPESReview, RecentYear, PAST_MIN_NUM):
    """Summarise recent review scores versus the reported profile score.

    Physicians are narrowed step by step: rows with a non-null ``col``, rows
    with at least one review, rows whose latest review is on or after
    ``cutoffDates[RecentYear]``, and finally rows with at least
    ``PAST_MIN_NUM`` reviews before that date.  The remaining rows (T) are
    split by ``recent mean - profile score`` into decreased (D, <= -0.5),
    unchanged (M, strictly between -0.5 and 0.5) and increased (I, >= 0.5).

    Parameters
    ----------
    col : str
        Column of ``NPPESReview`` named ``review_<platform>``; each non-null
        cell is a list of dicts with ``ReviewDate`` and ``ReviewScore``.  The
        profile score is read from the column obtained by replacing
        ``'review'`` with ``'pflscore'`` in ``col``.
    NPPESReview : pandas.DataFrame
        Table containing ``NPI``, ``col`` and the matching score column.
    RecentYear : int
        Key into the module-level ``cutoffDates`` mapping.
    PAST_MIN_NUM : int
        Minimum number of past reviews a physician needs to be kept.

    Returns
    -------
    dict
        Platform, cutoff label, the row count after each filter, and for each
        of T, D, M, I a ``"count (percent%)"`` string plus ``"mean (std)"``
        strings for the recent and profile scores.  Percentages are ``nan``
        when no physician passes the filters.
    """
    def _pct(part, whole):
        # Guard the empty case instead of raising ZeroDivisionError.
        return 100 * part / whole if whole else float('nan')

    def _mean_sd(series):
        return '{:.2f} ({:.2f})'.format(series.mean(), series.std())

    cutoff = cutoffDates[RecentYear]
    info = {
        'Platform': col.split('_')[1],
        'Recent': 'Rct{}Y ({})'.format(RecentYear, str(cutoff.date())),
    }

    # Physicians that have a profile on this platform.
    pflscore_col = col.replace('review', 'pflscore')
    df = NPPESReview[['NPI', col, pflscore_col]]
    df = df[df[col].notna()].reset_index(drop=True)
    df['LatestReviewDate'] = df[col].apply(
        lambda x: np.max([i['ReviewDate'] for i in x]) if len(x) > 0 else np.nan)
    info['WebPhyNum'] = len(df)

    # ... that have at least one review.
    df = df[df['LatestReviewDate'].notna()].reset_index(drop=True)
    info['HasRvwNum'] = len(df)

    # ... whose newest review falls on or after the cutoff.
    # astype(bool) keeps the mask usable even when the frame is empty.
    is_recent = df['LatestReviewDate'].apply(lambda d: d >= cutoff).astype(bool)
    df = df[is_recent].reset_index(drop=True)
    info['HasRctNum'] = len(df)

    # ... and that also have at least PAST_MIN_NUM reviews before the cutoff.
    split = df[col].apply(lambda x: get_recent_past_reviews(x, cutoff))
    df['recent'] = [pair[0] for pair in split.values]
    df['past'] = [pair[1] for pair in split.values]
    df['recent_scores'] = df['recent'].apply(
        lambda reviews: [float(r['ReviewScore']) for r in reviews])
    df['past_num'] = df['past'].apply(len)
    df = df[df['past_num'] >= PAST_MIN_NUM].reset_index(drop=True)
    info['HasRctPst{}Num'.format(PAST_MIN_NUM)] = len(df)

    # Recent mean and profile score; astype(float) keeps empty results numeric.
    df['recent_mean_score'] = df['recent_scores'].apply(
        lambda scores: np.mean(scores)).astype(float)
    df['profile_score'] = df[pflscore_col].apply(
        lambda score: float(score)).astype(float)

    # T(otal), D(ecreased), M(aintained), I(ncreased), in report order.
    diff = df['recent_mean_score'] - df['profile_score']
    groups = [
        ('T', df),
        ('D', df[diff <= -0.5]),
        ('M', df[(diff > -0.5) & (diff < 0.5)]),
        ('I', df[diff >= 0.5]),
    ]
    total = len(df)
    for label, sub in groups:
        info[label] = '{} ({:.2f}%)'.format(len(sub), _pct(len(sub), total))
        info[label + '_recentscore'] = _mean_sd(sub['recent_mean_score'])
        info[label + '_profilescore'] = _mean_sd(sub['profile_score'])

    return info


In [None]:
folder = 'Output/RecentVSProfile'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(folder, exist_ok=True)

for PAST_MIN_NUM in [5, 1]:
    for RecentYear in [1, 3]:
        # One report row per review column (each website plus the aggregates).
        L = []
        for col in [i for i in NPPESReview.columns if 'review' in i]:
            info = get_recent_profile_review_score(col, NPPESReview, RecentYear, PAST_MIN_NUM)
            L.append(info)

        Recent_Profile_Report = pd.DataFrame(L)
        path = '{}/MinTxt{}_RctY{}_PastRvw{}_Report.csv'.format(folder, MIN_TEXT_LEN, RecentYear, PAST_MIN_NUM)
        print('save to: {}'.format(path))
        Recent_Profile_Report.to_csv(path, index = False)

# Displays only the report from the last (PAST_MIN_NUM, RecentYear) pass.
Recent_Profile_Report