# Setup

In [None]:
import pandas as pd
import itertools
import os 

# Reference date of the analysis and the look-back thresholds used by the
# "recent review" report. All four are timezone-aware (UTC) timestamps.
# `utc` is a boolean flag, so pass True instead of relying on the truthiness of 'UTC'.
CutOffDate = pd.to_datetime('2021-08-01', utc=True)
# Derive the thresholds from the cutoff so changing CutOffDate keeps them in sync.
Recent1YearDate = CutOffDate - pd.DateOffset(years=1)
Recent2YearDate = CutOffDate - pd.DateOffset(years=2)
Recent3YearDate = CutOffDate - pd.DateOffset(years=3)

# Settings

In [None]:
# Selects the input folder that is read ('Output/MinText<N>') and is embedded in
# the names of the report CSV files written later in this notebook.
# NOTE(review): presumably the minimum review-text length applied upstream — confirm.
MIN_TEXT_LEN = 1

# Get Website Data

In [None]:
def get_website2rvw(Folder, suffix=''):
    """Load the per-website review tables stored as pickles in ``Folder``.

    Parameters
    ----------
    Folder : str
        Directory holding one pickle per website, named ``<website><suffix>.p``.
    suffix : str, optional
        Text placed between the website name and the ``.p`` extension.
        Defaults to ``''``, which reproduces the previously hard-coded names.

    Returns
    -------
    dict
        Maps each website name ('healthgrades', 'vitals', 'ratemds', 'yelp',
        'zocdoc'), in that order, to the object unpickled from its file.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files is missing.
    """
    websites = ('healthgrades', 'vitals', 'ratemds', 'yelp', 'zocdoc')

    website2data = {}
    for name in websites:
        file_path = os.path.join(Folder, '{}{}.p'.format(name, suffix))
        # SECURITY: unpickling can execute arbitrary code; only read trusted files.
        website2data[name] = pd.read_pickle(file_path)
    return website2data
    
# The website pickles are read from the folder that matches the MIN_TEXT_LEN setting.
Folder = 'Output/MinText{}'.format(MIN_TEXT_LEN)
website2data = get_website2rvw(Folder)

# NPPES

In [None]:
import pandas as pd
import numpy as np

# Load the NPPES pickle and keep only the identifier and name columns.
nppes_cols = ['NPI', 'FirstName', 'LastName']
NPPES = pd.read_pickle('Data/NPPES/NPI2InfoMDDO.p')[nppes_cols]
print(NPPES.shape)

In [None]:
import missingno as msno
import itertools

def get_list_sum(x, finalreview_cols):
    """Concatenate the list-valued cells of one row into a single flat list.

    Parameters
    ----------
    x : pandas.Series
        A row; only the labels in ``finalreview_cols`` are read.
    finalreview_cols : list
        Labels of the cells to combine.

    Returns
    -------
    list or float
        The selected lists joined in column order, or ``np.nan`` when none of
        the selected cells holds a list.
    """
    # Cells whose type is not exactly `list` (e.g. NaN) are skipped.
    review_lists = []
    for cell in x[finalreview_cols].to_list():
        if type(cell) == list:
            review_lists.append(cell)

    if not review_lists:
        return np.nan
    return list(itertools.chain.from_iterable(review_lists))
    
def get_NPPESReview(NPPES, website2data):
    """Attach each website's reviews to the physician table and pool them.

    Parameters
    ----------
    NPPES : pandas.DataFrame
        Physician table with an 'NPI' column used as the join key.
    website2data : dict
        Maps a website name to a DataFrame with 'NPI' and 'final_review' columns.

    Returns
    -------
    pandas.DataFrame
        ``NPPES`` left-joined with one 'review_<website>' column per website,
        plus 'review_all' (every website whose name does not contain 'zocdoc')
        and 'review_all(zocdoc)' (every website), both built with
        ``get_list_sum``.

    Notes
    -----
    When ``website2data`` is empty no join takes place, so the two pooled
    columns are added to the caller's ``NPPES`` frame itself.
    """
    # One left join per website keeps every row of the physician table.
    for site, site_df in website2data.items():
        new_name = {'final_review': 'review_{}'.format(site)}
        site_reviews = site_df[['NPI', 'final_review']].rename(columns=new_name)
        NPPES = NPPES.merge(site_reviews, on='NPI', how='left')

    cols_without_zocdoc = [
        'review_{}'.format(site) for site in website2data if 'zocdoc' not in site
    ]
    NPPES['review_all'] = NPPES.apply(get_list_sum, axis=1, args=(cols_without_zocdoc,))

    cols_with_zocdoc = ['review_{}'.format(site) for site in website2data]
    NPPES['review_all(zocdoc)'] = NPPES.apply(get_list_sum, axis=1, args=(cols_with_zocdoc,))

    return NPPES



In [None]:
# Merge every website's reviews onto the NPPES table, then display the result.
NPPESReview = get_NPPESReview(NPPES, website2data)
NPPESReview

# Recent Review Report

In [None]:
def get_physician_recent_review(NPPESReview):
    """Count, per review column, how many physicians have recent reviews.

    Every column of ``NPPESReview`` whose name contains 'review' is treated as
    a platform. Each non-null cell is iterated as a collection of reviews, and
    each review is indexed with 'ReviewDate'.

    Parameters
    ----------
    NPPESReview : pandas.DataFrame
        One row per physician with one or more review columns.

    Returns
    -------
    pandas.DataFrame
        One row per platform with the columns 'platform', 'AllPhysicians'
        (rows in ``NPPESReview``), 'WebPhysicians' (non-null cells),
        'HasReviews' (non-empty review lists) and 'HasReviewsRct3Y' /
        'HasReviewsRct2Y' / 'HasReviewsRct1Y' (latest review on or after the
        module-level Recent3YearDate / Recent2YearDate / Recent1YearDate).
    """
    platforms = [col for col in NPPESReview.columns if 'review' in col]

    rows = []
    for platform in platforms:
        # Null cells must be dropped before the review lists can be iterated.
        listed = NPPESReview[platform].dropna().reset_index(drop = True)
        review_dates = listed.apply(lambda reviews: [r['ReviewDate'] for r in reviews])

        # Latest review per physician; empty review lists become NaN and drop out.
        latest = review_dates.apply(lambda dates: np.max(dates) if len(dates) > 0 else np.nan)
        latest = latest.dropna().reset_index(drop = True)

        record = {
            'platform': platform,
            'AllPhysicians': len(NPPESReview),
            'WebPhysicians': len(review_dates),
            'HasReviews': len(latest),
        }

        # Count physicians whose latest review falls inside each look-back window.
        windows = (
            ('HasReviewsRct3Y', Recent3YearDate),
            ('HasReviewsRct2Y', Recent2YearDate),
            ('HasReviewsRct1Y', Recent1YearDate),
        )
        for key, since in windows:
            record[key] = latest.apply(lambda when: bool(when >= since)).sum()

        rows.append(record)
    return pd.DataFrame(rows)


def get_report_df(Report):
    """Expand a count report into counts plus ratio rows.

    For every row of ``Report`` the output holds, in order: the original row,
    one ratio row per base column ('AllPhysicians', 'WebPhysicians',
    'HasReviews') in which each count is divided by that row's base count, and
    one empty separator row.

    Parameters
    ----------
    Report : pandas.DataFrame
        Must provide the columns 'AllPhysicians', 'WebPhysicians',
        'HasReviews', 'HasReviewsRct3Y', 'HasReviewsRct2Y' and
        'HasReviewsRct1Y'.

    Returns
    -------
    pandas.DataFrame
        Five output rows per input row. In ratio rows 'platform' is NaN, and a
        ratio is NaN when it exceeds 1 or when the base count is zero.
    """
    base_cols = ['AllPhysicians', 'WebPhysicians', 'HasReviews']
    count_cols = base_cols + ['HasReviewsRct3Y', 'HasReviewsRct2Y', 'HasReviewsRct1Y']

    records = []
    for _, row in Report.iterrows():
        records.append(row.to_dict())
        for base in base_cols:
            denominator = row[base]
            ratios = {'platform': np.nan}
            for col in count_cols:
                if denominator == 0:
                    # A zero base has no meaningful ratio; dividing plain Python
                    # ints here would raise ZeroDivisionError.
                    rate = np.nan
                else:
                    rate = row[col] / denominator
                    # Ratios above 1 are blanked.
                    if rate > 1:
                        rate = np.nan
                ratios[col] = rate
            records.append(ratios)
        # Empty record renders as an all-NaN separator row.
        records.append({})

    return pd.DataFrame(records)


In [None]:
Report = get_physician_recent_review(NPPESReview)

# to_csv does not create missing folders, so make sure the target directory exists.
report_path = 'Output/PhysicianRecentReview/MinTxt{}_Report.csv'.format(MIN_TEXT_LEN)
os.makedirs(os.path.dirname(report_path), exist_ok = True)
Report.to_csv(report_path, index = False)
Report

In [None]:
NewReport = get_report_df(Report)

# to_csv does not create missing folders, so make sure the target directory exists.
report_percent_path = 'Output/PhysicianRecentReview/MinTxt{}_ReportPercent.csv'.format(MIN_TEXT_LEN)
os.makedirs(os.path.dirname(report_percent_path), exist_ok = True)
NewReport.to_csv(report_percent_path, index = False)
NewReport

# Physician's Review Average Age

In [None]:
# Show the merged physician/review table again before the review-age analysis.
NPPESReview

In [None]:
def get_physician_recent_review_average_age(NPPESReview, cutoff_date=None):
    """Summarise, per review column, the average age of physicians' reviews.

    Every column of ``NPPESReview`` whose name contains 'review' is treated as
    a platform. Each non-null cell is iterated as a collection of reviews, and
    each review is indexed with 'ReviewDate'. A review's age is
    ``cutoff_date - ReviewDate``.

    Parameters
    ----------
    NPPESReview : pandas.DataFrame
        One row per physician with one or more review columns.
    cutoff_date : pandas.Timestamp, optional
        Date the ages are measured against. Defaults to the module-level
        ``CutOffDate``.

    Returns
    -------
    pandas.DataFrame
        One row per platform with the columns 'platform', 'AllPhysicians'
        (rows in ``NPPESReview``), 'WebPhysicians' (non-null cells),
        'HasReviews' (non-empty review lists),
        'PhysicianReviewAverageAge(days)' (mean over physicians of each
        physician's mean review age, in fractional days) and
        'PhysicianReviewAverageAge(years)' (the same value divided by 365.25).
    """
    if cutoff_date is None:
        # Use the notebook-wide reference date instead of repeating its literal.
        cutoff_date = CutOffDate

    seconds_per_day = 24 * 60 * 60

    Report = []
    for platform in [col for col in NPPESReview.columns if 'review' in col]:
        d = {}
        d['platform'] = platform
        d['AllPhysicians'] = len(NPPESReview)

        # Null cells must be dropped before the review lists can be iterated.
        reviews = NPPESReview[platform].dropna().reset_index(drop = True)
        review_dates = reviews.apply(lambda x: [i['ReviewDate'] for i in x])
        d['WebPhysicians'] = len(review_dates)

        # Age of every review relative to the cutoff; empty review lists become
        # NaN and are dropped.
        ages = review_dates.apply(
            lambda x: [cutoff_date - i for i in x] if len(x) > 0 else np.nan
        )
        ages = ages.dropna().reset_index(drop = True)
        d['HasReviews'] = len(ages)

        # Mean age per physician in fractional days (total_seconds keeps the
        # sub-second part), then the mean across physicians.
        mean_age_days = ages.apply(
            lambda x: np.mean([i.total_seconds() / seconds_per_day for i in x])
        )
        d['PhysicianReviewAverageAge(days)'] = mean_age_days.mean()
        d['PhysicianReviewAverageAge(years)'] = d['PhysicianReviewAverageAge(days)'] / (365.25)
        Report.append(d)

    return pd.DataFrame(Report)


# Build and display the per-platform review-age summary.
get_physician_recent_review_average_age(NPPESReview)