# Utils Functions

In [None]:
import pandas as pd
import os
import numpy as np
pd.set_option('display.max_columns', None)


def get_df_from_folder2rd(subfolder):
    """Load every pickled DataFrame found in ``subfolder`` and stack them.

    Directory entries whose name contains ``'errorlog'`` or ``'.DS_Store'``
    are skipped.  A file that cannot be read is reported on stdout and
    ignored, so one corrupt chunk does not abort the whole load.

    Parameters
    ----------
    subfolder : str
        Directory holding the pickled DataFrame chunks.

    Returns
    -------
    pandas.DataFrame
        All readable chunks concatenated with a fresh 0..n-1 index, or an
        empty DataFrame when no chunk could be read.
    """
    # NOTE(security): pd.read_pickle unpickles arbitrary objects; only point
    # this at folders whose files were produced by a trusted process.
    filenames = [
        os.path.join(subfolder, entry)
        for entry in os.listdir(subfolder)
        if 'errorlog' not in entry and '.DS_Store' not in entry
    ]

    frames = []
    for filename in filenames:
        try:
            frames.append(pd.read_pickle(filename))
        except Exception:
            # Best-effort load: report the unreadable file and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C still interrupt.
            print('Error File', os.path.basename(filename))

    if not frames:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)


def review_info(subfolder, review_col):
    """Summarise how many reviews were collected in one crawl sub-folder.

    Parameters
    ----------
    subfolder : str
        Directory passed to ``get_df_from_folder2rd``.
    review_col : str
        Column whose cells are sized with ``len`` (one collection of
        reviews per row).

    Returns
    -------
    tuple
        ``(DF, d, s)`` where ``DF`` is the loaded frame with an added
        ``'ReviewCountClt'`` column, ``d`` is a dict with the keys
        ``'subfolder'``, ``'total_reviews'``, ``'physician'`` and
        ``'physician_of_reviews'``, and ``s`` is the ``'ReviewCountClt'``
        Series.
    """
    DF = get_df_from_folder2rd(subfolder)
    DF['ReviewCountClt'] = DF[review_col].apply(lambda x: len(x))
    s = DF['ReviewCountClt']

    d = {
        # normpath strips a trailing separator first; a plain split('/')[-1]
        # yields '' for paths such as 'Output/yelp/<run>/'.
        'subfolder': os.path.basename(os.path.normpath(subfolder)),
        'total_reviews': s.sum(),
        'physician': len(DF),                    # one row per physician profile
        'physician_of_reviews': (s > 0).sum(),   # rows with at least one review
    }
    return DF, d, s


def change_npi_to_int(x):
    """Convert ``x`` to ``int``, returning ``0`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw NPI value (for example a string, a float or ``None``).

    Returns
    -------
    int
        ``int(x)``, or the sentinel ``0`` when ``int`` rejects the value.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: non-numeric
        # strings and NaN; OverflowError: +/- infinity.
        return 0


# Look at Samples

In [None]:
# Choose ONE crawl output folder to inspect. The commented-out assignments are
# the sample folders of the other websites; uncomment one to switch.
# folder2rd = 'Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s0_e10000/'
# folder2rd = 'Output/vitals/MD_Doc2GoogleURL_vitals_s0_e10000/'
# folder2rd = 'Output/ratemds/MD_Doc2GoogleURL_ratemds_s0_e2199/'
# folder2rd = 'Output/zocdoc/MD_Doc2GoogleURL_zocdoc_s0_e10000/'
folder2rd = 'Output/yelp/MD_Doc2GoogleURL_yelp_s0_e2600/'

# Load every pickled chunk of the chosen folder and preview the first rows.
DF = get_df_from_folder2rd(folder2rd)
DF.head()

# WebURL Information

In [None]:
# Websites whose keyword files live under Data/<website>/.
WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'zocdoc', 'yelp']
doj_path = 'Data/NPI_included.csv'
mddo_path = 'Data/NPPESMDDOPhysician.p'

# Reference NPI lists: the 'NPI' column of the CSV and of the pickled frame.
DOJ_NPI_included = pd.read_csv(doj_path)['NPI'].to_list()
MDDO_NPI_included = pd.read_pickle(mddo_path)['NPI'].to_list()

print('Number of DOJ:    {}'.format(len(DOJ_NPI_included)))
print('Number of MDDO:   {}'.format(len(MDDO_NPI_included)))


# For each website, report how many NPIs of each reference list appear among
# the unique NPIs of that website's input files.
for name in WebsiteNames:
    inputfolder = os.path.join('Data', name)
    # NOTE(review): "'.p' in i" is a substring test, so names such as
    # 'x.png' or 'x.py' would also be picked up - confirm the folder only
    # holds pickles like 'Data/DocListDF_GoogleSearch_s0_e500.p'.
    files = [os.path.join(inputfolder, i) for i in os.listdir(inputfolder) if '.p' in i]
    files.sort()
    WebsiteKeywords = pd.concat([pd.read_pickle(i) for i in files])
    # Keep only the NPI column, de-duplicated (a Series from here on).
    WebsiteKeywords = WebsiteKeywords['NPI'].drop_duplicates() 

    print('\nFor Website {}, unique NPIs Num: {}'.format(name.upper(), len(WebsiteKeywords)))
    
    # Unique website NPIs that are also in the DOJ list.
    d = WebsiteKeywords[WebsiteKeywords.isin(DOJ_NPI_included)]
    print('number of DOJ  NPIs found in {}: {} /{}'.format(name, len(d), len(DOJ_NPI_included)))
    
    # Unique website NPIs that are also in the MDDO list.
    d = WebsiteKeywords[WebsiteKeywords.isin(MDDO_NPI_included)]
    print('number of MDDO NPIs found in {}: {} /{}'.format(name, len(d), len(MDDO_NPI_included)))


# Process One Review DF

In [None]:
# Website name -> one sample crawl output folder for that website.
# ("smaple" is a typo for "sample"; the name is kept because other code
# looks it up under this spelling.)
website2smaple = dict(
    healthgrades = 'Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s0_e10000/',
    vitals = 'Output/vitals/MD_Doc2GoogleURL_vitals_s0_e10000/',
    ratemds = 'Output/ratemds/MD_Doc2GoogleURL_ratemds_s0_e2199/',
    zocdoc = 'Output/zocdoc/MD_Doc2GoogleURL_zocdoc_s0_e10000/',
    yelp = 'Output/yelp/MD_Doc2GoogleURL_yelp_s0_e2600/',
)

# Website whose sample is inspected in this section.
website = 'healthgrades'

def get_sample(website):
    """Return the sample DataFrame registered for ``website``.

    The sample folder is looked up in ``website2smaple`` (a ``KeyError`` is
    raised for an unknown website) and loaded with
    ``get_df_from_folder2rd``.
    """
    return get_df_from_folder2rd(website2smaple[website])

# Load the sample DataFrame of the selected website (the result is not stored).
get_sample(website)

## NPI 

In [None]:
################## Physician's NPI
# Website name -> name of the raw DataFrame column that holds the NPI.
WebsiteNames2NPI = {'healthgrades':'npi', 
                    'vitals':'npi', 
                    'ratemds':'source_npi', 
                    'zocdoc':'npi', 
                    'yelp':'source_npi'}

## Full Name

In [None]:
################## Physician's Full Name
# Website name -> name of the raw DataFrame column that holds the physician's
# full name.
WebsiteNames2FullName = {'healthgrades':'providerDisplayFullName', 
                    'vitals':'fullname', 
                    'ratemds':'full_name', 
                    'zocdoc':'approvedFullName', 
                    'yelp':'name'}

## Graduation

In [None]:
################## Physician's Education
def get_healthgrades_graduation_year(x):
    """Return the ``completionYear`` of the first 'Medical School' entry.

    ``x`` is an iterable of dict-like education entries.  ``None`` is
    returned when no entry has ``type == 'Medical School'`` or when the
    first such entry carries no ``completionYear``.
    """
    med_school_years = []
    for entry in x:
        if entry.get('type', 'None') == 'Medical School':
            med_school_years.append(entry.get('completionYear', None))

    if med_school_years:
        return med_school_years[0]
    return None


def get_vitals_graduation_year(x):
    """Return the ``GraduationYear`` of the first 'Medical School' entry.

    ``x`` is an iterable of dict-like education entries.  ``None`` is
    returned when no entry has ``EducationType == 'Medical School'`` or when
    the first such entry carries no ``GraduationYear``.
    """
    med_school_years = []
    for entry in x:
        is_med_school = entry.get('EducationType', 'None') == 'Medical School'
        if not is_med_school:
            continue
        med_school_years.append(entry.get('GraduationYear', None))

    return med_school_years[0] if med_school_years else None


def get_ratemds_graduation_year(x):
    """Return the ``graduation_year`` of the first entry that has a 'school'.

    ``x`` is an iterable of dict-like school entries.  ``None`` is returned
    when no entry contains the key ``'school'`` or when the first such entry
    carries no ``graduation_year``.
    """
    school_years = []
    for entry in x:
        if 'school' in entry:
            school_years.append(entry.get('graduation_year', None))

    if not school_years:
        return None
    return school_years[0]

def get_empty(x):
    """Ignore ``x`` and always return ``None``.

    Placeholder graduation-year extractor: ``WebsiteNames2Graudation`` maps
    zocdoc and yelp to this function.
    """
    return None

# Website name -> [raw column to read, function applied to each cell of that
# column to obtain the graduation year]. zocdoc and yelp map to get_empty,
# which returns None for every row, so the column named for them is only a
# column the function can be applied to.
WebsiteNames2Graudation = {'healthgrades':['education', get_healthgrades_graduation_year],
                            'vitals':['education_nimvs', get_vitals_graduation_year], 
                            'ratemds':['doctor_schools',get_ratemds_graduation_year], 
                            'zocdoc':['npi', get_empty ],
                            'yelp':  ['source_npi', get_empty]}


## Reviews

In [None]:
################## Review


#################################### Review's Columns
# Website name -> list of raw DataFrame columns that hold review collections.
# Yelp has three such columns; every other website has one.
WebsiteNames2Reviews = {'healthgrades':['reviews'], 
                        'vitals':['reviews'], 
                        'ratemds':['reviews'], 
                        'zocdoc':['reviews'], 
                        'yelp':['reviews_detailed', 'blocked_reviews', 'removed_reviews']}


## Reviews: Physician's Profile Score, Rating Num, and Review Num

In [None]:



#################################### Review's Reported Total Score
# Website name -> {review column -> raw column copied into
# 'reported_profile_score'}.
# NOTE(review): presumably the overall score shown on the physician's profile
# page - confirm against the crawler output.
WebsiteNames2ReviewsReportedProfileScore = {'healthgrades':{'reviews':'actualScore'}, 
                                            'vitals':{'reviews':'rating_score'}, 
                                            'ratemds':{'reviews':'ratingValue'}, 
                                            'zocdoc':{'reviews':'averageRating'},
                                            'yelp':{'reviews_detailed':'rating', 
                                                    'blocked_reviews':'rating', 
                                                    'removed_reviews': 'rating'}}



#################################### Review's Reported Rate
# Website name -> {review column -> raw column copied into
# 'reported_ratings_num'}.
# NOTE(review): zocdoc and yelp use the same columns here as in
# WebsiteNames2ReviewsReportedReviewNum - presumably those sites report no
# separate rating count; confirm.
WebsiteNames2ReviewsReportedRatingNum = {'healthgrades':{'reviews':'responseCount'}, 
                                            'vitals':{'reviews':'number_of_ratings'},
                                            'ratemds':{'reviews':'ratingCount'}, 
                                            'zocdoc':{'reviews':'reviewCount'}, 
                                            'yelp':{'reviews_detailed':'reviewCount', 
                                                    'blocked_reviews':'blocked_reviews_num', 
                                                    'removed_reviews': 'removed_reviews_num'}}



#################################### Review's Reported Number
# Website name -> {review column -> raw column copied into
# 'reported_reviews_num'}.
# NOTE(review): ratemds maps to 'ratingCount', the same column used for its
# rating count - confirm that this is intended.
WebsiteNames2ReviewsReportedReviewNum = {'healthgrades':{'reviews':'reviewCount'}, 
                                            'vitals':{'reviews':'review_count'},
                                            'ratemds':{'reviews':'ratingCount'}, 
                                            'zocdoc':{'reviews':'reviewCount'}, 
                                            'yelp':{'reviews_detailed':'reviewCount', 
                                                    'blocked_reviews':'blocked_reviews_num', 
                                                    'removed_reviews': 'removed_reviews_num'}}



## Review Function: Standardize Reviews (Time, Content, Score)

### HealthGrades

In [None]:
import json

def standardize_reviews_healthgrades(x):
    """Add the standard review keys to raw Healthgrades review dicts.

    Every review with a non-empty ``submittedDate`` is updated IN PLACE with
    ``ReviewDate``, ``ReviewScore`` (from ``overallScore``), ``ReviewText``
    (from ``commentText``, ``''`` when absent) and ``ReviewSource``.
    Reviews without a usable date are reported on stdout and dropped.

    Parameters
    ----------
    x : list of dict
        Raw reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    KeyError
        If a kept review has no ``overallScore``.
    AssertionError
        If ``commentText`` is present but not a ``str``.
    """
    new_x = []
    for i in x:
        if 'submittedDate' not in i:
            print('bad i', i)
            continue

        date = i['submittedDate']
        # Truthiness rather than len(): a None date is dropped like an empty
        # one instead of raising TypeError.
        if not date:
            print('Weird Data: --->', date, i)
            continue

        i['ReviewDate'] = date
        i['ReviewScore'] = i['overallScore']

        if 'commentText' in i:
            i['ReviewText'] = i['commentText']
        else:
            print('\nNo ReviewText--->', i)
            i['ReviewText'] = ''

        i['ReviewSource'] = 'healthgrades'
        assert type(i['ReviewText']) == str
        new_x.append(i)

    if len(new_x) < len(x):
        print('\n\nfrom {} to {}\n\n'.format(len(x), len(new_x)))

    return new_x


### Vitals

In [None]:

def standardize_reviews_vitals(x):
    """Add the standard review keys to raw Vitals review dicts.

    Two raw layouts are handled, told apart by their date key:

    * ``updated_at_dt``: score from ``overallrating_f``; text from the
      ``Comments`` entry of the JSON string in ``review_t``, falling back to
      ``general_comment``.
    * ``review_date``: score from ``overall_rating``; text from
      ``general_comment``.

    In both layouts a missing or non-string text becomes ``''``.  Kept
    reviews are updated IN PLACE with ``ReviewDate``, ``ReviewScore``,
    ``ReviewText`` and ``ReviewSource``.  Reviews without a usable date are
    reported on stdout and dropped.

    Parameters
    ----------
    x : list of dict
        Raw reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    KeyError
        If a kept review lacks its score key.
    AssertionError
        If ``review_t`` does not decode to a JSON object.
    """
    new_x = []
    for i in x:
        if 'updated_at_dt' in i:
            date = i['updated_at_dt']
            # Truthiness rather than len(): None is dropped, not a TypeError.
            if not date:
                print('Wierd Data: --->', date, i)
                continue

            i['ReviewDate'] = date
            i['ReviewScore'] = i['overallrating_f']

            # Preferred text source: the 'Comments' field inside review_t.
            ReviewText = None
            if 'review_t' in i:
                review_tmp = json.loads(i['review_t'])
                assert type(review_tmp) == dict
                if 'Comments' in review_tmp:
                    ReviewText = review_tmp['Comments']

            # Fallback text source.
            if ReviewText is None:
                ReviewText = i.get('general_comment', '')

            if type(ReviewText) != str:
                print('ReviewText is not string', ReviewText, i)
                ReviewText = ''

            i['ReviewText'] = ReviewText
            i['ReviewSource'] = 'vitals'
            new_x.append(i)

        elif 'review_date' in i:
            date = i['review_date']
            if not date:
                print('Wierd Data: --->', date, i)
                continue

            i['ReviewDate'] = date

            # Print the offending review before the lookup below raises.
            if 'overall_rating' not in i:
                print(i)
            i['ReviewScore'] = i['overall_rating']

            # Same tolerance as the other layout: a missing or non-string
            # comment is stored as '' instead of aborting the whole run.
            ReviewText = i.get('general_comment', '')
            if type(ReviewText) != str:
                print('ReviewText is not string', ReviewText, i)
                ReviewText = ''

            i['ReviewText'] = ReviewText
            i['ReviewSource'] = 'vitals'
            new_x.append(i)

        else:
            print('bad i', i)

    if len(new_x) < len(x):
        print('\n\nfrom {} to {}\n\n'.format(len(x), len(new_x)))

    return new_x


### RateMDs

In [None]:
def standardize_reviews_ratemds(x):
    """Add the standard review keys to raw RateMDs review dicts.

    Every review with a non-empty ``created`` value is updated IN PLACE with
    ``ReviewDate``, ``ReviewScore`` (from ``average``), ``ReviewText`` (from
    ``comment``, ``''`` when absent) and ``ReviewSource``.  Reviews without
    a usable date are reported on stdout and dropped.

    Parameters
    ----------
    x : list of dict
        Raw reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    KeyError
        If a kept review has no ``average``.
    AssertionError
        If ``comment`` is present but not a ``str``.
    """
    new_x = []
    for i in x:
        if 'created' not in i:
            print('bad i', i)
            continue

        date = i['created']
        # Truthiness rather than len(): a None date is dropped like an empty
        # one instead of raising TypeError.
        if not date:
            print('Weird Data: --->', date, i)
            continue

        i['ReviewDate'] = date
        i['ReviewScore'] = i['average']

        if 'comment' in i:
            i['ReviewText'] = i['comment']
        else:
            print('No comment in this review', i)
            i['ReviewText'] = ''

        assert type(i['ReviewText']) == str
        i['ReviewSource'] = 'ratemds'
        new_x.append(i)

    if len(new_x) < len(x):
        print('\n\nfrom {} to {}\n\n'.format(len(x), len(new_x)))

    return new_x


### ZocDoc

In [None]:
def standardize_reviews_zocdoc(x):
    """Add the standard review keys to raw ZocDoc review dicts.

    The review date is taken from ``date``; when that is empty the value of
    ``dateRange`` (if present) is used instead.  Kept reviews are updated IN
    PLACE with ``ReviewDate``, ``ReviewScore`` (from ``overallRating``),
    ``ReviewText`` (from ``comment``; ``''`` when absent or not a string)
    and ``ReviewSource``.  Reviews without a usable date are reported on
    stdout and dropped.

    Parameters
    ----------
    x : list of dict
        Raw reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    KeyError
        If a kept review has no ``overallRating``.
    """
    new_x = []
    for i in x:
        if 'date' not in i:
            print('bad i', i)
            continue

        date = i['date']
        # Truthiness rather than len(): a None date now reaches the
        # 'dateRange' fallback instead of raising TypeError.
        if not date and 'dateRange' in i:
            date = i['dateRange']

        if not date:
            print('Weird Data: --->', date, i)
            continue

        i['ReviewDate'] = date
        i['ReviewScore'] = i['overallRating']

        if 'comment' in i:
            i['ReviewText'] = i['comment']
        else:
            i['ReviewText'] = ''
            print('No review_t or general comment, but still valid: -->', i)

        # Non-string comments (e.g. None) are silently stored as ''.
        if type(i['ReviewText']) != str:
            i['ReviewText'] = ''

        i['ReviewSource'] = 'zocdoc'
        new_x.append(i)

    if len(new_x) < len(x):
        print('\n\nfrom {} to {}\n\n'.format(len(x), len(new_x)))

    return new_x

### Yelp

In [None]:

def standardize_reviews_yelp(x):
    """Add the standard review keys to raw Yelp review dicts.

    Every review with a non-empty ``localizedDate`` is updated IN PLACE with
    ``ReviewDate``, ``ReviewScore`` (from ``rating``), ``ReviewText`` (the
    ``text`` entry of the ``comment`` dict, ``''`` when unavailable) and
    ``ReviewSource``.  Reviews without a usable date are reported on stdout
    and dropped.

    Parameters
    ----------
    x : list of dict
        Raw reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    KeyError
        If a kept review has no ``rating``.
    AssertionError
        If ``comment['text']`` is present but not a ``str``.
    """
    new_x = []
    for i in x:
        if 'localizedDate' not in i:
            print('bad i', i)
            continue

        date = i['localizedDate']
        # Truthiness rather than len(): a None date is dropped like an empty
        # one instead of raising TypeError.
        if not date:
            print('Weird Data: --->', date, i)
            continue

        i['ReviewDate'] = date
        i['ReviewScore'] = i['rating']

        if 'comment' in i:
            comment = i['comment']
            # Only a dict can carry 'text'; None or any other shape counts as
            # "no text" rather than raising on the membership test.
            if isinstance(comment, dict) and 'text' in comment:
                i['ReviewText'] = comment['text']
            else:
                i['ReviewText'] = ''
        else:
            i['ReviewText'] = ''
            print('No review_t or general comment, but still valid: -->', i)

        assert type(i['ReviewText']) == str
        i['ReviewSource'] = 'yelp'
        new_x.append(i)

    if len(new_x) < len(x):
        print('\n\nfrom {} to {}\n\n'.format(len(x), len(new_x)))

    return new_x
    

### Yelp Discarded

In [None]:

def standardize_reviews_yelp_discarded(x):
    """Add the standard review keys to Yelp blocked/removed review dicts.

    Every review with a non-empty ``date`` is updated IN PLACE with
    ``ReviewDate``, ``ReviewScore``, ``ReviewText`` (from ``comment``,
    ``''`` when absent) and ``ReviewSource`` (``'yelp_removed'``).  The
    score is the number in front of the word ``'star'`` in ``rate_score``
    (e.g. ``'4.0 star rating'`` -> ``4.0``) and ``None`` when it cannot be
    parsed.  Reviews without a usable date are dropped silently; reviews
    with no ``date`` key at all are reported on stdout and dropped.

    Parameters
    ----------
    x : list of dict
        Raw blocked or removed reviews of one physician.

    Returns
    -------
    list of dict
        The kept review dicts (the same objects as in ``x``).

    Raises
    ------
    AssertionError
        If ``comment`` is present but not a ``str``.
    """
    new_x = []
    for i in x:
        if 'date' not in i:
            print('bad i', i)
            continue

        date = i['date']
        # Covers both None and empty values.
        if not date:
            continue

        i['ReviewDate'] = date

        try:
            i['ReviewScore'] = float(i['rate_score'].split('star')[0])
        except (KeyError, AttributeError, TypeError, ValueError):
            # Missing key, non-string value, or no number before 'star'.
            i['ReviewScore'] = None

        if 'comment' in i:
            i['ReviewText'] = i['comment']
        else:
            i['ReviewText'] = ''
            print('No review_t or general comment, but still valid: -->', i)

        assert type(i['ReviewText']) == str
        i['ReviewSource'] = 'yelp_removed'
        new_x.append(i)

    return new_x
    

### Standardize Review Functions


Standardized reviews: each review has the keys `ReviewDate`, `ReviewScore`, `ReviewText`, and `ReviewSource`.

The values stored under these keys may still be invalid.

The next step is to filter for the valid reviews.

In [None]:
# Website name -> {review column -> function that standardizes the list of raw
# reviews stored in one cell of that column}. Both discarded-review columns of
# yelp (blocked and removed) share one function.
WebsiteNames2ValidFilterFunction = {'healthgrades':{'reviews':standardize_reviews_healthgrades}, 
                                    'vitals':{'reviews':standardize_reviews_vitals}, 
                                    'ratemds':{'reviews':standardize_reviews_ratemds}, 
                                    'zocdoc':{'reviews':standardize_reviews_zocdoc},
                                    'yelp':{'reviews_detailed':standardize_reviews_yelp, 
                                            'blocked_reviews':standardize_reviews_yelp_discarded, 
                                            'removed_reviews': standardize_reviews_yelp_discarded}}

def compress_review_info(x):
    """Reduce standardized reviews to their date, score, source and text length.

    Parameters
    ----------
    x : iterable of dict
        Reviews carrying ``ReviewDate``, ``ReviewScore``, ``ReviewSource``
        and ``ReviewText``.

    Returns
    -------
    list of dict
        One new dict per review with the keys ``ReviewDate``,
        ``ReviewScore``, ``ReviewSource`` and ``ReviewTextLen`` (the length
        of ``ReviewText``).  Reviews that cannot be compressed are printed
        and skipped.
    """
    new_x = []
    for i in x:
        try:
            d_i = {key: i[key] for key in ('ReviewDate', 'ReviewScore', 'ReviewSource')}
            d_i['ReviewTextLen'] = len(i['ReviewText'])
        except Exception:
            # Best-effort: report the malformed review and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C still interrupt.
            print(i)
        else:
            new_x.append(d_i)
    return new_x


# Standard Whole Information

In [None]:
# Suffix appended to every output pickle name:
# 'Output/<website>_<review column>_<suffix>.p'.
suffix = 'stand'

# Websites to process; the commented-out lists are subsets kept for partial
# re-runs.
WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'zocdoc', 'yelp']
# WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'yelp']
# WebsiteNames = [ 'vitals', 'ratemds', 'zocdoc', 'yelp']
# WebsiteNames = ['healthgrades']
# WebsiteNames = ['vitals']
# Only physicians whose NPI is in this list are kept. MDDO_NPI_included is
# not defined in this cell and must already exist in the session.
selectedNPIs = MDDO_NPI_included

# For every website and every review column of that website: load each crawl
# sub-folder, keep the selected physicians, derive the standardized columns,
# and pickle the concatenation of all sub-folders to
# 'Output/<website>_<review column>_<suffix>.p'.
for name in WebsiteNames:
    folder = os.path.join('Output', name)
    # Entries of Output/<website> whose name contains the website name
    # (macOS '.DS*' entries excluded).
    subfolders = [os.path.join(folder, i) for i in os.listdir(folder) if name in i and '.DS' not in i]
    subfolders.sort()
    

    # Website-specific column names and helper functions.
    npi_col = WebsiteNames2NPI[name]
    name_col = WebsiteNames2FullName[name]
    review_cols = WebsiteNames2Reviews[name]
    graudation_col, gradyear_func =  WebsiteNames2Graudation[name]
    
    # NOTE: every sub-folder is re-read from disk once per review column
    # (three times for yelp).
    for review_col in review_cols:
        print('\n\nnumber of subfolders for {}: {}'.format(name, len(subfolders)))
        print('Website: {}; Review: {}'.format(name, review_col))

        reported_profile_score_col = WebsiteNames2ReviewsReportedProfileScore[name][review_col]
        
        reported_ratings_num_col = WebsiteNames2ReviewsReportedRatingNum[name][review_col]
        
        reported_reviews_num_col = WebsiteNames2ReviewsReportedReviewNum[name][review_col]
        
        
        # Function that standardizes the raw reviews of one cell and drops
        # the ones without a usable date.
        filter_valid_func = WebsiteNames2ValidFilterFunction[name][review_col]
        
        WebInfo = []
        for folder2rd in subfolders:
            # rawlen: number of rows before the NPI filter (printed below).
            DF = get_df_from_folder2rd(folder2rd); rawlen = len(DF)
            
            
            # NPIs that cannot be parsed become 0 (see change_npi_to_int).
            DF[npi_col] = DF[npi_col].apply(change_npi_to_int)
            
            
            # Keep only the selected physicians.
            DF = DF[DF[npi_col].isin(selectedNPIs)].reset_index(drop = True)
            DF['NPI'] = DF[npi_col]
            
            DF['Name'] = DF[name_col]
            
            DF['GraduationYear'] = DF[graudation_col].apply(gradyear_func)
            
            
            
            # Score and counts taken from the website-specific columns.
            DF['reported_profile_score'] = DF[reported_profile_score_col]
            
            DF['reported_ratings_num'] = DF[reported_ratings_num_col]
            DF['reported_reviews_num'] = DF[reported_reviews_num_col]
            
            # Derived review info.
            # NEW (important): number of reviews collected per row, counted
            # before the standardization below drops any review.
            DF['collected_reviews_num'] = DF[review_col].apply(lambda x:len(x))
            
            # The standardize functions add keys to the review dicts in place,
            # so the dicts stored in DF[review_col] are modified as well.
            DF['valid_reviews'] = DF[review_col].apply(filter_valid_func)
            DF['stand_reviews'] = DF['valid_reviews'].apply(compress_review_info)
            
            DF['stand_reviews_num'] = DF['stand_reviews'].apply(lambda x:len(x))
            
            # 'clct_time' and 'url' are not created above; they must already
            # be columns of the loaded DataFrame.
            cols = ['NPI', 'Name', 'GraduationYear', 
                    'reported_profile_score',
                    'reported_ratings_num', 
                    'reported_reviews_num', 
                    'collected_reviews_num', 
                    
                    'stand_reviews', 'stand_reviews_num',
                    'clct_time', 'url'
                   ]
            
            DF = DF[cols]
            WebInfo.append(DF)
            # Sub-folder, row count before the NPI filter, row count after.
            print(folder2rd, '\t\t', rawlen, len(DF))
            # break
           
        full_name = '{}_{}'.format(name, review_col)
        # NOTE(review): pd.concat raises if WebInfo is empty, i.e. when no
        # sub-folder matched for this website.
        FullDF = pd.concat(WebInfo).reset_index(drop = True)
        
        # path = 'Output/{}.p'.format(full_name)
        path = 'Output/{}_{}.p'.format(full_name, suffix)
        print('\n\n\n{}\n\n\n'.format(path))
        
        FullDF.to_pickle(path)
        