# Get NPPES 

In [None]:
import pandas as pd
import os

# Load the NPPES provider table from its pickle.
# NOTE(review): the file name suggests an NPI -> info table restricted to MD/DO
# providers — confirm against whatever produced the pickle.
NPPES = pd.read_pickle('Data/NPPES/NPI2InfoMDDO.p')
# Keep only the identifier, name and gender columns.
NPPESNames = NPPES[['NPI', 'LastName', 'FirstName', 'Gender']]
# Distinct NPIs present in NPPES; used later to flag website NPIs that have no
# NPPES record.
nppes_set = set(NPPESNames['NPI'].unique())
print(NPPESNames.shape)

In [None]:
# Suffix inserted into every review pickle's file name.
suffix = '_stand'

# Review website -> file name of its review pickle. The names are relative:
# the functions that consume this mapping join them onto 'Data/Reviews'.
website2path = {
    site: template.format(suffix)
    for site, template in (
        ('healthgrades', 'healthgrades_reviews{}.p'),
        ('vitals', 'vitals_reviews{}.p'),
        ('ratemds', 'ratemds_reviews{}.p'),
        ('yelp', 'yelp_reviews_detailed{}.p'),
        ('zocdoc', 'zocdoc_reviews{}.p'),
    )
}

# Get IgnoreNPI List

In [None]:
def get_website2ignorenpis(website2path, nppes_set):
    """Build, for every website, the list of NPIs that must be ignored.

    Each list is the concatenation of
      1. the curated ignore list read from ``Data/ToIgnore/*.csv`` (only
         ratemds and yelp have one; the other sites start empty), and
      2. every NPI found in the site's review pickle that is absent from
         ``nppes_set``.

    The two parts are concatenated without de-duplication, so an NPI that is
    both curated and unmatched appears twice.

    Args:
        website2path: mapping of website name -> review pickle file name,
            relative to ``Data/Reviews``. Each pickle must hold a DataFrame
            with an ``'NPI'`` column.
        nppes_set: collection of NPIs known to NPPES.

    Returns:
        dict mapping website name -> list of NPIs to ignore. The five known
        sites are always present; any additional site in ``website2path``
        gets an entry too (it used to raise ``KeyError``).
    """
    # Sites with a manually curated ignore file.
    curated_csv = {
        'ratemds': 'Data/ToIgnore/ToIgnoreRateMDs.csv',
        'yelp': 'Data/ToIgnore/ToIgnoreYelp.csv',
    }

    # Start every known site with an empty list (same key order as before).
    website2ignorenpis = {
        'healthgrades': [],
        'vitals': [],
        'ratemds': [],
        'zocdoc': [],
        'yelp': [],
    }
    for site, csv_path in curated_csv.items():
        website2ignorenpis[site] = pd.read_csv(csv_path)['NPI'].to_list()

    # match with NPPES
    for name, path in website2path.items():
        # NOTE: read_pickle executes arbitrary code embedded in the file; only
        # use it on pickles produced by this project.
        df = pd.read_pickle(os.path.join('Data/Reviews', path))
        # .difference() accepts any iterable, so nppes_set need not be a set.
        unmatched = list(set(df['NPI']).difference(nppes_set))
        # .get() lets sites outside the hard-coded table start from an empty
        # curated list instead of raising KeyError.
        website2ignorenpis[name] = website2ignorenpis.get(name, []) + unmatched

    return website2ignorenpis


# Build the per-website ignore lists once; get_website2validphyDF reads them
# in the cells below.
website2ignorenpis = get_website2ignorenpis(website2path, nppes_set)

# Filter Physicians

In [None]:
def get_website2validphyDF(website2path, website2ignorenpis):
    """Load each website's physician table and keep only valid physicians.

    For every website the pickle is loaded and filtered in three steps:
      1. keep the first row for each NPI,
      2. drop rows whose NPI is in the website's ignore list,
      3. drop rows whose ``reported_profile_score`` (cast to float) is
         greater than 5 or less than 0. NaN scores are kept, because both
         comparisons are False for NaN.

    Args:
        website2path: mapping of website name -> pickle file name, relative
            to ``Data/Reviews``. Each pickle must hold a DataFrame with
            ``'NPI'`` and ``'reported_profile_score'`` columns.
        website2ignorenpis: mapping of website name -> collection of NPIs to
            drop. Must contain every key of ``website2path``.

    Returns:
        (website2df, PhysicianReport): the filtered DataFrame per website
        (index reset), and a DataFrame with one row per website and the
        columns ``name``, ``RawNum`` (rows loaded), ``UnqNum`` (after
        de-duplication), ``ToIgnore`` (length of the ignore list),
        ``MchNum`` (after the ignore filter) and ``FnlNum`` (after the
        score filter).
    """
    Report = []
    website2df = {}
    for name, path in website2path.items():
        d = {'name': name}

        # input data
        # NOTE: read_pickle executes arbitrary code embedded in the file; only
        # use it on pickles produced by this project.
        df = pd.read_pickle(os.path.join('Data/Reviews', path))
        d['RawNum'] = len(df)

        # drop duplicates: keep the first row per NPI. drop_duplicates works
        # positionally, so it stays correct when the stored index has
        # repeated labels (selecting with .loc[labels] would return every row
        # sharing a label and leave duplicates in place).
        df = df.drop_duplicates(subset='NPI').reset_index(drop=True)
        d['UnqNum'] = len(df)

        # match or not
        ignore_npis = website2ignorenpis[name]
        d['ToIgnore'] = len(ignore_npis)
        df = df[~df['NPI'].isin(ignore_npis)].reset_index(drop=True)
        d['MchNum'] = len(df)

        # valid profile score or not
        score = df['reported_profile_score'].astype(float)
        df['reported_profile_score'] = score
        out_of_range = (score > 5) | (score < 0)
        df = df[~out_of_range].reset_index(drop=True)
        website2df[name] = df
        d['FnlNum'] = len(df)
        Report.append(d)

    PhysicianReport = pd.DataFrame(Report)
    return website2df, PhysicianReport

# Run the physician-level filter and save the per-website counts.
website2validphy, PhysicianReport = get_website2validphyDF(website2path, website2ignorenpis)
PhysicianReport.to_csv('Output/PhysicianReport.csv', index = False)
# Bare expression: shows the report as the cell output when run in a notebook.
PhysicianReport

# Filter Reviews

Each review is checked on three elements:

1. `Tx`: text length
2. `Ts`: time (the review date)
3. `Sc`: score

In [None]:
# Only reviews dated strictly before this UTC instant are kept as final reviews.
# `utc` is a boolean flag of pandas.to_datetime; the previous value 'UTC' only
# worked because a non-empty string is truthy.
CUTOFF_DATE = pd.to_datetime('2021-08-01', utc=True)

In [None]:
def filter_ValidTimeScore_Reviews(x):
    """Split reviews into those with a valid date and score and the rest.

    A review is good when its ``'ReviewDate'`` parses to a non-null UTC
    timestamp and its ``'ReviewScore'`` converts to a float in ``[0, 5]``.
    Reviews whose fields are missing or cannot be converted are bad.

    The review dicts are modified in place: ``'ReviewDate'`` is replaced by
    the parsed value and ``'ReviewScore'`` by the float, as far as conversion
    got before any failure.

    Args:
        x: iterable of review dicts.

    Returns:
        (new_x, bad_x): list of good reviews and list of bad reviews, each in
        input order.
    """
    new_x = []
    bad_x = []
    for review in x:
        is_good = False
        try:
            # datetime validation (this step costs time)
            date = pd.to_datetime(review['ReviewDate'], utc=True)
            review['ReviewDate'] = date

            # review score
            score = float(review['ReviewScore'])
            review['ReviewScore'] = score

            if not (0 <= score <= 5):
                # Out-of-range scores are printed for inspection.
                print(review)
            elif not pd.isna(date):
                # A missing date does not raise in to_datetime; it yields a
                # null value that cannot be compared with a cutoff date, so
                # it must not count as a valid timestamp.
                is_good = True
        except Exception:
            # Best-effort validation: any conversion failure marks the review
            # as bad. Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        if is_good:
            new_x.append(review)
        else:
            bad_x.append(review)

    return new_x, bad_x


def get_cutoff_reviews(x, cutoff_date=None):
    """Return the reviews in ``x`` dated strictly before the cutoff.

    Args:
        x: iterable of review dicts whose ``'ReviewDate'`` is comparable with
            the cutoff.
        cutoff_date: exclusive upper bound for ``'ReviewDate'``. Defaults to
            the module-level ``CUTOFF_DATE`` so existing one-argument calls
            (e.g. ``Series.apply(get_cutoff_reviews)``) behave as before.

    Returns:
        list of the reviews that pass, in input order.
    """
    if cutoff_date is None:
        cutoff_date = CUTOFF_DATE
    return [i for i in x if i['ReviewDate'] < cutoff_date]


In [None]:
# good reviews: pass (1) Tx, (2) Ts, and (3) Sc
# final reviews: before 2021.8.1


def get_website2validreviewDF(website2validphy, MIN_TEXT_LENGTH, CUTOFF_DATE):
    """Filter every physician's reviews and summarise the counts per website.

    For each website DataFrame the following columns are added IN PLACE (the
    frames in ``website2validphy`` are the same objects that are returned):

      * ``stdTx<N>_reviews`` / ``stdTx<N>_reviews_num``: reviews from
        ``stand_reviews`` whose ``ReviewTextLen`` is >= ``MIN_TEXT_LENGTH``
        (``<N>`` is ``MIN_TEXT_LENGTH``),
      * ``good_review`` / ``good_review_num``: of those, the reviews accepted
        by ``filter_ValidTimeScore_Reviews``,
      * ``final_review`` / ``final_review_num``: of those, the reviews dated
        strictly before ``CUTOFF_DATE``,
      * ``final_review_date``, ``final_review_score``,
        ``final_review_length``: per-physician lists taken from the final
        reviews.

    A pickle with the columns ``NPI``, ``GraduationYear``,
    ``reported_profile_score`` and ``final_review`` is written to
    ``Output/MinText<N>/<website>.p`` for every website; the directory is
    created if it does not exist.

    Args:
        website2validphy: mapping of website name -> physician DataFrame.
            The code reads the columns ``stand_reviews``,
            ``reported_ratings_num``, ``reported_reviews_num``,
            ``collected_reviews_num``, ``stand_reviews_num`` and the export
            columns listed above.
        MIN_TEXT_LENGTH: minimum ``ReviewTextLen`` a review needs to be kept.
        CUTOFF_DATE: exclusive upper bound for ``ReviewDate``. This argument
            is now honoured; previously the module-level constant of the same
            name was used regardless of what was passed.

    Returns:
        (website2validrvw, ReviewReport): the augmented DataFrame per
        website, and a DataFrame with one row of summed counts per website.
    """
    tx_col = 'stdTx{}_reviews'.format(MIN_TEXT_LENGTH)
    tx_num_col = 'stdTx{}_reviews_num'.format(MIN_TEXT_LENGTH)

    website2validrvw = {}
    for website, df in website2validphy.items():
        # Filter Review (1) Tx: Text Length
        df[tx_col] = df['stand_reviews'].apply(
            lambda x: [i for i in x if i['ReviewTextLen'] >= MIN_TEXT_LENGTH])
        df[tx_num_col] = df[tx_col].apply(len)

        # Filter Review (2) Ts: Time and (3) Sc: Score
        # Each element of tmp is a (good, bad) pair; only the good list is kept.
        tmp = df[tx_col].apply(filter_ValidTimeScore_Reviews)
        df['good_review'] = [i[0] for i in tmp.values]
        df['good_review_num'] = df['good_review'].apply(len)

        # Filter Time: only here, as the timestamp is valid here.
        df['final_review'] = df['good_review'].apply(
            lambda x: [i for i in x if i['ReviewDate'] < CUTOFF_DATE])

        # Final Info
        df['final_review_num'] = df['final_review'].apply(len)
        df['final_review_date'] = df['final_review'].apply(
            lambda x: [i['ReviewDate'] for i in x])
        df['final_review_score'] = df['final_review'].apply(
            lambda x: [i['ReviewScore'] for i in x])
        df['final_review_length'] = df['final_review'].apply(
            lambda x: [i['ReviewTextLen'] for i in x])
        website2validrvw[website] = df

    # Make sure the export directory exists before the first to_pickle call.
    out_dir = 'Output/MinText{}'.format(MIN_TEXT_LENGTH)
    os.makedirs(out_dir, exist_ok=True)

    # Only these columns are exported; the intermediate review columns stay
    # in the returned frames.
    cols = ['NPI', 'GraduationYear', 'reported_profile_score', 'final_review']

    L = []
    for name, df in website2validrvw.items():
        d = {}
        d['name'] = name
        d['rptratings'] = df['reported_ratings_num'].sum()
        d['rptreviews'] = df['reported_reviews_num'].sum()
        d['cltreviews'] = df['collected_reviews_num'].sum()
        d['stdreviews'] = df['stand_reviews_num'].sum()
        d['stdTx{}reviews'.format(MIN_TEXT_LENGTH)] = df[tx_num_col].sum()
        d['goodreviews'] = df['good_review_num'].sum()
        d['finalreviews'] = df['final_review_num'].sum()
        d['phynum'] = len(df)
        # Physicians that still have at least one final review.
        d['phynumTxTsScAug'] = (df['final_review_num'] > 0).sum()
        L.append(d)

        df[cols].to_pickle('{}/{}.p'.format(out_dir, name))

    ReviewReport = pd.DataFrame(L)
    return website2validrvw, ReviewReport



## All Reviews

In [None]:
# Threshold 0: no review is dropped for its text length.
MIN_TEXT_LENGTH = 0

# The physician frames are rebuilt from the pickles before the review filter
# runs. NOTE(review): presumably because get_website2validreviewDF adds
# columns to the frames it receives — confirm. PhyReport is not used further
# in this cell.
website2validphy, PhyReport = get_website2validphyDF(website2path, website2ignorenpis)
website2validrvw, RvwReport = get_website2validreviewDF(website2validphy, MIN_TEXT_LENGTH, CUTOFF_DATE)

# Save the review counts in the same Output/MinText<N> directory that
# get_website2validreviewDF writes its per-website pickles to.
RvwReport.to_csv('Output/MinText{}/ReviewReport.csv'.format(MIN_TEXT_LENGTH), index = False)
# Bare expression: shows the report as the cell output when run in a notebook.
RvwReport

## Text Reviews

In [None]:
# Threshold 1: only reviews whose ReviewTextLen is at least 1 are kept.
MIN_TEXT_LENGTH = 1

# Same pipeline as the previous cell, rerun from freshly loaded physician
# frames with the new threshold. PhyReport is not used further in this cell.
website2validphy, PhyReport = get_website2validphyDF(website2path, website2ignorenpis)
website2validrvw, RvwReport = get_website2validreviewDF(website2validphy, MIN_TEXT_LENGTH, CUTOFF_DATE)

# Save the review counts in the Output/MinText<N> directory for this threshold.
RvwReport.to_csv('Output/MinText{}/ReviewReport.csv'.format(MIN_TEXT_LENGTH), index = False)
# Bare expression: shows the report as the cell output when run in a notebook.
RvwReport