# Functions

In [1]:
import pandas as pd
import os
pd.set_option('display.max_columns', None)



def get_df_from_folder2rd(subfolder):
    """Load every pickled DataFrame in ``subfolder`` and stack them row-wise.

    Files whose name contains ``'errorlog'`` or ``'.DS_Store'`` are ignored.
    A file that cannot be unpickled is reported on stdout and skipped, so a
    single corrupt chunk does not abort the whole load (best effort).

    Parameters
    ----------
    subfolder : str
        Directory that holds the pickle files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all readable files with a fresh ``0..n-1`` index.
        An empty DataFrame is returned when no file could be read.
    """
    # NOTE(security): pd.read_pickle can execute arbitrary code while loading;
    # only point this at folders produced by a trusted pipeline.
    # Sorting makes the row order reproducible (os.listdir order is arbitrary).
    filenames = sorted(
        os.path.join(subfolder, entry)
        for entry in os.listdir(subfolder)
        if 'errorlog' not in entry and '.DS_Store' not in entry
    )

    frames = []
    for filename in filenames:
        try:
            frames.append(pd.read_pickle(filename))
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C still interrupts the load.
            print('Error File', os.path.basename(filename))

    if not frames:
        # pd.concat([]) raises ValueError; hand back an empty frame instead.
        return pd.DataFrame()

    return pd.concat(frames).reset_index(drop=True)



def review_info(subfolder, review_col):
    """Count the reviews stored per row of a scraped subfolder and summarise them.

    Parameters
    ----------
    subfolder : str
        Directory passed to ``get_df_from_folder2rd``.
    review_col : str
        Column whose cells are sized with ``len`` (one collection per row).

    Returns
    -------
    tuple
        ``(DF, d, s)`` where ``DF`` is the loaded frame with an added
        ``'ReviewCountClt'`` column, ``d`` is a dict with the keys
        ``'subfolder'``, ``'total_reviews'``, ``'physician'`` (row count) and
        ``'physician_of_reviews'`` (rows with at least one review), and ``s``
        is the ``'ReviewCountClt'`` column itself.
    """
    DF = get_df_from_folder2rd(subfolder)
    DF['ReviewCountClt'] = DF[review_col].apply(len)
    s = DF['ReviewCountClt']

    d = {}
    d['subfolder'] = subfolder.split('/')[-1]
    d['total_reviews'] = s.sum()
    d['physician'] = len(DF)
    d['physician_of_reviews'] = (s > 0).sum()
    return DF, d, s


def change_npi_to_int(x):
    """Coerce an NPI value to ``int``, returning ``0`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw NPI value (e.g. a string, float or ``None``).

    Returns
    -------
    int
        ``int(x)`` on success, otherwise the sentinel ``0``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: non-numeric text or NaN;
        # OverflowError: infinity. Anything else (e.g. KeyboardInterrupt) propagates.
        return 0



# WebURL Information

In [2]:
# Sites whose collected files are checked for NPI coverage.
WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'zocdoc', 'yelp']

# Two reference NPI lists: one read from a CSV, one from a pickled frame.
doj_path = 'Data/NPI_included.csv'
mddo_path = 'Data/NPPESMDDOPhysician.p'
DOJ_NPI_included = pd.read_csv(doj_path)['NPI'].to_list()
MDDO_NPI_included = pd.read_pickle(mddo_path)['NPI'].to_list()

print('Number of DOJ:    {}'.format(len(DOJ_NPI_included)))
print('Number of MDDO:   {}'.format(len(MDDO_NPI_included)))

# (message template, reference list) pairs reported for every site, in this order.
coverage_checks = (
    ('number of DOJ  NPIs found in {}: {} /{}', DOJ_NPI_included),
    ('number of MDDO NPIs found in {}: {} /{}', MDDO_NPI_included),
)

for name in WebsiteNames:
    inputfolder = os.path.join('Data', name)
    # Every entry of the site folder whose name contains '.p', in sorted order.
    files = sorted(
        os.path.join(inputfolder, entry)
        for entry in os.listdir(inputfolder)
        if '.p' in entry
    )
    # Only the de-duplicated NPI column is needed from the stacked frames.
    WebsiteKeywords = pd.concat([pd.read_pickle(path) for path in files])['NPI'].drop_duplicates()

    print('\nFor Website {}, unique NPIs Num: {}'.format(name.upper(), len(WebsiteKeywords)))

    for template, reference in coverage_checks:
        d = WebsiteKeywords[WebsiteKeywords.isin(reference)]
        print(template.format(name, len(d), len(reference)))


Number of DOJ:    944
Number of MDDO:   1141176

For Website HEALTHGRADES, unique NPIs Num: 1006690
number of DOJ  NPIs found in healthgrades: 634 /944
number of MDDO NPIs found in healthgrades: 735850 /1141176

For Website VITALS, unique NPIs Num: 861412
number of DOJ  NPIs found in vitals: 704 /944
number of MDDO NPIs found in vitals: 772471 /1141176

For Website RATEMDS, unique NPIs Num: 221731
number of DOJ  NPIs found in ratemds: 260 /944
number of MDDO NPIs found in ratemds: 178878 /1141176

For Website ZOCDOC, unique NPIs Num: 241137
number of DOJ  NPIs found in zocdoc: 185 /944
number of MDDO NPIs found in zocdoc: 179059 /1141176

For Website YELP, unique NPIs Num: 60332
number of DOJ  NPIs found in yelp: 46 /944
number of MDDO NPIs found in yelp: 39325 /1141176


# ReviewInfo

In [3]:
def geneConvertDatetime(dateKey):
    """Build a converter that pulls UTC timestamps out of a list of review dicts.

    Parameters
    ----------
    dateKey : str
        Key under which each review stores its date.

    Returns
    -------
    callable
        ``convertDatetime(x)``: takes an iterable of review mappings and
        returns a list with one parsed value per review, in input order.
        Reviews without ``dateKey`` and dates that fail to parse are skipped.
        Values pandas parses to ``NaT`` (e.g. ``None``) are kept as ``NaT``.
    """
    def convertDatetime(x):
        parsed = []
        for review in x:
            if dateKey not in review:
                continue
            try:
                # ``utc`` is a boolean flag in pandas; the original passed the
                # truthy string 'UTC', which only worked by accident.
                parsed.append(pd.to_datetime(review[dateKey], utc=True))
            except Exception:
                # Best effort: an unparseable date is dropped, but unlike a bare
                # ``except`` this still lets KeyboardInterrupt stop a long run.
                continue
        return parsed
    return convertDatetime

In [4]:
# WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'zocdoc', 'yelp']
# Sites processed by this cell. 'zocdoc' still has entries in the lookup tables
# below but is not in this list, so it is skipped here.
WebsiteNames = ['healthgrades', 'vitals', 'ratemds', 'yelp']

# Per-site name of the column that holds the physician NPI.
WebsiteNames2NPI = {'healthgrades':'npi', 
                    'vitals':'npi', 
                    'ratemds':'source_npi', 
                    'zocdoc':'npi', 
                    'yelp':'source_npi'}

# Per-site list of columns that hold review collections (yelp has three).
WebsiteNames2Reviews = {'healthgrades':['reviews'], 
                        'vitals':['reviews'], 
                        'ratemds':['reviews'], 
                        'zocdoc':['reviews'], 
                        'yelp':['reviews_detailed', 'blocked_reviews', 'removed_reviews']}

# Per-site, per-review-column key under which a single review stores its date.
WebsiteNames2DateColumns = {'healthgrades':{'reviews':'submittedDate'}, 
                            'vitals':{'reviews':'updated_at_dt'}, 
                            'ratemds':{'reviews':'created'}, 
                            'zocdoc':{'reviews':'date'}, # author's note: not every review carries this field
                            'yelp':{'reviews_detailed':'localizedDate', 
                                    'blocked_reviews':'date', 
                                    'removed_reviews': 'date'}}

        
        
# Rows are kept only when their NPI appears in this list (loaded in an earlier cell).
selectedNPIs = MDDO_NPI_included



# Results, keyed by '<site>_<review column>'.
WebReview2ReviewNum = {}      # -> Series of review counts, one entry per kept row
WebReview2ReviewTime = {}     # -> Series of lists of parsed review timestamps
WebReview2ReviewCltTime = {}  # initialised here but not filled anywhere in this cell

for name in WebsiteNames:
    folder = os.path.join('Output', name)
    # Chunk folders for this site: entries whose name contains the site name, minus '.DS*' artefacts.
    subfolders = [os.path.join(folder, i) for i in os.listdir(folder) if name in i and '.DS' not in i]
    subfolders.sort()
    

    npi_col = WebsiteNames2NPI[name]
    review_cols = WebsiteNames2Reviews[name]
    
    # Every subfolder is re-read once per review column, so yelp's folders are loaded three times.
    for review_col in review_cols:
        print('\n\nnumber of subfolders for {}: {}'.format(name, len(subfolders)))
    
        print('Website: {}; Review: {}'.format(name, review_col))
        ReviewCount = []
        ReviewTime = []
        
        # Converter that extracts the date stored under `datekey` from each review of a row.
        datekey = WebsiteNames2DateColumns[name][review_col]
        changetime_func = geneConvertDatetime(datekey)

        for folder2rd in subfolders:
            DF = get_df_from_folder2rd(folder2rd); rawlen = len(DF)
            
            
            # NPIs that cannot be converted become 0 (see change_npi_to_int).
            DF[npi_col] = DF[npi_col].apply(change_npi_to_int)
            
            # Keep only rows whose NPI is in the selected list.
            DF = DF[DF[npi_col].isin(selectedNPIs)].reset_index(drop = True)
            NPI2ReviewCounts = DF[review_col].apply(lambda x:len(x))
            NPI2ReviewTimes = DF[review_col].apply(changetime_func)
            ReviewCount.append(NPI2ReviewCounts)
            ReviewTime.append(NPI2ReviewTimes)
            # Progress log: folder, row count before filtering, row count after filtering.
            print(folder2rd, '\t\t', rawlen, len(DF))
            
        # Stack the per-folder pieces; `ReviewTime` is rebound from a list to the stacked Series.
        ReviewNum = pd.concat(ReviewCount).reset_index(drop = True)
        ReviewTime = pd.concat(ReviewTime).reset_index(drop = True)
        WebReview2ReviewNum['{}_{}'.format(name, review_col)] = ReviewNum
        WebReview2ReviewTime['{}_{}'.format(name, review_col)] = ReviewTime
        
        # a = ReviewNum.describe()
        # a.name = '{}_{}_All'.format(name, review_col)
        # ReviewNumPos = ReviewNum[ReviewNum > 0].reset_index(drop = True)
        # b = ReviewNumPos.describe()
        # b.name = '{}_{}_Pos'.format(name, review_col)
        # print(ReviewNumPos.sum())
        # Result = pd.concat([a, b], axis = 1)




number of subfolders for healthgrades: 68
Website: healthgrades; Review: reviews
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s0_e10000 		 9979 9668
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s100000_e110000 		 9982 9708
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s10000_e20000 		 9973 9712
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s110000_e120000 		 9984 9675
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s120000_e130000 		 9967 9677
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s130000_e137121 		 7108 6880
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s20000_e30000 		 9978 9725
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s30000_e40000 		 9967 9673
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s40000_e50000 		 9975 9714
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s50000_e60000 		 9974 9710
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s60000_e70000 		 9968 9734
Output/healthgrades/MD_Doc2GoogleURL_healthgrades_s70000_e80000 		

Output/vitals/vitals_v3_s0_e12500 		 12341 9858
Output/vitals/vitals_v3_s12500_e25000 		 12364 9827
Output/vitals/vitals_v3_s25000_e37500 		 12347 9613
Output/vitals/vitals_v3_s37500_e50000 		 12355 10028
Output/vitals/vitals_v3_s50000_e62500 		 12343 9786
Output/vitals/vitals_v3_s62500_e75000 		 12334 9557
Output/vitals/vitals_v3_s75000_e87500 		 12352 9799
Output/vitals/vitals_v3_s87500_e100000 		 12330 9793
Output/vitals/vitals_v4_s0_e12500 		 12316 9639
Output/vitals/vitals_v4_s12500_e25000 		 12336 9969
Output/vitals/vitals_v4_s25000_e37500 		 12348 9810
Output/vitals/vitals_v4_s37500_e50000 		 12329 9656
Output/vitals/vitals_v4_s50000_e62500 		 12321 9582
Output/vitals/vitals_v4_s62500_e75000 		 12344 9988
Output/vitals/vitals_v4_s75000_e87500 		 12321 9737
Output/vitals/vitals_v4_s87500_e100000 		 12326 9620
Output/vitals/vitals_v5_s0_e192 		 189 157


number of subfolders for ratemds: 23
Website: ratemds; Review: reviews
Output/ratemds/MD_Doc2GoogleURL_ratemds_s0_e2199 		 1897 

In [5]:
from IPython.display import display, HTML

# For each site/review column: summary statistics over all rows ("_All") next to
# the same statistics restricted to rows with at least one review ("_Pos").
for webcol, ReviewNum in WebReview2ReviewNum.items():
    print('\n\n{}'.format(webcol))

    ReviewNumPos = ReviewNum[ReviewNum > 0].reset_index(drop = True)
    print('Total Number of Reviews {}'.format(ReviewNumPos.sum()))

    Result = pd.concat(
        [
            ReviewNum.describe().rename('{}_All'.format(webcol)),
            ReviewNumPos.describe().rename('{}_Pos'.format(webcol)),
        ],
        axis = 1,
    )
    display(Result)



healthgrades_reviews
Total Number of Reviews 2210337


Unnamed: 0,healthgrades_reviews_All,healthgrades_reviews_Pos
count,732012.0,342818.0
mean,3.019537,6.447552
std,12.098088,17.041875
min,0.0,1.0
25%,0.0,1.0
50%,0.0,3.0
75%,2.0,6.0
max,2711.0,2711.0




vitals_reviews
Total Number of Reviews 6158207


Unnamed: 0,vitals_reviews_All,vitals_reviews_Pos
count,731259.0,505005.0
mean,8.421376,12.194349
std,16.902023,19.174466
min,0.0,1.0
25%,0.0,3.0
50%,3.0,7.0
75%,11.0,15.0
max,3280.0,3280.0




ratemds_reviews
Total Number of Reviews 314856


Unnamed: 0,ratemds_reviews_All,ratemds_reviews_Pos
count,163205.0,53395.0
mean,1.929206,5.896732
std,6.934426,11.116856
min,0.0,1.0
25%,0.0,1.0
50%,0.0,3.0
75%,1.0,6.0
max,357.0,357.0




yelp_reviews_detailed
Total Number of Reviews 403352


Unnamed: 0,yelp_reviews_detailed_All,yelp_reviews_detailed_Pos
count,37733.0,25682.0
mean,10.689635,15.70563
std,24.26728,28.043976
min,0.0,1.0
25%,0.0,3.0
50%,3.0,6.0
75%,11.0,17.0
max,602.0,602.0




yelp_blocked_reviews
Total Number of Reviews 172230


Unnamed: 0,yelp_blocked_reviews_All,yelp_blocked_reviews_Pos
count,37733.0,19501.0
mean,4.56444,8.831855
std,12.591117,16.403426
min,0.0,1.0
25%,0.0,2.0
50%,1.0,4.0
75%,4.0,9.0
max,377.0,377.0




yelp_removed_reviews
Total Number of Reviews 17345


Unnamed: 0,yelp_removed_reviews_All,yelp_removed_reviews_Pos
count,37733.0,6900.0
mean,0.459677,2.513768
std,1.410309,2.390361
min,0.0,1.0
25%,0.0,1.0
50%,0.0,1.0
75%,0.0,3.0
max,10.0,10.0


# ReviewTime

In [17]:
import pickle

# Persist the parsed review timestamps so later analysis can start from this file.
path = 'Output/WebReview2ReviewTime.p'

with open(path, mode='wb') as handle:
    pickle.dump(WebReview2ReviewTime, handle)

In [72]:
# Spot-check one site's result: a Series with one list of parsed timestamps per row.
WebReview2ReviewTime['healthgrades_reviews']

0         [2020-04-08 00:00:00+00:00, 2019-11-16 00:00:0...
1                               [2017-02-24 00:00:00+00:00]
2         [2020-12-15 00:00:00+00:00, 2019-11-24 00:00:0...
3                               [2016-03-27 00:00:00+00:00]
4                                                        []
                                ...                        
732007    [2019-03-11 00:00:00+00:00, 2019-03-05 00:00:0...
732008                          [2019-06-02 00:00:00+00:00]
732009                                                   []
732010                                                   []
732011    [2021-04-09 00:00:00+00:00, 2020-07-04 00:00:0...
Name: reviews, Length: 732012, dtype: object

## Get NPI's Mean Date

In [68]:
import numpy as np
import datetime

def get_the_mean_date(dates):
    """Return the average of ``dates`` computed at whole-day resolution.

    Each date is reduced to its whole-day offset from 2010-01-01 UTC (the
    time-of-day part is discarded by ``.days``); the offsets are averaged and
    added back to the reference date.

    Parameters
    ----------
    dates : iterable of tz-aware pandas.Timestamp
        Timestamps to average; they must be subtractable from a UTC timestamp.

    Returns
    -------
    pandas.Timestamp or NaT
        The mean date (UTC), or ``pd.NaT`` when ``dates`` is empty.
    """
    # ``utc`` is a boolean flag; the original passed the truthy string 'UTC'.
    ref_date = pd.to_datetime('2010-01-01', utc=True)
    day_offsets = [(date - ref_date).days for date in dates]
    if not day_offsets:
        # np.mean([]) would yield NaN (and a RuntimeWarning); make the result explicit.
        return pd.NaT
    # 'D' is the canonical pandas unit for days.
    return pd.to_timedelta(np.mean(day_offsets), unit='D') + ref_date



# Collection date: 2021-08-01

# Example: dates to average
"2020-10-01" 
"2020-10-05"  



# Example: their mean date
"2020-10-03"


In [69]:
import numpy as np

# Mean review date per row, for every site/review column.
Webname2MeanDate = {}
for webname in WebReview2ReviewTime:
    L = []
    for npi2reviewdatetime in WebReview2ReviewTime[webname]:
        # Drop NaT/None entries before averaging.
        npi2reviewdatetime = [i for i in npi2reviewdatetime if not pd.isnull(i)]
        if len(npi2reviewdatetime) == 0:
            # Keep a placeholder so positions stay aligned with the input Series.
            L.append(np.nan)
        else:
            average_date = get_the_mean_date(npi2reviewdatetime)
            L.append(average_date)

    d = pd.Series(L)
    Webname2MeanDate[webname] = d


# Report, per site, how the mean dates are distributed over calendar years.
for webname in Webname2MeanDate:
    print('\n{}'.format(webname))

    NPI2AverageDate = Webname2MeanDate[webname]

    # Rows that have at least one usable review date. `.notna()` replaces the
    # original `-series.isna()`, which relied on pandas special-casing unary
    # minus for boolean data. This can be lower than the number of rows with
    # reviews, because reviews without a parseable date are not counted.
    has_date = NPI2AverageDate.notna()
    dated_ratio = has_date.mean() * 100
    print('ratio of npi of positive reviews: {}% [{}/{}]'.format(round(dated_ratio, 2), has_date.sum(),  len(NPI2AverageDate)))

    # Newest year first, with the cumulative share of rows down to each year.
    report = NPI2AverageDate.dt.year.value_counts().sort_index(ascending = False)
    df = pd.concat([report, report.cumsum() / report.sum()], axis = 1)
    df.columns = ['NPI Number', 'CumProp']

    display(df)
    
    


healthgrades_reviews
ratio of npi of positive reviews: 46.83% [342818/732012]


Unnamed: 0,NPI Number,CumProp
2021.0,18377,0.053606
2020.0,36892,0.16122
2019.0,69617,0.364292
2018.0,100882,0.658565
2017.0,74347,0.875435
2016.0,41437,0.996307
2015.0,1266,1.0



vitals_reviews
ratio of npi of positive reviews: 29.4% [214987/731259]


Unnamed: 0,NPI Number,CumProp
2021.0,85,0.000395
2020.0,556,0.002982
2019.0,1927,0.011945
2018.0,4116,0.03109
2017.0,11753,0.085759
2016.0,40148,0.272505
2015.0,75806,0.625112
2014.0,54854,0.880263
2013.0,19544,0.97117
2012.0,4991,0.994386



ratemds_reviews
ratio of npi of positive reviews: 32.72% [53395/163205]


Unnamed: 0,NPI Number,CumProp
2021.0,420,0.007866
2020.0,758,0.022062
2019.0,1172,0.044012
2018.0,1818,0.07806
2017.0,2559,0.125986
2016.0,3339,0.18852
2015.0,4481,0.272441
2014.0,5937,0.383631
2013.0,6377,0.503062
2012.0,6327,0.621556



yelp_reviews_detailed
ratio of npi of positive reviews: 68.06% [25682/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,429,0.016704
2020.0,1326,0.068336
2019.0,2944,0.182969
2018.0,4481,0.357449
2017.0,4942,0.549879
2016.0,4200,0.713418
2015.0,2937,0.827778
2014.0,1822,0.898723
2013.0,1098,0.941477
2012.0,647,0.966669



yelp_blocked_reviews
ratio of npi of positive reviews: 51.42% [19403/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,484,0.024945
2020.0,1136,0.083492
2019.0,2139,0.193733
2018.0,2983,0.347472
2017.0,3001,0.502139
2016.0,2767,0.644746
2015.0,2198,0.758027
2014.0,1662,0.843684
2013.0,1212,0.906149
2012.0,828,0.948822



yelp_removed_reviews
ratio of npi of positive reviews: 18.29% [6900/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,170,0.024638
2020.0,329,0.072319
2019.0,875,0.19913
2018.0,918,0.332174
2017.0,797,0.447681
2016.0,819,0.566377
2015.0,813,0.684203
2014.0,635,0.776232
2013.0,524,0.852174
2012.0,350,0.902899


## Get NPI's Top 50 Mean Date

In [70]:
import numpy as np


# today = pd.to_datetime('2021-10-01', utc = 'UTC')


# Number of leading entries of each row's date list that enter the mean.
# NOTE(review): this is "the most recent N reviews" only if the stored lists
# are ordered newest-first — confirm against the scraper output.
top = 50
Webname2TopMeanDate = {}
for webname in WebReview2ReviewTime:
    L = []
    for npi2reviewdatetime in WebReview2ReviewTime[webname]:
        # Drop NaT/None entries, then keep at most `top` dates in stored order.
        npi2reviewdatetime = [i for i in npi2reviewdatetime if not pd.isnull(i)]
        npi2reviewdatetime = npi2reviewdatetime[:top]
        if len(npi2reviewdatetime) == 0:
            # Keep a placeholder so positions stay aligned with the input Series.
            L.append(np.nan)
        else:
            average_date = get_the_mean_date(npi2reviewdatetime)
            L.append(average_date)

    d = pd.Series(L)
    Webname2TopMeanDate[webname] = d


# Report, per site, how the truncated mean dates are distributed over years.
for webname in Webname2TopMeanDate:
    print('\n{}'.format(webname))

    NPI2AverageDate = Webname2TopMeanDate[webname]

    # Rows with at least one usable review date. `.notna()` replaces the
    # original `-series.isna()`, which relied on pandas special-casing unary
    # minus for boolean data.
    has_date = NPI2AverageDate.notna()
    dated_ratio = has_date.mean() * 100
    print('ratio of npi of positive reviews: {}% [{}/{}]'.format(round(dated_ratio, 2), has_date.sum(),  len(NPI2AverageDate)))

    # Newest year first, with the cumulative share of rows down to each year.
    report = NPI2AverageDate.dt.year.value_counts().sort_index(ascending = False)
    df = pd.concat([report, report.cumsum() / report.sum()], axis = 1)
    df.columns = ['NPI Number', 'CumProp']

    display(df)
    
    


healthgrades_reviews
ratio of npi of positive reviews: 46.83% [342818/732012]


Unnamed: 0,NPI Number,CumProp
2021.0,19202,0.056012
2020.0,37594,0.165674
2019.0,69155,0.367399
2018.0,100078,0.659327
2017.0,74106,0.875494
2016.0,41417,0.996307
2015.0,1266,1.0



vitals_reviews
ratio of npi of positive reviews: 29.4% [214987/731259]


Unnamed: 0,NPI Number,CumProp
2021.0,80,0.000372
2020.0,477,0.002591
2019.0,1675,0.010382
2018.0,3821,0.028155
2017.0,11329,0.080851
2016.0,38894,0.261765
2015.0,73446,0.603395
2014.0,56276,0.865159
2013.0,21487,0.965105
2012.0,5820,0.992176



ratemds_reviews
ratio of npi of positive reviews: 32.72% [53395/163205]


Unnamed: 0,NPI Number,CumProp
2021.0,424,0.007941
2020.0,780,0.022549
2019.0,1188,0.044798
2018.0,1825,0.078977
2017.0,2565,0.127016
2016.0,3353,0.189812
2015.0,4460,0.27334
2014.0,5911,0.384043
2013.0,6370,0.503343
2012.0,6315,0.621613



yelp_reviews_detailed
ratio of npi of positive reviews: 68.06% [25682/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,465,0.018106
2020.0,1541,0.078109
2019.0,3235,0.204073
2018.0,4619,0.383926
2017.0,4819,0.571568
2016.0,3944,0.725138
2015.0,2802,0.834242
2014.0,1735,0.901799
2013.0,1053,0.9428
2012.0,635,0.967526



yelp_blocked_reviews
ratio of npi of positive reviews: 51.42% [19403/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,489,0.025202
2020.0,1157,0.084832
2019.0,2175,0.196928
2018.0,3027,0.352935
2017.0,2957,0.505334
2016.0,2743,0.646704
2015.0,2166,0.758336
2014.0,1662,0.843993
2013.0,1210,0.906355
2012.0,829,0.94908



yelp_removed_reviews
ratio of npi of positive reviews: 18.29% [6900/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,170,0.024638
2020.0,329,0.072319
2019.0,875,0.19913
2018.0,918,0.332174
2017.0,797,0.447681
2016.0,819,0.566377
2015.0,813,0.684203
2014.0,635,0.776232
2013.0,524,0.852174
2012.0,350,0.902899


## Get NPI's Top 10 Mean Date

In [71]:
import numpy as np


# Number of leading entries of each row's date list that enter the mean.
# NOTE(review): this is "the most recent N reviews" only if the stored lists
# are ordered newest-first — confirm against the scraper output.
top = 10
Webname2TopMeanDate = {}
for webname in WebReview2ReviewTime:
    L = []
    for npi2reviewdatetime in WebReview2ReviewTime[webname]:
        # Drop NaT/None entries, then keep at most `top` dates in stored order.
        npi2reviewdatetime = [i for i in npi2reviewdatetime if not pd.isnull(i)]
        npi2reviewdatetime = npi2reviewdatetime[:top]
        if len(npi2reviewdatetime) == 0:
            # Keep a placeholder so positions stay aligned with the input Series.
            L.append(np.nan)
        else:
            average_date = get_the_mean_date(npi2reviewdatetime)
            L.append(average_date)

    d = pd.Series(L)
    Webname2TopMeanDate[webname] = d


# Report, per site, how the truncated mean dates are distributed over years.
for webname in Webname2TopMeanDate:
    print('\n{}'.format(webname))

    NPI2AverageDate = Webname2TopMeanDate[webname]

    # Rows with at least one usable review date. `.notna()` replaces the
    # original `-series.isna()`, which relied on pandas special-casing unary
    # minus for boolean data.
    has_date = NPI2AverageDate.notna()
    dated_ratio = has_date.mean() * 100
    print('ratio of npi of positive reviews: {}% [{}/{}]'.format(round(dated_ratio, 2), has_date.sum(),  len(NPI2AverageDate)))

    # Newest year first, with the cumulative share of rows down to each year.
    report = NPI2AverageDate.dt.year.value_counts().sort_index(ascending = False)
    df = pd.concat([report, report.cumsum() / report.sum()], axis = 1)
    df.columns = ['NPI Number', 'CumProp']
    display(df)
    
    


healthgrades_reviews
ratio of npi of positive reviews: 46.83% [342818/732012]


Unnamed: 0,NPI Number,CumProp
2021.0,26426,0.077085
2020.0,41014,0.196722
2019.0,69935,0.400723
2018.0,93302,0.672885
2017.0,69762,0.87638
2016.0,41113,0.996307
2015.0,1266,1.0



vitals_reviews
ratio of npi of positive reviews: 29.4% [214987/731259]


Unnamed: 0,NPI Number,CumProp
2021.0,50,0.000233
2020.0,259,0.001437
2019.0,916,0.005698
2018.0,2025,0.015117
2017.0,5347,0.039988
2016.0,16324,0.115919
2015.0,34417,0.276007
2014.0,44319,0.482155
2013.0,40955,0.672655
2012.0,30525,0.81464



ratemds_reviews
ratio of npi of positive reviews: 32.72% [53395/163205]


Unnamed: 0,NPI Number,CumProp
2021.0,414,0.007754
2020.0,751,0.021819
2019.0,1167,0.043675
2018.0,1814,0.077648
2017.0,2579,0.125948
2016.0,3395,0.189531
2015.0,4511,0.274014
2014.0,5940,0.385261
2013.0,6259,0.502482
2012.0,6196,0.618522



yelp_reviews_detailed
ratio of npi of positive reviews: 68.06% [25682/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,1315,0.051203
2020.0,3414,0.184137
2019.0,4744,0.368858
2018.0,4628,0.549062
2017.0,3607,0.68951
2016.0,2755,0.796784
2015.0,1922,0.871622
2014.0,1233,0.919632
2013.0,819,0.951522
2012.0,496,0.970836



yelp_blocked_reviews
ratio of npi of positive reviews: 51.42% [19403/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,559,0.02881
2020.0,1393,0.100603
2019.0,2496,0.229243
2018.0,3076,0.387775
2017.0,2922,0.53837
2016.0,2552,0.669896
2015.0,2037,0.77488
2014.0,1512,0.852806
2013.0,1141,0.911612
2012.0,760,0.950781



yelp_removed_reviews
ratio of npi of positive reviews: 18.29% [6900/37733]


Unnamed: 0,NPI Number,CumProp
2021.0,170,0.024638
2020.0,329,0.072319
2019.0,875,0.19913
2018.0,918,0.332174
2017.0,797,0.447681
2016.0,819,0.566377
2015.0,813,0.684203
2014.0,635,0.776232
2013.0,524,0.852174
2012.0,350,0.902899


## Get NPI's Year-Month

In [None]:
# Per site/review column: for every row, the list of review years and the list
# of 'year-month' labels (month is not zero-padded, e.g. '2020-4').
Webname2Year = {}
Webname2YearMonth = {}

for webname in WebReview2ReviewTime:
    print('\n\n' + webname)
    L1 = []  # one entry per row: list of years, or NaN when the row has no dates
    L2 = []  # one entry per row: list of 'year-month' strings, or NaN
    for npi2reviewdatetime in WebReview2ReviewTime[webname]:
        # Drop NaT/None entries before extracting calendar fields.
        npi2reviewdatetime = [i for i in npi2reviewdatetime if not pd.isnull(i)]
        if len(npi2reviewdatetime) == 0:
            # The original appended to `L`, a list left over from an earlier
            # cell: that fails on a fresh kernel and leaves L1/L2 shorter than
            # the input. Add the placeholder to both lists so row positions
            # stay aligned with WebReview2ReviewTime[webname].
            L1.append(np.nan)
            L2.append(np.nan)
        else:
            year = [i.year for i in npi2reviewdatetime]
            month = [i.month for i in npi2reviewdatetime]
            year_month = ['{}-{}'.format(y, m) for y, m in zip(year, month)]
            L1.append(year)
            L2.append(year_month)
        
    Webname2Year[webname] = pd.Series(L1)
    Webname2YearMonth[webname] = pd.Series(L2)