# Setup

In [None]:
import pandas as pd
import itertools
import os 

# Reference date (UTC) against which review age is measured; it is passed to
# get_platform_review_average_age further down in this notebook.
CutOffDate = pd.to_datetime('2021-08-01', utc=True)

# Exclusive lower bounds for the "recent N years" windows used by
# recent_year_rate_per_review. They are derived from CutOffDate so that moving
# the cut-off moves every window with it.
Recent1YearDate = CutOffDate - pd.DateOffset(years=1)
Recent2YearDate = CutOffDate - pd.DateOffset(years=2)
Recent3YearDate = CutOffDate - pd.DateOffset(years=3)

# Settings

In [None]:
# Selects the input folder ('Output/MinText<N>') and tags the output CSV names
# ('MinTxt<N>_...') written by the cells below.
# NOTE(review): presumably the minimum review text length applied upstream - confirm.
MIN_TEXT_LEN = 0

# Get Website 2 Review Time

In [None]:
def get_website2rvw(Folder):
    """Load the pickled review table of every platform found in ``Folder``.

    Args:
        Folder: directory holding one ``<platform>.p`` pickle per platform.

    Returns:
        dict mapping platform name to the object returned by
        ``pd.read_pickle`` for that platform's file.
    """
    suffix = ''
    # Filename template per platform; ``{}`` receives the (currently empty) suffix.
    filename_templates = {
        'healthgrades': 'healthgrades{}.p',
        'vitals': 'vitals{}.p',
        'ratemds': 'ratemds{}.p',
        'yelp': 'yelp{}.p',
        'zocdoc': 'zocdoc{}.p',
    }
    return {
        site: pd.read_pickle(os.path.join(Folder, template.format(suffix)))
        for site, template in filename_templates.items()
    }
        

In [None]:
def get_website2time(website2data):
    """Flatten every platform's reviews into a table of review dates.

    Args:
        website2data: dict mapping platform name to a DataFrame whose
            ``final_review`` column holds, per row, an iterable of review
            dicts that each carry a ``'ReviewDate'`` entry.

    Returns:
        dict mapping platform name to a DataFrame with one row per review and
        two columns: ``ReviewDate`` and ``Source`` (the platform name).
    """
    website2time = {}
    for name, df in website2data.items():
        review_dates = [
            review['ReviewDate']
            for reviews in df['final_review']
            for review in reviews
        ]
        # Building from a dict keeps the 'ReviewDate' column even when the
        # platform has no reviews at all (renaming columns afterwards fails
        # on an empty frame).
        reviewdate = pd.DataFrame({'ReviewDate': review_dates})
        # Tag rows with the actual platform name, not the literal string 'name'.
        reviewdate['Source'] = name
        website2time[name] = reviewdate
    return website2time

In [None]:
# Load the per-platform review pickles from the folder that matches MIN_TEXT_LEN.
Folder = 'Output/MinText{}'.format(MIN_TEXT_LEN)
website2data = get_website2rvw(Folder)

In [None]:
# Flatten each platform's reviews into a per-platform table of review dates.
webiste2time = get_website2time(website2data)

# Review Recent Year Rate

In [None]:
def recent_year_rate_per_review(webiste2time, cutoffs=None):
    """Report, per platform, how many reviews are newer than each cutoff date.

    Besides one row per platform, two aggregate rows are appended:
    ``'all'`` (every platform except ``'zocdoc'``) and ``'all(zocdoc)'``
    (every platform, zocdoc included).

    Args:
        webiste2time: dict mapping platform name to a DataFrame with a
            ``ReviewDate`` column comparable to the cutoff timestamps.
        cutoffs: optional dict mapping a number of years ``N`` to the
            exclusive lower bound of the "recent N years" window. Defaults to
            the module-level ``Recent3YearDate``, ``Recent2YearDate`` and
            ``Recent1YearDate`` for 3, 2 and 1 years.

    Returns:
        DataFrame with columns ``platform``, ``reviewsalltime`` (int) and one
        ``recent{N}years`` column per cutoff, formatted as
        ``'<count> (<share of all-time reviews>%)'``. A platform without any
        review reports a share of ``0.00%`` instead of dividing by zero.
    """
    if cutoffs is None:
        cutoffs = {3: Recent3YearDate, 2: Recent2YearDate, 1: Recent1YearDate}

    def _with_aggregates(per_platform):
        # Append the two aggregate rows after the individual platforms.
        including_zocdoc = sum(per_platform.values())
        per_platform['all'] = including_zocdoc - per_platform.get('zocdoc', 0)
        per_platform['all(zocdoc)'] = including_zocdoc
        return per_platform

    totals = _with_aggregates({web: len(df) for web, df in webiste2time.items()})

    recent_counts = {}
    for years, since in cutoffs.items():
        column = 'recent{}years'.format(years)
        recent_counts[column] = _with_aggregates({
            # Strictly newer than the cutoff, as a plain Python int.
            web: int((df['ReviewDate'] > since).sum())
            for web, df in webiste2time.items()
        })

    rows = []
    for platform, total in totals.items():
        row = {'platform': platform, 'reviewsalltime': total}
        for column, per_platform in recent_counts.items():
            count = per_platform[platform]
            pct = count / total * 100 if total else 0.0
            row[column] = '{} ({:.2f}%)'.format(count, pct)
        rows.append(row)
    return pd.DataFrame(rows)



In [None]:
# Build the per-platform recency report and save it as CSV (without the index).
# The trailing bare expression is there so the notebook cell displays the table.
ReviewCreatedTimeReport = recent_year_rate_per_review(webiste2time)
ReviewCreatedTimeReport.to_csv('Output/ReviewAge/MinTxt{}_ReviewCreatedTimeReport.csv'.format(MIN_TEXT_LEN), index = False)
ReviewCreatedTimeReport

# Average Age

In [None]:
def _mean_review_age_days(review_dates, cutoff):
    """Return the mean of ``cutoff - review_dates`` in fractional days."""
    mean_age = (cutoff - pd.to_datetime(review_dates)).mean()
    return mean_age.total_seconds() / (24 * 60 * 60)


def get_platform_review_average_age(webiste2time, CutOffDate):
    """Compute the average review age per platform, measured at ``CutOffDate``.

    Besides one row per platform, two aggregate rows are appended:
    ``'all'`` (every platform except ``'zocdoc'``) and ``'all(zocdoc)'``
    (every platform, zocdoc included).

    Args:
        webiste2time: dict mapping platform name to a DataFrame with a
            ``ReviewDate`` column.
        CutOffDate: timestamp the ages are measured against. It is used for
            every row, including the aggregate ones.

    Returns:
        DataFrame with columns ``platform``, ``average_days`` and
        ``average_years`` (``average_days / 365.25``).
    """
    def _report_row(platform, review_dates):
        days = _mean_review_age_days(review_dates, CutOffDate)
        return {'platform': platform, 'average_days': days, 'average_years': days / 365.25}

    report = [_report_row(web, df['ReviewDate']) for web, df in webiste2time.items()]

    # Pooled over all reviews, first without and then with zocdoc.
    without_zocdoc = pd.concat([df for name, df in webiste2time.items() if name != 'zocdoc'])
    report.append(_report_row('all', without_zocdoc['ReviewDate']))

    every_platform = pd.concat(list(webiste2time.values()))
    report.append(_report_row('all(zocdoc)', every_platform['ReviewDate']))

    return pd.DataFrame(report)

# Build the average-review-age report and save it as CSV.
# NOTE(review): unlike the ReviewCreatedTimeReport export, this to_csv call does
# not pass index=False, so the row index is written too - confirm that is intended.
# The trailing bare expression is there so the notebook cell displays the table.
ReportAverageTime = get_platform_review_average_age(webiste2time, CutOffDate)
ReportAverageTime.to_csv('Output/ReviewAge/MinTxt{}_ReportAverageTime.csv'.format(MIN_TEXT_LEN))
ReportAverageTime

# By Date

In [None]:
# Count reviews per calendar day for every platform, then line the platforms up
# side by side (one column each) on a shared, sorted date index.
D = {}
for name, df in webiste2time.items():
    tmp = df['ReviewDate'].dt.date.value_counts().sort_index().rename(name)
    D[name] = tmp

DateValue = pd.concat(list(D.values()), axis = 1).sort_index()
# The index holds plain date objects at this point; turn it into datetimes.
DateValue.index = pd.to_datetime(DateValue.index)
DateValue.to_csv('Output/ReviewAge/MinTxt{}_ReviewDate.csv'.format(MIN_TEXT_LEN))

# By Month

In [None]:
# Count reviews per calendar month ('YYYY-MM') for every platform, then line the
# platforms up side by side (one column each) on a shared, sorted month index.
D = {}
for name in webiste2time:
    df = webiste2time[name]
    # strftime zero-pads the month and leaves missing dates as NaN (dropped by
    # value_counts) instead of producing labels such as '2020.0-nan'.
    tmp = df['ReviewDate'].dt.strftime('%Y-%m')
    tmp = tmp.value_counts().sort_index()
    tmp.name = name
    D[name] = tmp

MonthValue = pd.concat([v for k, v in D.items()], axis = 1).sort_index()
# 'YYYY-MM' labels become timestamps at the first day of the month.
MonthValue.index = pd.to_datetime(MonthValue.index)
MonthValue.to_csv('Output/ReviewAge/MinTxt{}_ReviewMonth.csv'.format(MIN_TEXT_LEN))


# By Year

In [None]:
# Count reviews per calendar year for every platform, then line the platforms up
# side by side (one column each) on a shared, sorted year index.
D = {}
for name in webiste2time:
    df = webiste2time[name]
    # strftime leaves missing dates as NaN (dropped by value_counts) instead of
    # producing labels such as '2020.0' or 'nan'.
    tmp = df['ReviewDate'].dt.strftime('%Y')
    tmp = tmp.value_counts().sort_index()
    tmp.name = name
    D[name] = tmp

# Stored under its own name so the monthly table in MonthValue is not overwritten.
YearValue = pd.concat([v for k, v in D.items()], axis = 1).sort_index()
# 'YYYY' labels become timestamps at January 1st of that year.
YearValue.index = pd.to_datetime(YearValue.index)
YearValue.to_csv('Output/ReviewAge/MinTxt{}_ReviewYear.csv'.format(MIN_TEXT_LEN))
