# Transform Batch/Streaming Data

In [1]:
import pandas as pd
import datetime
import time

Streaming start date, with the number of reviews listed for each streaming day:

In [2]:
# Reference table only: review counts listed per streaming day.
# The frame is displayed but not bound to a name.
pd.DataFrame({'date': ['Thursday, July 27', 'Friday, July 28', 'Saturday, July 29'], 
              'reviews': [10223, 10000, 10000]})

Unnamed: 0,date,reviews
0,"Thursday, July 27",10223
1,"Friday, July 28",10000
2,"Saturday, July 29",10000


In [3]:
# Load the raw stream samples from parquet and list their columns to inspect
# the incoming schema before transforming it.
df_stream_80223_raw_samples = pd.read_parquet('../../stream_80223_raw_samples.parquet')
df_stream_80223_raw_samples.columns

Index(['asin', 'overall', 'reviewText', 'reviewerID', 'reviewerName',
       'summary', 'verified', 'internal_partition', 'partition_number',
       'reviewID', 'style', 'vote', 'image'],
      dtype='object')

In [4]:
# Load the batch reviews from parquet and list their columns for comparison
# with the stream schema.
batch_data_music_all = pd.read_parquet('../../batch_data_music_all_291631_noID.parquet')
batch_data_music_all.columns

Index(['asin', 'image', 'overall', 'reviewText', 'reviewerID', 'reviewerName',
       'style', 'summary', 'unixReviewTime', 'verified', 'vote'],
      dtype='object')

In [5]:
def get_review_id(id_pre, num_samples, ts=None):
    """Build one review ID per row, shaped ``<prefix><timestamp>T<sequence>``.

    Parameters
    ----------
    id_pre : str
        Prefix prepended to every ID.
    num_samples : int
        Number of IDs to generate.
    ts : pandas.Series, optional
        Per-row Unix timestamps in seconds, given as integers or digit
        strings. When ``None`` or empty, the current time is used for every
        row instead.

    Returns
    -------
    pandas.Series
        String IDs. Indexed like ``ts`` when timestamps are supplied,
        otherwise by a default ``RangeIndex`` of length ``num_samples``.

    Raises
    ------
    ValueError
        If ``ts`` is non-empty and its length differs from ``num_samples``.
    """
    # Timestamp and sequence number are both zero-padded to the digit count of
    # 1e9 (10 digits) so every ID has the same length.
    max_unit = 1e9
    n0s = len(str(int(max_unit)))

    if ts is not None and not ts.empty:
        if len(ts) != num_samples:
            raise ValueError(
                f'ts has {len(ts)} entries but num_samples is {num_samples}')
        # Numeric timestamps must become strings before concatenation;
        # going through int64 avoids a trailing ".0" for float input.
        if pd.api.types.is_numeric_dtype(ts):
            timestamps = ts.astype('int64').astype(str)
        else:
            timestamps = ts.astype(str)
        timestamps = timestamps.str.zfill(n0s)
        # Reuse the caller's index so the addition below pairs rows by
        # position instead of aligning against an unrelated RangeIndex.
        index = ts.index
    else:
        # A single scalar timestamp shared by every row.
        timestamps = f'{int(time.time()):0{n0s}}'
        index = pd.RangeIndex(num_samples)

    sub_ids = pd.Series(
        [f'{i:0{n0s}}' for i in range(num_samples)], index=index, dtype=object)

    return id_pre + timestamps + 'T' + sub_ids

def transform(df, last_batch_date='2018-09-29', first_streaming_date='2023-07-27'):
    """Clean a raw batch or stream review frame **in place**.

    A frame with a ``unixReviewTime`` column is treated as batch data;
    anything else is treated as stream data.

    Steps applied:
      * batch: derive ``dateReview`` from ``unixReviewTime``, shifted so that
        ``last_batch_date`` lands on the day before ``first_streaming_date``,
        then remove ``unixReviewTime``;
      * stream: drop ``internal_partition`` / ``partition_number`` and set
        ``dateReview`` to today's date;
      * both: cast ``overall`` to int, (re)create ``reviewID``, convert
        ``vote`` to int (nulls become 0, thousands separators removed), and
        drop ``image`` / ``style``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; modified in place.
    last_batch_date, first_streaming_date : str or datetime-like
        Dates used to compute the batch date shift. Only used for batch data.

    Returns
    -------
    None

    Notes
    -----
    Not idempotent: calling it again on an already transformed frame raises
    ``KeyError``, because the frame no longer has ``unixReviewTime`` (so it
    takes the stream branch) and the stream-only columns are missing.
    """
    if 'unixReviewTime' in df.columns:
        id_prefix = 'B' # Batch Data
        day_shift = (pd.Timestamp(first_streaming_date)
                     - pd.Timestamp(last_batch_date)
                     - pd.Timedelta(days=1))
        unix_seconds = df['unixReviewTime'].astype(int)
        # Shift while still datetime64 and convert to plain dates last, so
        # the arithmetic is vectorised rather than done per Python object.
        df['dateReview'] = (
            pd.to_datetime(unix_seconds, unit='s').dt.normalize() + day_shift
        ).dt.date
        df.pop('unixReviewTime')
        # String timestamps on a default index: safe to concatenate into IDs
        # and independent of whatever index `df` carries.
        timestamp = unix_seconds.astype(str).reset_index(drop=True)
    else:
        id_prefix = 'S' # Stream Data
        # Explicit dtype avoids the empty-Series default-dtype warning.
        timestamp = pd.Series(dtype=object)
        # Remove unnecessary columns
        df.drop(columns=['internal_partition', 'partition_number'], inplace=True)
        df['dateReview'] = datetime.date.today()

    df['overall'] = pd.to_numeric(df['overall']).astype(int)
    # Create column of review ID; assign by position so a non-default frame
    # index cannot misalign the generated IDs.
    review_ids = get_review_id(id_prefix, len(df), timestamp)
    df['reviewID'] = review_ids.to_numpy()
    # Replace nulls by 0s and strip thousands separators (literal match).
    df['vote'] = (
        df['vote']
        .fillna('0')
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(int)
    )
    # Remove unnecessary columns
    df.drop(columns=['image', 'style'], inplace=True)

In [6]:
# transform() edits the frame in place and returns None, so the result is
# checked by listing the frame's columns afterwards.
# Not re-runnable: a second call on the same frame raises KeyError because
# the dropped columns are already gone.
transform(df_stream_80223_raw_samples)
print(sorted(df_stream_80223_raw_samples.columns))

['asin', 'dateReview', 'overall', 'reviewID', 'reviewText', 'reviewerID', 'reviewerName', 'summary', 'verified', 'vote']


In [7]:
# Same in-place clean-up for the batch frame; the printed column list should
# match the one printed for the stream frame.
transform(batch_data_music_all)
print(sorted(batch_data_music_all.columns))

['asin', 'dateReview', 'overall', 'reviewID', 'reviewText', 'reviewerID', 'reviewerName', 'summary', 'verified', 'vote']


In [8]:
# Sanity check of the date shift: newest batch dates first. The latest date
# should be 2023-07-26, the day before the first streaming date (2023-07-27).
batch_data_music_all['dateReview'].sort_values(ascending=False)

122219    2023-07-26
122220    2023-07-26
56173     2023-07-25
267113    2023-07-23
100861    2023-07-22
             ...    
94337     2007-03-28
244468    2006-12-26
244469    2006-06-12
2246      2005-08-07
195180    2005-01-30
Name: dateReview, Length: 291631, dtype: object