# Transform Batch/Streaming Data

In [1]:
import pandas as pd
import datetime
import time

Streaming start date and number of reviews streamed per day

In [2]:
# Number of reviews streamed on each of the first three streaming days.
pd.DataFrame(
    [
        ('Thursday, July 27', 10223),
        ('Friday, July 28', 10000),
        ('Saturday, July 29', 10000),
    ],
    columns=['date', 'reviews'],
)

Unnamed: 0,date,reviews
0,"Thursday, July 27",10223
1,"Friday, July 28",10000
2,"Saturday, July 29",10000


In [3]:
# Load the raw streaming samples and show which columns they carry.
df_stream_raw_samples = pd.read_parquet('../../stream_90223_raw_samples.parquet')
df_stream_raw_samples.columns

Index(['asin', 'overall', 'reviewText', 'reviewerID', 'reviewerName',
       'summary', 'verified', 'internal_partition', 'partition_number',
       'style', 'vote', 'image'],
      dtype='object')

In [4]:
# Load the batch reviews and show which columns they carry
# (unlike the streaming samples, they include 'unixReviewTime').
batch_data_music_all = pd.read_parquet('../../batch_data_music_all_291631_noID.parquet')
batch_data_music_all.columns

Index(['asin', 'image', 'overall', 'reviewText', 'reviewerID', 'reviewerName',
       'style', 'summary', 'unixReviewTime', 'verified', 'vote'],
      dtype='object')

In [5]:
def get_review_id(id_pre, num_samples, ts):
    """Build one string review ID per sample.

    Every ID has the form ``<id_pre><timestamp>T<sub_id>`` where ``sub_id`` is
    the sample's position (0-based) zero-padded to 10 digits.

    Parameters
    ----------
    id_pre : str
        Prefix prepended to every ID.
    num_samples : int
        Number of IDs to generate.
    ts : pandas.Series or None
        Per-sample timestamps, already formatted as strings. When ``None`` or
        empty, the current Unix time (zero-padded to 10 digits) is used as the
        timestamp of every sample.

    Returns
    -------
    pandas.Series
        The review IDs. The result carries the index of ``ts`` when ``ts`` is
        provided, otherwise a default 0..num_samples-1 index.

    Raises
    ------
    ValueError
        If a non-empty ``ts`` does not contain exactly ``num_samples`` values.
    """
    # Padding width for the integer strings: the number of digits of 1e9 (10).
    max_unit = 1e9
    n0s = len(str(int(max_unit)))

    if ts is not None and not ts.empty:
        if len(ts) != num_samples:
            raise ValueError(
                f'ts has {len(ts)} values but num_samples is {num_samples}'
            )
        timestamps = ts
    else:
        timestamp = f'{int(time.time()):0{n0s}}'
        timestamps = pd.Series([timestamp] * num_samples, dtype=object)

    # Series addition aligns on index labels, not on position. Reusing the
    # timestamps' index keeps the concatenation row-by-row even when the caller
    # passes a Series with a non-default index (e.g. from a filtered frame);
    # otherwise the mismatched labels would silently produce NaN IDs.
    sub_ids = pd.Series(
        [f'{position:0{n0s}}' for position in range(num_samples)],
        index=timestamps.index,
        dtype=object,
    )
    reviews_id = id_pre + timestamps + 'T' + sub_ids

    return reviews_id

def transform(df, first_streaming_date=datetime.date(2023, 7, 27)):
    """Normalize a raw reviews frame in place (batch or streaming).

    A frame is treated as batch data when it has a ``unixReviewTime`` column
    and as streaming data otherwise.

    Batch data:
      * ``dateReview`` is derived from ``unixReviewTime`` (seconds).
      * If the latest review date is before ``first_streaming_date``, every
        date is shifted forward by the same whole number of days so the latest
        one lands on the day before ``first_streaming_date``.
      * ``unixReviewTime`` is removed and used as the timestamp part of
        ``reviewID``.

    Streaming data:
      * ``internal_partition`` and ``partition_number`` are dropped.
      * ``dateReview`` is set to today's date.

    Both:
      * ``overall`` is converted to int.
      * ``reviewID`` is created via ``get_review_id``.
      * ``vote`` nulls become 0, thousands separators are removed, and the
        column is converted to int.
      * ``verified`` strings ``'true'`` / ``'false'`` become booleans.
      * ``image`` and ``style`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews. Modified in place; columns are added and removed, so the
        function cannot be applied twice to the same frame.
    first_streaming_date : datetime.date, optional
        First day covered by streaming data. Defaults to 2023-07-27.

    Returns
    -------
    None
    """
    if 'unixReviewTime' in df.columns:
        id_prefix = 'B'  # Batch data
        # Work on datetime64 values and only truncate to dates at the end:
        # shifting by whole days before or after truncation gives the same
        # dates, but avoids per-element arithmetic on Python date objects.
        review_dates = pd.to_datetime(df['unixReviewTime'].astype(int), unit='s')

        last_batch_date = review_dates.max().date()
        # Old batch data: slide it up to the present for the app demo.
        if last_batch_date < first_streaming_date:
            diff_days = first_streaming_date - last_batch_date - datetime.timedelta(days=1)
            review_dates = review_dates + diff_days
        df['dateReview'] = review_dates.dt.date
        timestamp = df.pop('unixReviewTime')
    else:
        id_prefix = 'S'  # Stream data
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x.
        timestamp = pd.Series(dtype=object)
        # Remove unnecessary columns
        df.drop(['internal_partition', 'partition_number'], axis=1, inplace=True)
        df['dateReview'] = datetime.date.today()

    df['overall'] = pd.to_numeric(df['overall']).astype(int)

    # Create column of review IDs. The timestamps are re-indexed 0..n-1 and the
    # result is assigned positionally, so a frame with a non-default index
    # (e.g. a filtered one) does not end up with NaN IDs through index alignment.
    review_ids = get_review_id(id_prefix, len(df), timestamp.reset_index(drop=True))
    df['reviewID'] = review_ids.to_numpy()

    # Replace nulls by 0s, strip thousands separators ("1,234") and cast to int
    df['vote'] = df['vote'].fillna('0').str.replace(',', '', regex=False).astype(int)

    # string verified to boolean
    df.loc[df['verified'] == 'true', 'verified'] = True
    df.loc[df['verified'] == 'false', 'verified'] = False

    # Remove unnecessary columns
    df.drop(['image', 'style'], axis=1, inplace=True)

In [6]:
# Normalize the streaming samples in place, then list the resulting columns.
transform(df_stream_raw_samples)
print(sorted(df_stream_raw_samples.columns))

['asin', 'dateReview', 'overall', 'reviewID', 'reviewText', 'reviewerID', 'reviewerName', 'summary', 'verified', 'vote']


In [7]:
# Normalize the batch reviews in place, then list the resulting columns
# (they should match the streaming columns printed above).
transform(batch_data_music_all)
print(sorted(batch_data_music_all.columns))

['asin', 'dateReview', 'overall', 'reviewID', 'reviewText', 'reviewerID', 'reviewerName', 'summary', 'verified', 'vote']


In [8]:
# Batch review dates, newest first, to check the date shift applied by transform().
batch_data_music_all['dateReview'].sort_values(ascending=False)

122219    2023-07-26
122220    2023-07-26
56173     2023-07-25
267113    2023-07-23
100861    2023-07-22
             ...    
94337     2007-03-28
244468    2006-12-26
244469    2006-06-12
2246      2005-08-07
195180    2005-01-30
Name: dateReview, Length: 291631, dtype: object

Prepare streaming data for batch ingestion

In [9]:
import numpy as np

# Simulated streaming schedule: the first day received `first_sampling`
# reviews, every following day received `normal_sampling` reviews.
first_sampling = 10223
first_streaming_date = pd.to_datetime('2023-07-27').date()
normal_sampling = 10000
num_samples = len(df_stream_raw_samples)

# Row positions at which each day after the first one starts; the final entry
# is the end (exclusive) of the last complete day.
samples = np.arange(first_sampling, num_samples+1, normal_sampling)
# One calendar day per consecutive pair of boundaries. Deriving the count from
# `samples` keeps the two in sync for any number of rows.
dates = pd.date_range('2023-07-28', periods=max(len(samples) - 1, 0))

# Positional, half-open slices: each row is written exactly once and the
# result does not depend on the frame's index labels. (Label-based .loc slices
# include their end point, so consecutive day ranges would overlap by one row.)
date_col = df_stream_raw_samples.columns.get_loc('dateReview')
df_stream_raw_samples.iloc[:first_sampling, date_col] = first_streaming_date

# Rows beyond the last complete day, if any, keep the date set by transform().
for day, start, stop in zip(dates, samples[:-1], samples[1:]):
    df_stream_raw_samples.iloc[start:stop, date_col] = day.date()

In [10]:
# Save the dated streaming samples; the file name embeds the row count.
df_stream_raw_samples.to_parquet(f'stream_data_{len(df_stream_raw_samples)}.parquet')

Filter by `Musical Instruments` category

In [11]:
# Load the product IDs ('asin') used below to keep only Musical Instruments reviews.
df_musical_instruments_asins = pd.read_parquet('../../musical_instruments_asins.parquet')

In [12]:
# Keep only the streaming reviews whose product appears in the ASIN list,
# then save them with the row count embedded in the file name.
music_mask = df_stream_raw_samples['asin'].isin(df_musical_instruments_asins['asin'])
df_stream_data_music = df_stream_raw_samples[music_mask]
df_stream_data_music.to_parquet(f'stream_data_music_{len(df_stream_data_music)}.parquet')

Concatenate batch and streaming data for batch ingestion into the database

In [13]:
# Stack batch rows on top of the filtered streaming rows with a fresh 0..n-1
# index, save the result (row count in the file name) and display it.
full_dataset = pd.concat(
    [batch_data_music_all, df_stream_data_music],
    ignore_index=True,
)
full_dataset.to_parquet(f'batch_stream_music_data_{len(full_dataset)}.parquet')
full_dataset

Unnamed: 0,asin,overall,reviewText,reviewerID,reviewerName,summary,verified,vote,dateReview,reviewID
0,B00N9330MU,4,When my & and 5 yr old daughters opened it at ...,A3V0480PVRUZGP,Cluckadoodledoo,... yr old daughters opened it at Christmas I ...,False,9,2019-11-13,B1421452800T0000000000
1,B00N9330MU,4,My daughter loves this machine. The volume is ...,ANM0OAW6DMOSD,Misty Christian,Four Stars,True,9,2019-08-02,B1412553600T0000000001
2,B00QN040IO,5,"Wonderful, great price.",A2ECJYRPAU0TDU,Skullified,Five Stars,True,0,2021-06-13,B1471392000T0000000002
3,B00WTV515M,4,I bought this as a gift for an SRV fan and he ...,A3KP2K3QQTBE6,tina kraybill,... as a gift for an SRV fan and he loved it. ...,True,0,2023-02-01,B1523059200T0000000003
4,B00WTV515M,5,This is a great display piece. If you are a SR...,AT1GTMI5LASMZ,Waxpoet,Great addition or start to mini collection,True,0,2022-03-20,B1495584000T0000000004
...,...,...,...,...,...,...,...,...,...,...
335050,B000EELB2S,5,I have this attached to my music stand. It el...,A2QYJC77CW9BAS,David J. Milne Jr.,Great Light For Full Music Stands,True,0,2023-08-04,S1691267347T0000090213
335051,B000EELB2S,2,"Caveat emptor...there are several ""Universal"" ...",A1QOZ2N5RLIY32,phidler,Not all lights are created equal,False,0,2023-08-04,S1691267347T0000090215
335052,B000EEN9SC,5,Wonderful. It is very easy to adjust and is mu...,A3FHN66HWGRGZQ,The Bear,Wonderful. It is very easy to adjust and is ...,True,0,2023-08-04,S1691267347T0000090217
335053,B000EEN9SC,5,good for school,AUX2CQ0NAXOE3,Lori Hagemann,Five Stars,True,0,2023-08-04,S1691267347T0000090219
