In [None]:
# This notebook currently does the following:
# 1. Loads data in chunks from a gzip-compressed JSON-lines file
# 2. Balances the counts of negative and positive reviews in each chunk, then saves each chunk to CSV as a partition

# To implement in transformation:
# 1. TF-IDF of text
# 2. Further cleaning: removing unwanted characters and symbols
# 3. Feature engineering: 
#    - length of text
# 4. Transform "unixReviewTime" to datetime format
# 5. Remove punctuation from text features (already applied to "reviewText" and "summary" in this notebook)

In [None]:
import pandas as pd
import numpy as np
import pathlib
import re

# Fix NumPy's global RNG so the random down-sampling of reviews is repeatable between runs.
np.random.seed(seed=0)

In [None]:
# How many chunks to read from the input file.
CHUNKS_TO_LOAD = 1
# Rows per chunk; total rows processed = CHUNKS_TO_LOAD * CHUNKSIZE.
CHUNKSIZE = 10_000
# Absolute path to the input data file (left empty here; must be filled in before running).
DATA_PATH = ""

In [None]:
# Load amazon fashion reviews dataset
# Columns kept from each loaded chunk; every other column is discarded.
features = [
    "overall",
    "reviewText",
    "summary",
    "unixReviewTime",
    "reviewerID",
    "reviewerName",
]

# Open the amazon fashion reviews dataset (gzip-compressed, one JSON record per line).
# Because `chunksize` is given, this yields CHUNKSIZE-row DataFrames one at a time
# instead of loading the whole file at once.
reader = pd.read_json(DATA_PATH, lines=True, compression="gzip", chunksize=CHUNKSIZE)

In [None]:
def remove_punctuation_lowercase(text):
    """Replace every non-ASCII-letter character with a space and lowercase the result.

    Anything outside ``a-z``/``A-Z`` (punctuation, digits, whitespace, accented
    letters, ...) becomes a single space, one space per replaced character, so
    the output has the same length as the input.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lowercased string.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    return letters_only.lower()

In [None]:
# Text columns that get punctuation removed and are lowercased before saving.
text_features = ["reviewText", "summary"]

# Output files are named after the input file: everything before the first "."
# of its file name, followed by the partition number.
path = pathlib.Path(DATA_PATH)
source_stem = path.name.split(".")[0]

# Make sure the target directory exists so that to_csv does not fail on a fresh checkout.
output_dir = pathlib.Path("../../data/transformed")
output_dir.mkdir(parents=True, exist_ok=True)

for chunk_n in range(1, CHUNKS_TO_LOAD + 1):
    # Stop cleanly when the file holds fewer chunks than CHUNKS_TO_LOAD.
    chunk = next(reader, None)
    if chunk is None:
        print(f"Input exhausted after {chunk_n - 1} chunk(s); stopping.")
        break

    # Remove redundant columns
    df = chunk.loc[:, features]

    # Add column that is parsed date of unixReviewTime column. This is done before
    # balancing so the column is carried over into df_balanced and hence the CSV.
    df["reviewDate"] = pd.to_datetime(df["unixReviewTime"], unit="s")

    # Fix imbalance between scores of 4, 5 and 2, 1.
    # .get(..., 0) tolerates chunks in which a rating does not occur at all.
    value_counts = df["overall"].value_counts()
    count_positive = value_counts.get(5, 0) + value_counts.get(4, 0)
    count_neg = value_counts.get(2, 0) + value_counts.get(1, 0)
    # Never negative: if negatives already outnumber positives, nothing is dropped.
    rows_to_drop = max(int(count_positive - count_neg), 0)

    # Equally many negative as positive samples: randomly drop surplus positive rows.
    positive_index = df.index[df["overall"].isin([4, 5])]
    df_balanced = df.drop(
        index=np.random.choice(positive_index, rows_to_drop, replace=False)
    )

    # Remove punctuation and convert text to lowercase. Missing values become ""
    # and any non-string values are stringified so the regex never sees a non-str.
    for feature in text_features:
        df_balanced[feature] = (
            df_balanced[feature]
            .fillna("")
            .astype(str)
            .apply(remove_punctuation_lowercase)
        )

    # Construct the partition name from the old filename and save.
    file_name = f"{source_stem}_partition_{chunk_n}"
    df_balanced.to_csv(output_dir / f"{file_name}.csv")
