# Batched preprocessing

Based on the Google Colab notebook, this notebook preprocesses the dataset using batched functions and multiprocessing. The output is split across multiple files named `train_###.csv`, one per batch. The notebook can be re-run at any time: batches that have already been processed are skipped.

In [None]:
# Environment setup: upgrade pip first, then install TensorFlow.
!pip install --upgrade pip
!pip install tensorflow
# Optional GPU-enabled spaCy install (the `cuda101` extra); left disabled.
#!pip install -U spacy[cuda101]

In [None]:
# Download the `en_core_web_sm` spaCy model that this notebook loads with spacy.load().
!python3 -m spacy download en_core_web_sm
# Then restart the runtime before running the remaining cells.

In [None]:
import os

# Work from the project folder on the mounted Drive so that the relative
# dataset paths used by the following cells resolve inside it.
os.chdir('/content/drive/MyDrive/colab-data/ai-final/')
print("Current directory is {}".format(os.getcwd()))

Current directory is /content/drive/MyDrive/colab-data/ai-final


In [None]:
import os
if not os.path.isdir('amazon_review_full_csv'):
    !wget https://s3.amazonaws.com/fast-ai-nlp/amazon_review_full_csv.tgz
    !tar -xvzf amazon_review_full_csv.tgz
else:
    print('Dataset has been already downloaded')

Dataset has been already downloaded


In [None]:
import re
import datetime
import pandas as pd
import spacy
# Ask spaCy to use the GPU and report which mode was selected.
if spacy.prefer_gpu():
    print('Spacy With GPU')
else:
    print('Spacy Without GPU')

Spacy Without GPU


In [None]:
# Load the spaCy pipeline that tokenize() runs on every review.
nlp = spacy.load("en_core_web_sm")
# Extend the default stop-word set in place with domain words — presumably so
# that token.is_stop filters them out in tokenize(); TODO confirm this takes
# effect when done after spacy.load() on the installed spaCy version.
nlp.Defaults.stop_words |= { "book", "movie", "film" }

def tokenize(text):
    """Reduce a piece of text to a space-separated string of lemmas.

    Digits are removed first, then the text is run through the module-level
    spaCy pipeline ``nlp``. A token's lemma is kept only when the token is
    alphabetic, is not punctuation, is not a stop word, and the stripped
    lemma is non-empty.

    Args:
        text: The text to process. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        str: The kept lemmas joined by single spaces; ``''`` when the input
        is missing or nothing survives the filters.
    """
    # A missing value carries no text. Stringifying it would emit the
    # literal token "nan" (or "None") into the output.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # The original compared type(text) with the string 'str', which is
    # always unequal; isinstance() is the intended check.
    if not isinstance(text, str):
        text = str(text)

    # Remove All numbers
    text = re.sub(r"\d", '', text)

    # Strip each lemma once, then drop the ones that end up empty.
    lemmas = (token.lemma_.strip() for token in nlp(text)
              if token.is_alpha
              and not token.is_punct
              and not token.is_stop)
    return ' '.join(lemma for lemma in lemmas if lemma)

In [None]:
def preprocess_data(seq, skips, rows):
    """Preprocess one batch of the training CSV and save it as its own file.

    Reads ``rows`` rows of ``amazon_review_full_csv/train.csv`` after
    skipping the first ``skips`` rows, joins title and text, lower-cases the
    result, runs ``tokenize`` on it, and writes the ``score``/``text``
    columns to ``amazon_review_full_csv/train/train_<seq>.csv`` (``seq``
    zero-padded to three digits). If that file already exists the batch is
    skipped.

    Args:
        seq: Batch number used in the output file name and log messages.
        skips: Number of leading rows of the source CSV to skip.
        rows: Maximum number of rows to read for this batch.

    Returns:
        None. The result is the file written to disk.
    """
    tag = str(seq).zfill(3)
    out_dir = 'amazon_review_full_csv/train'
    out_path = f'{out_dir}/train_{tag}.csv'

    # Skip Condition
    if os.path.isfile(out_path):
        print(f'{datetime.datetime.now()} - {tag} is skipped')
        return

    # Load Data
    data = pd.read_csv('amazon_review_full_csv/train.csv', header=None, skiprows=skips, nrows=rows)
    data.columns = ['score', 'title', 'text']

    # Concatenate Title and Text
    # A missing title or text is NaN, and NaN + str is NaN, which would
    # discard the whole review. Fill with '' so the present half survives;
    # astype(str) guards against cells that were parsed as numbers.
    title = data['title'].fillna('').astype(str)
    body = data['text'].fillna('').astype(str)

    # Lower Casing
    data['text'] = (title + ' ' + body).str.lower()
    data = data.drop(columns='title')

    # Tokenize
    data['text'] = data['text'].apply(tokenize)

    # Save
    # Create the output folder on first use (exist_ok: workers may race).
    os.makedirs(out_dir, exist_ok=True)
    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a partial file that the skip check would treat as done.
    tmp_path = out_path + '.tmp'
    data.to_csv(tmp_path, index=False)
    os.replace(tmp_path, out_path)
    print(f'{datetime.datetime.now()} - {tag} is completed')

In [None]:
from multiprocessing import Pool

TOTAL_ROWS = 3000000 # train: 3000000, test: 650000
BATCH_SIZE =   10000
N_WORKERS  =      10 # number of worker processes in the pool

# Small values for a quick trial run:
#TOTAL_ROWS = 100
#BATCH_SIZE =  10

if __name__ == '__main__':
    # Ceiling division: when TOTAL_ROWS is not a multiple of BATCH_SIZE the
    # trailing partial batch is still processed instead of silently dropped.
    n_batches = -(-TOTAL_ROWS // BATCH_SIZE)
    print(f'{datetime.datetime.now()} - start preprocessing. {n_batches} times.')

    # One (seq, skips, rows) tuple per batch. seq is 1-based because it is
    # used in the output file name; the last batch is capped so that it
    # never reads past TOTAL_ROWS.
    jobs = [(batch + 1,
             batch * BATCH_SIZE,
             min(BATCH_SIZE, TOTAL_ROWS - batch * BATCH_SIZE))
            for batch in range(n_batches)]

    with Pool(N_WORKERS) as p:
        p.starmap(preprocess_data, jobs)

    # To force a full re-run, clear the existing outputs first:
    #!rm -rf amazon_review_full_csv/train/*

2021-12-03 01:22:41.257545 - start preprocessing. 300 times.
2021-12-03 01:22:42.223472 - 001 is skipped
2021-12-03 01:22:42.228293 - 017 is skipped
2021-12-03 01:22:42.225254 - 009 is skipped
2021-12-03 01:22:42.239214 - 033 is skipped
2021-12-03 01:22:42.243805 - 018 is skipped
2021-12-03 01:22:42.245097 - 041 is skipped
2021-12-03 01:22:42.251276 - 010 is skipped
2021-12-03 01:22:42.260568 - 073 is skipped
2021-12-03 01:22:42.254912 - 049 is skipped
2021-12-03 01:22:42.257905 - 065 is skipped
2021-12-03 01:22:42.263033 - 002 is skipped
2021-12-03 01:22:42.285384 - 035 is skipped
2021-12-03 01:22:42.265719 - 034 is skipped
2021-12-03 01:22:42.255054 - 057 is skipped
2021-12-03 01:22:42.268154 - 011 is skipped
2021-12-03 01:22:42.266911 - 042 is skipped
2021-12-03 01:22:42.276668 - 050 is skipped
2021-12-03 01:22:42.235061 - 025 is skipped
2021-12-03 01:22:42.269672 - 074 is skipped
2021-12-03 01:22:42.287777 - 058 is skipped
2021-12-03 01:22:42.286720 - 043 is skipped
2021-12-03 01:2