In [1]:
import pandas as pd
import nltk
from pandarallel import pandarallel
import csv
import time
from IPython.display import clear_output

# Start pandarallel's worker pool so DataFrame.parallel_apply is available;
# progress_bar=True makes each parallel_apply call render progress widgets.
pandarallel.initialize(progress_bar=True)
# Fetch the NLTK 'punkt' tokenizer data (presumably what nltk.word_tokenize
# needs at runtime -- confirm for the installed NLTK version). This is the last
# expression in the cell, so its return value is what the cell displays.
nltk.download('punkt')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def tokenize(row):
    """Turn one review row into a fastText-style list of tokens.

    The star rating picks the sentiment label: 1 or 2 -> ``NEGATIVE``,
    3 -> ``NEUTRAL``, and every other value -> ``POSITIVE``. The review text
    is lower-cased and split with ``nltk.word_tokenize``.

    Args:
        row: Mapping-like object (for example a DataFrame row) that provides
            ``row['stars']`` (anything ``int()`` accepts) and ``row['text']``.

    Returns:
        list[str]: ``['__label__<LABEL>', token, token, ...]``. When the text
        is missing or empty the list contains only the label.

    Raises:
        ValueError, TypeError: If ``row['stars']`` cannot be converted by
            ``int()``.
    """
    stars = int(row['stars'])
    text = row['text']

    if stars in (1, 2):
        label = 'NEGATIVE'
    elif stars == 3:
        label = 'NEUTRAL'
    else:
        # Any rating other than 1, 2 or 3 ends up here, not only 4 and 5.
        label = 'POSITIVE'

    if not isinstance(text, str):
        # An empty CSV cell is read by pandas as NaN (a float), which has no
        # .lower(); treat missing text as empty instead of crashing the run.
        text = '' if pd.isna(text) else str(text)

    # Build the list directly rather than joining and re-splitting on ' ',
    # which produced a stray '' token (a trailing space on disk) for empty text.
    return [f"__label__{label}", *nltk.word_tokenize(text.lower())]
    
def save_csv(output_file, tokens):
    """Write token rows to ``output_file``, one space-separated row per line.

    Args:
        output_file: Path of the file to create (overwritten if it exists).
        tokens: Iterable of rows, each an iterable of string tokens.

    Notes:
        Rows go through ``csv.writer`` with its default minimal quoting, so a
        token containing a space, a double quote or a line break is written
        wrapped in double quotes.
    """
    # newline='' is what the csv module asks for, so the '\n' terminator is
    # written untranslated on every platform. The explicit UTF-8 encoding keeps
    # non-ASCII tokens from depending on the machine's locale.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(tokens)

def preproc(filename, source_prefix="s3://yelp-dataset-pt-9/spencer/data/sentiment/en"):
    """Read one review CSV, tokenize every row, and write the tokens locally.

    Reads ``<source_prefix>/<filename>.csv`` with pandas, applies ``tokenize``
    to each row through pandarallel's ``parallel_apply``, and saves the result
    to ``tokens_<filename>.csv`` in the current working directory. The row
    count and the elapsed time are printed.

    Args:
        filename: Base name of the dataset, without the ``.csv`` extension.
        source_prefix: Location that holds the input CSV. Defaults to the
            S3 prefix this notebook was written against, so existing
            ``preproc(filename)`` calls behave as before.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments during a long run.
    start = time.perf_counter()
    base = source_prefix.rstrip('/')
    path = f"{base}/{filename}.csv"
    df = pd.read_csv(path)
    print(f'{filename} has {len(df):,} rows')

    # axis=1 hands each row to tokenize; the result is a Series of token lists.
    tokens = df.parallel_apply(tokenize, axis=1)
    save_csv(f"tokens_{filename}.csv", tokens.to_list())

    print(f'Took {time.perf_counter() - start:.2f} seconds')

In [4]:
# Base names (no extension) of the dataset splits handled by this notebook.
files = [
    'train_bal',
    'train_same_size_as_bal',
    'test_small',
]

In [5]:
# Tokenize every dataset listed in `files`, one after another.
# clear_output(wait=True) postpones the clear until new output arrives, so the
# cell ends up showing only the most recently processed file's row count,
# progress widget and timing; the lines for earlier files are replaced.
for filename in files:
    preproc(filename)
    clear_output(wait=True)

test_small has 200,000 rows


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=50000), Label(value='0 / 50000')))…

Took 99.32 seconds


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# cell with the other imports so dependencies are visible up front.
import sagemaker

# SageMaker session used later for the S3 uploads (sess.upload_data).
# Presumably picks up AWS credentials and region from the environment -- confirm.
sess = sagemaker.Session()

In [7]:
# Destination S3 bucket for the uploaded token files.
bucket = "yelp-dataset-pt-9"

In [8]:
# Upload each locally written tokens_<name>.csv file to the bucket under the
# fasttext/ key prefix. The prefix has no placeholders, so it is a plain
# string literal rather than an f-string.
for filename in files:
    sess.upload_data(
        path=f'tokens_{filename}.csv',
        bucket=bucket,
        key_prefix='spencer/data/sentiment/en/fasttext',
    )