In [1]:
import pandas as pd
import nltk
from pandarallel import pandarallel
import csv
import time
from IPython.display import clear_output
import logging
import boto3
from botocore.exceptions import ClientError
from tqdm.notebook import tqdm

# Set up pandarallel so DataFrames expose `parallel_apply` (used by preproc
# below); progress_bar=True renders a per-worker progress widget.
pandarallel.initialize(progress_bar=True)
# Fetch the 'punkt' tokenizer data for NLTK (a no-op when it is already
# present locally, as the captured output below shows).
nltk.download('punkt')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/spenceradams/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    Failures reported by S3 are logged and signalled through the return
    value instead of being raised.

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # Local import: the exception class lives in boto3 itself, which this
    # notebook already depends on.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        # upload_file returns None, so there is nothing worth binding.
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        # The managed transfer wraps client errors in S3UploadFailedError,
        # so catching ClientError alone would let upload failures propagate
        # instead of returning False as documented.
        logging.error(e)
        return False
    return True

def tokenize(row):
    """Tokenize the lowercased ``text`` field of a single row.

    :param row: Mapping-like object (e.g. a DataFrame row) with a ``'text'`` entry
    :return: List of tokens produced by ``nltk.word_tokenize``; an empty list
        when the text is missing or not a string
    """
    text = row['text']

    # Empty CSV cells are read back as NaN (a float), which has no .lower();
    # treat any non-string text as "no tokens" rather than failing the whole
    # parallel apply on one bad row.
    if not isinstance(text, str):
        return []

    return nltk.word_tokenize(text.lower())
    
def save_csv(output_file, tokens):
    """Write token rows to a space-delimited file and upload it to S3.

    Each element of ``tokens`` becomes one line, with its items separated by
    a single space. The file is then uploaded to the ``yelp-dataset-pt-9``
    bucket under ``spencer/data/sentiment/en/fasttext/``.

    :param output_file: Local path to write; also used as the tail of the S3 key
    :param tokens: Iterable of rows, each an iterable of string tokens
    """
    # newline='' lets the csv writer control line endings itself (otherwise
    # '\n' is translated to '\r\n' on Windows); utf-8 is set explicitly so the
    # output does not depend on the platform's default encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(tokens)

    bucket = "yelp-dataset-pt-9"
    object_name = f'spencer/data/sentiment/en/fasttext/{output_file}'
    # upload_file signals failure through its return value; surface it instead
    # of dropping it silently. The local file is still left in place.
    if not upload_file(output_file, bucket, object_name):
        logging.error("Upload of %s to s3://%s/%s failed", output_file, bucket, object_name)

def preproc(filename):
    """Tokenize one sentiment CSV and write a labelled token file per label column.

    Reads ``{filename}.csv`` from the S3 sentiment folder, tokenizes every row
    in parallel, then for each label column prepends a ``__label__<value>``
    item to the row's tokens and hands the result to ``save_csv``. Progress
    and timing are printed along the way.

    :param filename: Base name (without ``.csv``) of the dataset to process
    """
    started_at = time.time()
    source_uri = f"s3://yelp-dataset-pt-9/spencer/data/sentiment/en/{filename}.csv"
    df = pd.read_csv(source_uri)
    print(f'{filename} has {len(df):,} rows')

    label_columns = ['stars', 'pos_neg_neu', 'pos_neg_3_is_pos', 'pos_neg_3_is_neg']

    tokens = df.parallel_apply(tokenize, axis=1)
    print("Done tokenizing, time to apply to each label.")

    for column in tqdm(label_columns):
        # 'stars' is cast to int before str -- presumably so the label reads
        # "4" rather than "4.0"; the other columns are stringified as-is.
        raw_values = df['stars'].astype(int) if column == 'stars' else df[column]
        prefixed = '__label__' + raw_values.astype(str)
        # Splitting on a space turns each label string into a list, so adding
        # the token Series concatenates label and tokens row by row.
        labelled_rows = prefixed.str.split(" ") + tokens
        print(labelled_rows.tail())
        print("Saving to csv.")
        out_name = f"{column}_tokens_{filename}.csv"
        save_csv(out_name, labelled_rows.to_list())
        print("CSV saved.")

    elapsed = time.time() - started_at
    print(f'Took {elapsed:.2f} seconds')

In [9]:
# Base names (no ".csv") of the datasets to run through preproc.
files = ['train_bal', 'train_same_size_as_bal', 'test_small']

In [10]:
# Process each dataset in turn.
for filename in files:
    preproc(filename)
    # wait=True defers the clear until new output arrives, so the previous
    # file's log is replaced by the next one's; only the last file's output
    # stays visible once the loop finishes.
    clear_output(wait=True)

test_small has 200,000 rows


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=50000), Label(value='0 / 50000')))…

Done tokenizing, time to apply to each label.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

199995    [__label__4, the, ribs, are, excellent, (, so,...
199996    [__label__4, this, place, is, great, !, noodle...
199997    [__label__5, love, the, food, here, ., the, gr...
199998    [__label__2, the, food, was, okay, ,, but, two...
199999    [__label__3, great, location, in, the, peoria,...
dtype: object
Saving to csv.
CSV saved.
199995    [__label__POSITIVE, the, ribs, are, excellent,...
199996    [__label__POSITIVE, this, place, is, great, !,...
199997    [__label__POSITIVE, love, the, food, here, ., ...
199998    [__label__NEGATIVE, the, food, was, okay, ,, b...
199999    [__label__NEUTRAL, great, location, in, the, p...
dtype: object
Saving to csv.
CSV saved.
199995    [__label__POSITIVE, the, ribs, are, excellent,...
199996    [__label__POSITIVE, this, place, is, great, !,...
199997    [__label__POSITIVE, love, the, food, here, ., ...
199998    [__label__NEGATIVE, the, food, was, okay, ,, b...
199999    [__label__POSITIVE, great, location, in, the, ...
dtype: object
Saving