In [2]:
import io
import itertools
import json
import operator
import os
import random
import sys

import feather
import joblib
import nltk as nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, History, CSVLogger
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# English stop-word list from the NLTK corpus, de-duplicated into a set.
stop_words = {*stopwords.words('english')}

In [2]:
# Directory containing the pre-trained GloVe embedding files.
GLOVE_DIR = '../../data/embeddings'

# Length every tokenised review is padded / truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 256

# Value handed to the Keras Tokenizer as `num_words`.
MAX_NUM_WORDS = 20_000

# Embedding width; presumably matches the 100-d GloVe file loaded below -- TODO confirm.
EMBEDDING_DIM = 100

# NOTE(review): not referenced by any visible cell; confirm where this is consumed.
VALIDATION_SPLIT = 0.2

In [None]:
print('Indexing word vectors.')

# Maps each GloVe token to its float32 coefficient vector.
embeddings_index = {}

# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids relying on the platform default (the GloVe text
# files are distributed as UTF-8).
with open(os.path.join(GLOVE_DIR, './glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <vN>", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Loading

In [None]:
def minority_balance_dataframe_by_multiple_categorical_variables(df, categorical_columns=None, downsample_by=0.1,
                                                                 random_state=None):
    """
    Downsample {df} so that every combination of categorical values keeps the same number of rows.

    The smallest group determines the per-group sample size:
    ``int(smallest_group_size * downsample_by)`` rows are drawn without replacement from each group.

    :param df: pandas.DataFrame
    :param categorical_columns: iterable of categorical columns names contained in {df}
    :param downsample_by: fraction of the smallest group's size to keep per group
        (1.0 keeps the whole minority group)
    :param random_state: optional int seed or numpy random state forwarded to ``DataFrame.sample``
        for reproducible sampling; ``None`` (default) samples from numpy's global random state
    :return: balanced pandas.DataFrame, sorted by the original row index, with the categorical
        columns placed first
    :raises ValueError: if no categorical columns are given or one of them is missing from {df}
    """
    # Materialise the iterable once so tuples / generators behave like lists.
    columns = list(categorical_columns) if categorical_columns is not None else []
    if not columns or not all(c in df.columns for c in columns):
        raise ValueError('Please provide one or more columns containing categorical variables')

    grouped = df.groupby(columns)

    # Unobserved categorical levels are reported with size 0; ignore them so
    # they cannot force the per-group sample size down to zero.
    group_sizes = grouped.size()
    group_sizes = group_sizes[group_sizes > 0]
    minority_class_combination_count = int(group_sizes.min() * downsample_by) if len(group_sizes) else 0

    # A single shared RandomState gives every group its own draw from one
    # seeded stream instead of re-using the same seed per group.
    rng = np.random.RandomState(random_state) if isinstance(random_state, int) else random_state

    # Iterating the groups (instead of groupby.apply) keeps the grouping
    # columns and the original row index intact regardless of how many
    # columns are grouped on, so no 'level_N' bookkeeping is needed.
    samples = [
        group.sample(minority_class_combination_count, random_state=rng)
        for _, group in grouped
        if len(group)
    ]
    balanced = pd.concat(samples) if samples else df.iloc[:0]
    balanced = balanced.sort_index()

    # Categorical columns first, remaining columns in their original order
    # (the layout the previous implementation produced).
    remaining = [c for c in balanced.columns if c not in columns]
    balanced = balanced[columns + remaining]

    # The previous implementation exposed an unnamed row index under the
    # auto-generated name 'level_1'; keep that name so callers that persist
    # `reset_index()` output continue to see the same column.
    if balanced.index.nlevels == 1 and balanced.index.name is None:
        balanced = balanced.rename_axis('level_1')

    return balanced


In [None]:
%%time
reviews = ''

with open('../../data/yelp-dataset/review.json','r') as f:
    for line in f.readlines()[0:3000000]:
        reviews += line

df_reviews = pd.read_json(reviews, lines=True)
del reviews
# df_reviews =  pd.read_json('../../data/yelp-dataset/review.json', lines=True, encoding='utf-8')

In [None]:
%%time
df_reviews['len'] = df_reviews.text.str.len()

In [None]:
%%time
df_reviews = df_reviews[df_reviews['len'].between(10, 4000)]

In [None]:
# Balance the dataset: draw the same number of reviews for every star rating,
# sized at 10% of the rarest rating's row count.
df_rev_balanced = minority_balance_dataframe_by_multiple_categorical_variables(
    df_reviews,
    categorical_columns=['stars'],
    downsample_by=0.1
)

In [None]:
# Move the row index into a regular column, then write the frame to a Feather file.
(
    df_rev_balanced
    .reset_index()
    .to_feather('../../data/balanced_reviews.feather')
)

In [None]:
# !rm -rf ../../assets/balanced_reviews.feather

# Preprocessing

In [None]:
# Create the output directory (and any missing parents) without depending on a
# POSIX shell; exist_ok=True makes re-running the cell a no-op, like `mkdir -p`.
os.makedirs('../../assets/sentiment_tensorflow', exist_ok=True)

In [23]:
# Fit a Keras Tokenizer (num_words=MAX_NUM_WORDS) on the balanced review texts
# and persist it to disk with joblib.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df_rev_balanced['text'].tolist())
joblib.dump(tokenizer, '../../assets/sentiment_tensorflow/tokenizer.pickle')

# (word, index) pairs ordered by ascending index.
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=lambda pair: pair[1])

# Integer-encode the reviews, then pad / truncate each one to MAX_SEQUENCE_LENGTH.
seqs = tokenizer.texts_to_sequences(df_rev_balanced['text'].values)
X = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)

# Binary target: 1 for ratings above 3 stars, 0 otherwise.
Y = df_rev_balanced['stars'].values.astype(int)
Y_cat = [int(y > 3) for y in Y]

# Features and labels must describe the same number of reviews.
assert len(X) == len(Y)

In [24]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y_cat,
    test_size=0.2,
    random_state=9,
)

In [25]:
# Write each split as its own DataFrame under a matching key in one HDF5 store.
with pd.HDFStore('../../data/yelp_x_y_test_train.h5') as h:
    for _key, _split in (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    ):
        h[_key] = pd.DataFrame(_split)

In [7]:
# Preprocessing settings to be saved alongside the tokenizer.
configuration = dict(MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)

In [8]:
# Persist the preprocessing configuration as JSON. `json` must be imported in
# the notebook's import cell -- without it this cell raises NameError on a
# fresh kernel. The explicit encoding avoids relying on the platform default.
with open('../../assets/sentiment_tensorflow/configuration.json', 'w', encoding='utf-8') as f:
    json.dump(configuration, f)