In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import multiprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses, Sequential
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

In [2]:
#Predefined
data_file = 'train_data.csv'
WORD_DEPTH = 300
EPOCHS = 500

In [3]:
data = pd.read_csv('train_data.csv')

In [4]:
# Check out/ Clean Data

In [5]:
data['Sentiment'].isnull().sum()

20

In [6]:
#Pretty low, so we will drop NAs
data = data.dropna()

In [7]:
data.shape

(895, 6)

Convert data to useable form, meaning:
1. Convert Sentiment to numpy
2. Eliminate zeros, as that is "inconsequential" data
3. Break into batches/testing/validation

In [8]:
tonumpy = lambda x : np.array(float(x)).reshape([1])
data['Sentiment'] = data['Sentiment'].apply(tonumpy)

In [9]:
data = data[data['Sentiment'] != 0]

In [10]:
data = data.sample(frac=1)

In [11]:
data = data.reset_index()

In [12]:
data = data.drop('index', axis = 1)

In [13]:
data_len = data.shape[0]

In [14]:
training = int(data_len * 0.8)
valid = int(training + data_len * 0.1)

In [15]:
data['text'] = data['text'].apply(lambda x: x.lower())

In [16]:
data['text'] = data['text'].apply(lambda x: word_tokenize(x))

# Implement SkipGram

In [17]:
num_words = sum([len(string) for string in data['text']])
num_words

29315

In [18]:
w2v = Word2Vec(data['text'],size=WORD_DEPTH, min_count =1, iter = 20, workers = 8)

In [21]:
w2v.wv.save_word2vec_format('w2v.model')

In [19]:
data['text'] = data['text'].apply(lambda x: w2v.wv[x])

# Pad Data

In [20]:
MAX_LEN = max(data['text'], key = lambda x : x.shape[0]).shape[0]
MAX_LEN

399

In [21]:
def pad(x):
    if x.shape[0] < MAX_LEN:
        padding = np.zeros([MAX_LEN - x.shape[0], 300])
        x = np.concatenate([x, padding])
        return x
    else:
        return x

In [22]:
data['text'] = data['text'].apply(pad)

# Split for training

In [23]:
train_data = data.iloc[:training, :]
validation_data = data.iloc[training:valid, :]
test_data = data.iloc[valid:, :]

In [24]:
def pd_to_data(d):
    x = np.stack(d['text'].to_list(), 0)
    y = np.concatenate(d['Sentiment'].to_list())
    return (x, y)

In [25]:
batches = list()
increment = int(train_data.shape[0]/8)
start_idx = 0
end_idx = int(train_data.shape[0]/8)
for i in range(8):
    x = np.stack(train_data['text'][start_idx:end_idx].to_list(), 0)
    
    y = np.concatenate(train_data['Sentiment'][start_idx:end_idx].to_list())
    start_idx += increment
    end_idx += increment
    batches.append({'x':x, 'y':y})
BATCH_SIZE = increment

In [26]:
batches[0]['x'].shape

(72, 399, 300)

# Build Model

In [27]:
model = Sequential()

In [28]:
model.add(layers.Conv1D(4, 100, activation='relu', input_shape = (MAX_LEN, WORD_DEPTH)))

In [29]:
model.add(layers.MaxPool1D())

In [30]:
model.add(layers.Dropout(0.5))

In [31]:
#Not in original model, but I need 1 d output
model.add(layers.Flatten())

In [32]:
model.add(layers.Dense(100))

In [33]:
model.add(layers.Dense(1, activation = 'tanh'))

In [34]:
model.build()

In [35]:
model.compile(optimizer=keras.optimizers.Adadelta(), loss=losses.MeanSquaredError())

In [36]:
model.load_weights('model.h5')

In [37]:
validation_train = pd_to_data(validation_data)

In [41]:

for i in tqdm(range(1, 4)):
    for batch in batches:
        model.load_weights('model.h5')
        model.fit(x = batch['x'],y = batch['y'], batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=validation_train, verbose = 0)
        model.save('model.h5')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [42]:
model.save('model.h5')

In [43]:
test = pd_to_data(test_data)

In [44]:
y_pred = model.predict(test[0])

In [45]:
mean_squared_error(test[1], y_pred)

0.2916023192162912