In [None]:
!pip show numpy
!pip install keras
!pip install protobuf
import pandas as pd
import numpy as np

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import loadtxt

#Read as csv
#skip bad lines instead of using bad data
content = pd.read_csv('data-file.csv', sep='#', on_bad_lines='skip')

#Get the max amount of words in one comment
print("Measuring the comment with the most words...")

#Drop missing reviews (NaN is a float and has no len) and force str so split() is safe
reviews = content['reviewText'].dropna().astype(str)

#Rank by whitespace-separated word count; len() on the raw string would count characters
max_comment = max(reviews, key=lambda text: len(text.split()))
print(max_comment)

print("Biggest comment has number of words:")
max_comment_word_count = len(max_comment.split())
#Kept for backward compatibility: length of that same comment in characters
max_comment_char_count = len(max_comment)
print(max_comment_word_count)

print("CSV has this many lines..")
print(len(content))

print("Train data..")
train_data = content[:64] #only the first 64 rows
print(train_data)

print("Test data..")
#NOTE(review): if the CSV has fewer than 128 rows, this slice overlaps train_data
test_data = content[-64:] #only the last 64 rows
print(test_data)

In [None]:
#Split each subset into its review-text column and its label column
#NOTE(review): the names are inverted w.r.t. the usual convention - *_y holds the
#review text and *_x holds the labelPos flag; later cells rely on this naming
train_y, train_x = train_data['reviewText'], train_data['labelPos']
print(train_y)
print(train_x)

test_y, test_x = test_data['reviewText'], test_data['labelPos']
print(test_y)
print(test_x)

print("Train-Data")
print(train_data)

In [None]:
#Show the number of entries in each of the two test columns
for test_column in (test_x, test_y):
    print(len(test_column))

In [None]:
#Settings for tokenization and padding
num_words = 1000       #max words for vocab
oov_token = '<UNK>'    #token used for out-of-vocab words

#pad and truncate on the same side
pad_type = trunc_type = 'post'

In [None]:
#Build the vocabulary from the training texts
tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
tokenizer.fit_on_texts(train_y)

#word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

#Turn the training texts into integer sequences
train_sequences = tokenizer.texts_to_sequences(train_y)

#The longest training sequence decides the padded length
maxlen = max(map(len, train_sequences))

#Pad the training sequences to that common length
train_padded = pad_sequences(
    train_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

print(train_sequences)
print(train_padded)

In [None]:
#vectorize train data
def vectorize(sequences, dimension = 137):
    """Encode index sequences as rows of zeros and ones.

    For the i-th item of ``sequences`` the i-th output row is set to 1 at the
    column position(s) given by that item and left at 0 everywhere else.

    Args:
        sequences: Sized iterable whose items are integer indices (a single
            index or a list of indices) valid for ``dimension`` columns.
        dimension: Number of columns of the returned array.

    Returns:
        A float ``numpy`` array of shape ``(len(sequences), dimension)``.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dimension))
    for row, indices in enumerate(sequences):
        encoded[row, indices] = 1
    return encoded

#Encode the label column; read it from the DataFrames (not from train_x/test_x)
#so re-running this cell never vectorizes already-vectorized arrays
#NOTE(review): *_x holds the labels and *_y the padded texts - inverted w.r.t. the
#usual features/labels naming; confirm this is intended before training
train_x = vectorize(train_data['labelPos']).astype("int64")
train_y = np.array(train_padded).astype("float64")

test_x = vectorize(test_data['labelPos']).astype("int64")
#test_y must come from the TEST reviews, not from train_padded; tokenize and
#pad them exactly like the training texts so both arrays share one width
test_y = pad_sequences(
    tokenizer.texts_to_sequences(test_data['reviewText']),
    padding=pad_type, truncating=trunc_type, maxlen=maxlen
).astype("float64")

print("Testdata:Y")
print(test_y)
print("Testdata:X(Labels)")
print(test_x)

In [None]:
#Summary of what the tokenizer produced for the training set
training_report = [
    ("Word index:\n", word_index),
    ("Training sequences:\n", train_sequences),
    ("Padded training sequences:\n", train_padded),
    ("Padded training shape:", train_padded.shape),
    ("Training sequences data type:", type(train_sequences)),
    ("Padded Training sequences data type:", type(train_padded)),
]
for label, value in training_report:
    print(label, value)

In [None]:
#Encode the test reviews with the tokenizer fitted on the training texts
test_texts = test_data['reviewText']
test_sequences = tokenizer.texts_to_sequences(test_texts)

#Pad/truncate to the training maxlen so both sets share one width
test_padded = pad_sequences(
    test_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

In [None]:
#Show the encoded test set and its padded shape
testing_report = [
    ("Testing sequences:\n", test_sequences),
    ("\nPadded testing sequences:\n", test_padded),
    ("\nPadded testing shape:", test_padded.shape),
]
for label, value in testing_report:
    print(label, value)

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers

In [None]:
#build model
#Create an empty Sequential model and bind it to the module-level name "model"
model = models.Sequential()

In [None]:
#Define the network
#The model is rebuilt from scratch here so that re-running this cell does not
#keep stacking extra layers onto an already populated model
model = models.Sequential([
    # Input layer: one flat vector of 137 values per sample, i.e. the default
    # width produced by vectorize(); (2, 137) would demand 3-D input batches
    layers.Dense(10, activation = "relu", input_shape=(137,)),
    # Hidden layers
    layers.Dropout(0.3, noise_shape=None, seed=None),
    layers.Dense(10, activation = "tanh"),
    # Output layer: sigmoid keeps the prediction inside [0, 1], which the
    # binary_crossentropy loss expects; swish is unbounded
    layers.Dense(1, activation = "sigmoid"),
])
model.summary()

In [None]:
#Configure optimizer, loss and the reported metric before training
model.compile(optimizer="ftrl", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
#Report how many entries each array holds (len() gives the first dimension only)
print("Traindata shape..")
for train_array in (train_x, train_y):
    print(len(train_array))

print("Testdata shape..")
for test_array in (test_x, test_y):
    print(len(test_array))

In [None]:
#Train for 2 epochs in mini-batches of 2, validating on the test arrays
results = model.fit(
    x=train_x,
    y=train_y,
    batch_size=2,
    epochs=2,
    validation_data=(test_x, test_y),
)

In [None]:
#Evaluate silently on the test arrays; index 1 of the returned scores is
#reported as the accuracy (the only metric requested in compile())
scores = model.evaluate(test_x, test_y, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % (accuracy_percent,))