In [1]:
import json
import re
import numpy as np
import matplotlib.pyplot as plt

import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords


Using TensorFlow backend.


In [2]:
# Load the labelled examples and keep only the `useful` and `text` columns.
df = pd.read_csv("src/data_sets/training_set.csv")
label = df[['useful', 'text']]
# Peek at the first raw text as a quick sanity check of the load.
print(label['text'].iloc[0])

3) Centra already supports Ethereum, Bitcoin, Centra, Dash, Litecoin, Zcash, and Monero. Nothing special about XRP here. 4) claiming this in anyway as a “pro” for XRP over other cryptos is disingenuous and mostly false.


In [3]:
# Preprocessing
# English stop words from the NLTK corpus, held in a set so that the
# per-word membership checks in format_text are cheap.
STOPWORDS = {word for word in stopwords.words('english')}

def format_text(s, stop_words=None):
    """Normalise one raw text before tokenisation.

    Steps, in order: strip URLs, lower-case, replace every character outside
    ``0-9 a-z space # + _`` with a space, then drop stop words and collapse
    runs of whitespace into single spaces.

    Args:
        s: Raw text (str).
        stop_words: Optional collection of lower-case words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Match case-insensitively: this runs before lower-casing, so links typed
    # as "HTTP://..." would otherwise survive and leak URL fragments as tokens.
    s = re.sub(r"http\S+", "", s, flags=re.IGNORECASE)
    s = re.sub(r"[^0-9a-z #+_]", " ", s.lower())
    return " ".join(word for word in s.split() if word not in stop_words)

# Drop rows without text. The explicit copy makes `label` an independent
# frame, so the column assignments below do not write into a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
label = label[label["text"].notnull()].copy()

# Clean every text (URLs, punctuation, stop words), then keep only the
# word-character runs so leftover symbols such as '#' and '+' are dropped.
label["text"] = label["text"].apply(format_text)
label["text"] = label["text"].apply(lambda text: " ".join(re.findall(r'[\w]+', text)))

# One tuple per row, in the frame's column order.
training = [tuple(row) for row in label.values]

print(training[0])

(0, '3 centra already supports ethereum bitcoin centra dash litecoin zcash monero nothing special xrp 4 claiming anyway pro xrp cryptos disingenuous mostly false')


In [4]:
# Split the training tuples into parallel arrays: element 1 of each tuple is
# the cleaned text (model input), element 0 is its label (target).
train_x = np.asarray([pair[1] for pair in training])
train_y = np.asarray([pair[0] for pair in training])

# Vocabulary cap: only work with the 10000 most popular words in the dataset.
max_words = 10000

# Row positions of each class: labels > 0 are "useful", labels == 0 "useless".
useful_examples_index = np.where(train_y > 0)[0]
useless_examples_index = np.where(train_y == 0)[0]
number_of_useful_examples = useful_examples_index.size
number_of_useless_examples = useless_examples_index.size

print(number_of_useful_examples)
print(number_of_useless_examples)

1128
5370


In [5]:
# Fit a Keras tokenizer on the cleaned texts, capped at `max_words`.
tknzr = Tokenizer(num_words=max_words, lower=True, split=" ")
tknzr.fit_on_texts(train_x)

# The learned vocabulary can be inspected via tknzr.word_index.

# Encode every text as a list of word indices, then collapse repeats so each
# index appears at most once per text.
tokenized_train_x = [
    list(set(sequence)) for sequence in tknzr.texts_to_sequences(train_x)
]

In [6]:
# Binary bag-of-words matrix: one row per text, one column per word index.
training_vectors = np.zeros((len(tokenized_train_x), max_words))
for row, token_ids in enumerate(tokenized_train_x):
    # Flag every word index that occurs in this text.
    training_vectors[row, token_ids] = 1

print(training_vectors[0])

[ 0.  1.  1. ...,  0.  0.  0.]


In [7]:
# Seed NumPy's global RNG so NumPy-driven randomness repeats between runs.
# NOTE(review): only NumPy is seeded here; confirm whether the Keras backend
# needs its own seed for fully repeatable training.
seed = 7
np.random.seed(seed)

# Labels are fed to the model as-is; the one-hot alternative is kept for reference:
# train_y = keras.utils.to_categorical(train_y, 2)

def create_baseline():
    """Build and compile the baseline feed-forward binary classifier.

    Architecture: ``max_words`` inputs -> Dense(256, relu) -> Dense(64, relu)
    -> Dense(1, sigmoid), compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    layers = [
        Dense(256, input_dim=max_words, kernel_initializer='normal', activation='relu'),
        Dense(64, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Alternative evaluation via stratified k-fold cross-validation (disabled):
# estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=1)
# kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
# results = cross_val_score(estimator, training_vectors, train_y, cv=kfold)
# print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


model = create_baseline()

# Train on the first 4500 rows only; the remaining rows are never passed to fit.
# validation_split=0.1 asks Keras to hold out 10% of those rows for validation.
fit_options = dict(
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)
model.fit(training_vectors[:4500], train_y[:4500], **fit_options)

# Persist the architecture (JSON) and the learned weights (HDF5) separately.
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

Train on 4050 samples, validate on 450 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Evaluate on the rows after the first 4500, i.e. data the model was not fit on.

print("Running Model on Test Set")

prediction = model.predict(training_vectors[4500:])
actual_y = train_y[4500:]
total = len(actual_y)

# Non-zero labels count as useful; everything else is spam.
actual_useful = np.count_nonzero(actual_y)
actual_spam = total - actual_useful

print("Number of spam tweets: " + str(actual_spam) + " Number of useful tweets: " + str(actual_useful))

# Turn the model's scores into hard 0/1 decisions. np.rint rounds halves to
# the nearest even value, the same rule as Python's built-in round().
predicted = np.rint(np.asarray(prediction)[:, 0])
is_correct = predicted == actual_y

# Overall hits, plus hits broken down by the predicted class.
correct = int(np.count_nonzero(is_correct))
useful = int(np.count_nonzero(is_correct & (predicted == 1)))
spam = int(np.count_nonzero(is_correct & (predicted == 0)))


def _fraction(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    # Guards against an empty held-out slice or one that lacks a whole class,
    # which would otherwise abort the cell with ZeroDivisionError.
    return numerator / denominator if denominator else float("nan")


print("Accuracy on test set:  " + str(_fraction(correct, total)))
print("Identified " + str(_fraction(useful, actual_useful)) + " of useful tweets")
print("Identified " + str(_fraction(spam, actual_spam)) + " of spam tweets")

Running Model on Test Set
Number of spam tweets: 1636 Number of useful tweets: 362
Accuracy on test set:  0.8638638638638638
Identified 0.5359116022099447 of useful tweets
Identified 0.9364303178484108 of spam tweets
