<a href="https://colab.research.google.com/github/ChrisNiekler/NASDAQ-Dataset/blob/master/bert/BERT_with_Tensorboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# "transformers" muss installiert werden, bevor man es nutzen kann.

In [None]:
!pip install transformers

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Notwendige Libraries importieren

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import AlbertTokenizer, AlbertForTokenClassification

# BERT sollte auf einer GPU laufen, da es auf einer CPU deutlich mehr Zeit benötigt.

In [None]:
# Abort early when TensorFlow cannot see a GPU.
# The former `%tensorflow_version 2.x` magic was dropped: TensorFlow is already
# imported in the imports cell, so the magic could no longer select a version,
# and it is not a known magic outside of Colab.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

## Lade das vortrainierte BERT Model und Tokenizer
`bert-base-uncased` ist das Standard-Dictionary von Google. Dieses Notebook lädt `bert-large-uncased`, das eine höhere Performance bietet, aber mehr Rechenleistung benötigt.

In [None]:
# Load the tokenizer and the pretrained sequence-classification model from the
# same checkpoint; the classification head is configured for 3 labels.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

model = TFBertForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=3,
)

In [None]:
# Print the layer and parameter summary of the loaded model.
model.summary()

## Laden der reinen Daten
BERT funktioniert ohne Tagging

In [None]:
# Put the path or URL of Tweets.csv here.
url = 'https://raw.githubusercontent.com/ChrisNiekler/NASDAQ-Dataset/master/Tweets.csv'
data = pd.read_csv(url)
# Preview the first five rows.
data.head()

## Laden der bereinigten Daten
# Prüfen, ob sich Null-Werte in den Daten befinden

In [None]:
# Number of rows whose tweet text is missing.
data['Tweets'].isna().sum()

In [None]:
# Number of rows whose sentiment label is missing.
data['Sentiment'].isna().sum()

# Null-Werte durch neutrale Werte ersetzen
(Löschen wäre eine Alternative)

In [None]:
# Handle missing values column by column instead of filling the whole frame:
# - rows without tweet text are dropped, because filling them with the float 1.0
#   would later hand a non-string value to the tokenizer;
# - missing sentiment labels are replaced by 1.0, the value this notebook uses
#   as the neutral fill-in.
data = data.dropna(subset=['Tweets']).fillna({'Sentiment': 1.0})

In [None]:
# Re-check: missing tweet texts remaining after the clean-up step.
data['Tweets'].isna().sum()

In [None]:
# Re-check: missing sentiment labels remaining after the clean-up step.
data['Sentiment'].isna().sum()

# Aufteilen der Daten in Feature und Label

In [None]:
# Feature column (tweet text) and label column (sentiment value).
x, y = data['Tweets'], data['Sentiment']

In [None]:
# Hold out 30 % of the rows for validation/testing.
# A fixed random_state makes the split identical on every Restart & Run All.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Plain Python list of the held-out tweet texts (used for prediction later).
test_list = test_x.tolist()

In [None]:
# Plain Python list of the held-out sentiment values, aligned with test_list.
test_list_2 = test_y.tolist()

In [None]:
# Re-assemble the split Series into DataFrames with the column names
# ('Tweet', 'Sentiment') that the example-conversion step reads.
train_da = {'Tweet': train_x, 'Sentiment': train_y}
train_data = pd.DataFrame(train_da)
# The held-out split needs the same layout: `test_data` is consumed further
# down when the InputExamples are built and was previously never defined.
test_data = pd.DataFrame({'Tweet': test_x, 'Sentiment': test_y})
# Print the training frame.
print(train_data)

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Turn the rows of two DataFrames into transformers `InputExample` objects.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the validation rows.
    DATA_COLUMN: name of the column whose value becomes `text_a`.
    LABEL_COLUMN: name of the column whose value becomes `label`.

  Returns:
    A tuple `(train_InputExamples, validation_InputExamples)`; each element is
    the result of a row-wise `DataFrame.apply`, with one `InputExample` per row.
  """
  def row_to_example(row):
    # guid is only a bookkeeping ID and is unused here; text_b stays empty
    # because each example consists of a single text.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  train_InputExamples = train.apply(row_to_example, axis=1)
  validation_InputExamples = test.apply(row_to_example, axis=1)

  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as an (unbatched) `tf.data.Dataset`.

    Args:
        examples: iterable of objects providing `text_a` and `label` attributes.
        tokenizer: tokenizer object providing `encode_plus`.
        max_length: length every sequence is truncated and padded to.

    Returns:
        A `tf.data.Dataset` built from a generator. Each element is a tuple
        `(inputs, label)` where `inputs` is a dict with the int32 vectors
        "input_ids", "attention_mask" and "token_type_ids", and `label` is an
        int64 scalar.
    """
    features = []  # InputFeatures, converted to dataset elements below
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # upper bound used for truncation and padding
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding="max_length",
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (inputs, label) pair per tokenized example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
# Column names of the text and label columns in the train/test DataFrames.
DATA_COLUMN, LABEL_COLUMN = 'Tweet', 'Sentiment'

In [None]:
# Convert both DataFrames into InputExample collections.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train=train_data,
    test=test_data,
    DATA_COLUMN=DATA_COLUMN,
    LABEL_COLUMN=LABEL_COLUMN,
)


In [None]:
# Tokenize the training examples and wrap them in a tf.data.Dataset.
training_data = convert_examples_to_tf_dataset(
    examples=list(train_InputExamples),
    tokenizer=tokenizer,
)

In [None]:
# Input pipeline for training: shuffle with a buffer of 100 elements,
# group into batches of 16, then iterate over the batched data twice.
# NOTE(review): this cell reassigns `training_data` from itself, so running it
# more than once stacks the transformations.
training_data = training_data.shuffle(100)
training_data = training_data.batch(16)
training_data = training_data.repeat(2)

In [None]:
# Tokenize the validation examples and wrap them in a tf.data.Dataset.
validation_data = convert_examples_to_tf_dataset(
    examples=list(validation_InputExamples),
    tokenizer=tokenizer,
)

In [None]:
# Validation runs on batches of 16 (no shuffling, no repetition).
batch_size = 16
validation_data = validation_data.batch(batch_size)

In [None]:
%load_ext tensorboard
log_folder = 'logs'

In [None]:
from tensorflow.keras.callbacks import TensorBoard

# Settings of the TensorBoard callback: logs are written to `log_folder`,
# metrics are updated once per epoch, histograms and embeddings are logged
# every epoch, and batch 2 is profiled.
tensorboard_options = dict(
    log_dir=log_folder,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
    update_freq='epoch',
    profile_batch=2,
    embeddings_freq=1,
)
callbacks = [TensorBoard(**tensorboard_options)]

# Training

In [None]:

# Optimizer, loss and metric for fine-tuning. The loss is configured with
# from_logits=True, i.e. the model output is treated as unnormalized logits.
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=adam, loss=loss_fn, metrics=[accuracy_metric])

# Train for two epochs, validating after each one and logging to TensorBoard.
model.fit(training_data, epochs=2, validation_data=validation_data, callbacks=callbacks)

In [None]:
# save tf format, make sure no errors while saving and calling for prdection
model.save('/content/BERT_MODEL/Bert_trained_tweets',save_format='tf')

In [None]:
%tensorboard --logdir={"logs"}

In [None]:
#to predict 
tf_batch = tokenizer(test_list[:30], max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = [ '0.0 Nature','1.0 Positive','2.0 Negative']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
#test_list is containing the tweets, test_list_2 containing the sentemint 
for i in range(len(test_list[:30])):
  print(test_list[i], ": \n predicted Value is: ", labels[label[i]], ', Original value is : ', test_list_2[i])

In [None]:
# mount it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#copy it there
!cp -r BERT_MODEL/Bert_trained_tweets/* /content/drive/MyDrive/BERT_MODEL