## Get Data

In [200]:
# from google.colab import drive
# drive.mount('/content/drive')

In [201]:
# !pip install transformers
# !pip install -U tensorflow-text==2.14.0

In [202]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

from matplotlib.pyplot import figure, show
from collections.abc import Sequence

import torch
from transformers import AutoTokenizer, TFBertTokenizer, TFBertModel
import torch


pd.set_option('display.max_columns', None)
save_path = "./toSave/"

In [203]:
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

def xml_to_df(xml):
  xtree = ET.parse(xml)
  xroot = xtree.getroot()

  rows = []

  for node in xroot:
    rows.append(node.attrib)

  return pd.DataFrame(rows)

def decode_zscore_to_original(z_scores, mean, sd):
  return z_scores * sd + mean

# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.values.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

In [204]:
dataset = os.path.join("/content/drive/MyDrive/Final/Shared Final/myPosts.xml")
dataset = xml_to_df(dataset)

In [205]:
df = dataset.drop(columns=['CreationDate',
                            'OwnerUserId',
                            'LastEditorUserId',
                            'LastEditDate',
                            'LastActivityDate',
                            'Title',
                            'Tags',
                            'AnswerCount',
                            'ContentLicense',
                            'ParentId',
                            'ClosedDate',
                            'CommunityOwnedDate',
                            'LastEditorDisplayName',
                            'OwnerDisplayName',
                            'FavoriteCount',
                            'CommentCount',
                            'ViewCount',
                            'AcceptedAnswerId',
                            'Id'
                            ])

In [206]:
answers = df[df.PostTypeId == '2']
answers = answers.drop(columns=['PostTypeId'])
answers["Score"] = pd.to_numeric(answers["Score"])

In [207]:
encode_numeric_zscore(answers, 'Score')

In [208]:
# score = answers['Score']
# answers.drop('Score',axis=1,inplace=True)
answers

Unnamed: 0,Score,Body
2,0.701092,"<p>""Backprop"" is the same as ""backpropagation""..."
6,0.081593,"<p>Noise in the data, to a reasonable amount, ..."
8,0.081593,<p>We typically think of machine learning mode...
9,1.114092,<p>There is no direct way to find the optimal ...
14,-0.537907,<blockquote>\n <p>To put it simply in layman ...
...,...,...
374,-0.847657,"<blockquote>\n <p>""heavier-than-air flying ma..."
376,-0.537907,"<p>Yes, there were successful attempts at pred..."
379,-0.847657,<p>Watson can make its diagnosis based on the ...
381,-0.641157,<p>There are a variety of aspects where AI can...


In [209]:
answers['Score']

2      0.701092
6      0.081593
8      0.081593
9      1.114092
14    -0.537907
         ...   
374   -0.847657
376   -0.537907
379   -0.847657
381   -0.641157
385   -0.537907
Name: Score, Length: 205, dtype: float64

## Load BERT Model
### Code from: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel

In [210]:
# creates a TF compatible version
tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")
# loads the pre-trained BERT model
model = TFBertModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [211]:
train_data, test_data, train_score, test_score = train_test_split(answers['Body'], answers['Score'], test_size=0.2, random_state=42)

In [212]:
# Tokenize the input data
max_length = 64
train_data_tokens = tf_tokenizer(train_data.tolist(), padding='max_length', truncation=True, max_length=max_length)
test_data_tokens = tf_tokenizer(test_data.tolist(), padding='max_length', truncation=True, max_length=max_length)

In [213]:
# # Generate embeddings
# outputs = model(**matrix)
# # Extract embeddings from the output
# last_hidden_states = outputs.last_hidden_state

In [214]:
# Convert your score data to numpy arrays
train_score_np = train_score.to_numpy()
test_score_np = test_score.to_numpy()

In [215]:
# Define regression head
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
bert_output = model(input_ids)[0]  # Take the output embeddings from BERT
dropout = Dropout(0.1)(bert_output)
regression_output = Dense(1, activation='linear')(dropout)

In [216]:
# Combine BERT model and regression head
model = Model(inputs=input_ids, outputs=regression_output)

In [217]:
model.compile(optimizer='adam', loss='mean_squared_error')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

In [218]:
# Display the model summary
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, 64)]              0         
                                                                 
 tf_bert_model_9 (TFBertMod  TFBaseModelOutputWithPo   109482240 
 el)                         olingAndCrossAttentions             
                             (last_hidden_state=(Non             
                             e, 64, 768),                        
                              pooler_output=(None, 7             
                             68),                                
                              past_key_values=None,              
                             hidden_states=None, att             
                             entions=None, cross_att             
                             entions=None)                       
                                                           

In [None]:
model.fit(train_data_tokens, train_score_np, epochs=epochs, batch_size=batch_size, validation_data=(test_data_tokens, test_score_np))

In [220]:
pred = model.predict(test_data_tokens)



In [225]:
pred_flat = pred.flatten()


In [None]:
print("Old Predictions:", pred)

print("Predictions:", pred_flat)
print("True Values:", test_score)


In [227]:
print(pred_flat.shape, test_score_np.shape)

(2624,) (41,)


## Extra

In [224]:
# model = Sequential()

# activation_terms = ['relu', 'tanh', 'sigmoid']
# optimizer_terms = ['adam','sgd']
# for i in activation_terms:
#     for j in optimizer_terms:
#         model.add(Dense(25000, input_dim=X.shape[1], activation=i))
#         model.add(Dense(1000, activation=i))
#         model.add(Dense(1000, activation=i))
#         model.add(Dense(1000, activation=i))
#         model.add(Dense(100, activation=i))
#         model.add(Dense(10, activation=i))
#         model.add(Dense(1))
#         model.compile(loss='mean_squared_error', optimizer=j)
#         monitor = EarlyStopping(monitor='val_loss', min_delta=1e-8, patience=10, verbose=1, mode='auto')
#         checkpointer = ModelCheckpoint(filepath="best_weights/best_weights.hdf5", verbose=0, save_best_only=True)
#         model.fit(  x_train,
#                     y_train,
#                     validation_data=(x_test,y_test),
#                     callbacks=[monitor,checkpointer],
#                     verbose=2,
#                     epochs=500)

#         model.load_weights('best_weights/best_weights.hdf5')
#         pred = model.predict(x_test)
#         score = np.sqrt(metrics.mean_squared_error(pred,y_test))
#         print("Score (RMSE): {}".format(score))

#         model.save('my_model.keras')
#         chart_regression(pred.flatten(),y_test, sort= True)