## Get Data

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
# import sklearn.feature_extraction.text as sk_text
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.models import load_model
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.layers import Dense
# from sklearn import metrics

import torch
# !pip install transformers
from transformers import AutoTokenizer, TFBertTokenizer, BertModel
import torch


# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [3]:
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column ``name`` of ``df`` with its z-score, in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        name: Label of the numeric column to standardize.
        mean: Value to subtract. Defaults to the column's own mean.
        sd: Value to divide by. Defaults to the column's own standard
            deviation as returned by ``Series.std()``.

    Returns:
        None. The standardized values overwrite ``df[name]``.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    scale = column.std() if sd is None else sd
    df[name] = (column - center) / scale

def chart_regression(pred, y, sort=True):
    """Plot predicted and expected regression outputs as two line series.

    Args:
        pred: Predicted values; any array-like (list, ndarray, Series).
            Inputs with extra singleton dimensions such as ``(n, 1)`` are
            flattened to one dimension.
        y: Expected values with the same number of elements as ``pred``.
        sort: When True (default), order the points by expected value so
            the expected curve is monotonically increasing.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # np.asarray(...).ravel() accepts lists, Series and (n, 1) arrays alike;
    # the ndarray-only ``.flatten()`` method does not exist on a pandas Series.
    frame = pd.DataFrame({
        'pred': np.asarray(pred).ravel(),
        'y': np.asarray(y).ravel(),
    })
    if sort:
        frame = frame.sort_values(by='y')
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    One indicator column named ``"<name>-<value>"`` is added for every
    distinct value found in the column, then the original column is removed.

    Args:
        df: DataFrame to modify in place.
        name: Label of the categorical column to expand.

    Returns:
        None.
    """
    indicators = pd.get_dummies(df[name])
    for value, indicator in indicators.items():
        df["{}-{}".format(name, value)] = indicator
    df.drop(columns=name, inplace=True)

def xml_to_df(xml):
    """Load an XML document into a DataFrame, one row per child of the root.

    Each direct child element of the document root contributes one row whose
    columns are that element's XML attributes; element text and nested
    children are not read.

    Args:
        xml: Path or file object accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame built from the children's attribute dictionaries.
    """
    root = ET.parse(xml).getroot()
    return pd.DataFrame([child.attrib for child in root])

In [4]:
# Location of the posts XML file on the mounted Drive. The path gets its own
# name so that `dataset` only ever refers to the loaded DataFrame.
DATA_DIR = "/content/drive/MyDrive/Final/Shared Final"
dataset_path = os.path.join(DATA_DIR, "myPosts.xml")

# One row per element under the XML root; attribute values are loaded as strings.
dataset = xml_to_df(dataset_path)

In [5]:
# Columns removed from the raw posts frame; whatever remains is carried forward.
UNUSED_COLUMNS = [
    'CreationDate',
    'OwnerUserId',
    'LastEditorUserId',
    'LastEditDate',
    'LastActivityDate',
    'Title',
    'Tags',
    'AnswerCount',
    'ContentLicense',
    'ParentId',
    'ClosedDate',
    'CommunityOwnedDate',
    'LastEditorDisplayName',
    'OwnerDisplayName',
    'FavoriteCount',
    'CommentCount',
    'ViewCount',
    'AcceptedAnswerId',
    'Id',
]
df = dataset.drop(columns=UNUSED_COLUMNS)

In [6]:
# Keep only the rows whose PostTypeId is the string '2', drop the now-constant
# type column, and parse Score from text into a numeric dtype.
answers = (
    df.loc[df['PostTypeId'] == '2']
    .drop(columns=['PostTypeId'])
    .assign(Score=lambda frame: pd.to_numeric(frame['Score']))
)

In [7]:
# Standardize Score in place using the column's own mean and standard deviation.
encode_numeric_zscore(df=answers, name='Score')

In [8]:
# Display the answers frame: z-scored Score next to the raw Body text.
answers

Unnamed: 0,Score,Body
2,0.701092,"<p>""Backprop"" is the same as ""backpropagation""..."
6,0.081593,"<p>Noise in the data, to a reasonable amount, ..."
8,0.081593,<p>We typically think of machine learning mode...
9,1.114092,<p>There is no direct way to find the optimal ...
14,-0.537907,<blockquote>\n <p>To put it simply in layman ...
...,...,...
374,-0.847657,"<blockquote>\n <p>""heavier-than-air flying ma..."
376,-0.537907,"<p>Yes, there were successful attempts at pred..."
379,-0.847657,<p>Watson can make its diagnosis based on the ...
381,-0.641157,<p>There are a variety of aspects where AI can...


## Load BERT Model
### Code from: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel

In [9]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model.
BERT_CHECKPOINT = "bert-base-uncased"

# Load the pre-trained BERT tokenizer (text is converted to lower case).
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
# Build a TensorFlow-compatible tokenizer from the one above.
tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
# Load the pre-trained BERT model.
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [10]:
# Tokenize every answer body in one call. Sequences are truncated to at most
# max_length tokens and padded to a common length, giving PyTorch tensors.
tokenized_inputs = tokenizer(
    answers['Body'].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64
)

# BERT serves as a frozen feature extractor here (the only trainable layer in
# this notebook is the linear head), so run it without autograd. Recording the
# graph for the whole dataset in a single batch would keep every intermediate
# activation in memory for no benefit.
with torch.no_grad():
    outputs = model(**tokenized_inputs)
last_hidden_states = outputs.last_hidden_state

## Prepare data for training

In [None]:
# Mask-aware mean pooling: average the token embeddings over real tokens only.
# A plain mean(dim=1) would also average the padding positions added by
# padding=True, skewing the embedding of every answer shorter than the
# longest sequence in the batch.
token_mask = tokenized_inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
# clamp(min=1) guards the division should a row ever contain no real tokens.
pooled_output = (last_hidden_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [None]:
# Linear head mapping each pooled embedding to the predicted score.
num_classes = 1  # one output unit, because the target is a single score
hidden_size = pooled_output.shape[-1]
classification_head = torch.nn.Linear(in_features=hidden_size, out_features=num_classes)

In [None]:
# Ground-truth targets: the z-scored answer scores. Convert through NumPy,
# because building a tensor directly from a Series whose index has gaps (rows
# were filtered earlier) is unreliable.
target_scores = torch.tensor(answers['Score'].to_numpy(), dtype=torch.float32)

# Regression loss (mean squared error).
loss_fn = torch.nn.MSELoss()

# Optimizer over the head's parameters only; BERT itself is not updated.
LEARNING_RATE = 1e-3
optimizer = torch.optim.Adam(classification_head.parameters(), lr=LEARNING_RATE)

# Forward pass. The pooled embeddings are detached so gradients stop at the
# head, and squeeze(-1) removes only the output dimension, so the batch
# dimension survives even for a single-row batch.
logits = classification_head(pooled_output.detach())
loss = loss_fn(logits.squeeze(-1), target_scores)

# Backward pass and one optimization step on the head's weights.
# Gradients accumulate across backward() calls, so clear them first.
optimizer.zero_grad()
loss.backward()
optimizer.step()