In [1]:
epistemic_verbs = {
    "High Modal Strength": ["bet", "expect", "hope", "know", "mean", "predict", "see", "trust"],
    "Medium Modal Strength": ["assume", "believe", "feel", "find", "guess", "imagine", "presuppose", "presume", "reckon", "suppose", "think", "seem", "appear", "gather", "hypothesize", "take", "understand"],
    "Low Modal Strength": ["doubt", "suspect", "wonder"]
}

epistemic_adjectives = {
    "High Modal Strength": ["sure", "positive"],
    "Medium Modal Strength": ["likely", "probable"],
    "Low Modal Strength": ["doubtful", "possible", "uncertain", "unclear", "unconvinced", "unsure", "unlikely", "improbable"]
}

epistemic_nouns = {
    "High Modal Strength": ["assertion", "belief", "conviction", "fact", "knowledge"],
    "Medium Modal Strength": ["Assumption", "chance", "claim", "hypothesis", "idea", "impression", "feeling", "opinion", "possibility", "suggestion"],
    "Low Modal Strength": ["Doubt"]
}

epistemic_adverbs = {
    "High Modal Strength": ["actually", "assuredly", "certainly", "clearly", "definitely", "indubitably", "ineluctably", "inescapably", "manifestly", "obviously", "really", "surely", "truly", "unarguably", "unavoidably", "undeniably", "undoubtedly", "unquestionably"],
    "Medium Modal Strength": ["apparently", "kind of", "predictably", "probably", "sort of", "supposedly", "allegedly", "reportedly", "evidently"],
    "Low Modal Strength": ["perhaps", "possibly", "conceivably"]
}

In [3]:
epistemic_words = epistemic_verbs["High Modal Strength"] + epistemic_verbs["Medium Modal Strength"] + epistemic_verbs["Low Modal Strength"] + \
                  epistemic_adjectives["High Modal Strength"] + epistemic_adjectives["Medium Modal Strength"] + epistemic_adjectives["Low Modal Strength"] + \
                  epistemic_nouns["High Modal Strength"] + epistemic_nouns["Medium Modal Strength"] + epistemic_nouns["Low Modal Strength"] + \
                  epistemic_adverbs["High Modal Strength"] + epistemic_adverbs["Medium Modal Strength"] + epistemic_adverbs["Low Modal Strength"]

In [6]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import torch

# Load the training data
data = []
with open('sentence-level-certainty.jsonl', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

# Vectorize the text data
vectorizer = TfidfVectorizer(vocabulary=epistemic_words, lowercase=True, stop_words='english')
X = vectorizer.fit_transform(df['finding']).toarray()

# Scale the target variable
y = df['sentence-level-certainty'].values.reshape(-1, 1)
scaler = StandardScaler()
y_scaled = scaler.fit_transform(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train.ravel())

# Validate the model
y_val_pred = model.predict(X_val)
y_val_original = scaler.inverse_transform(y_val)
y_val_pred_original = scaler.inverse_transform(y_val_pred.reshape(-1, 1))
val_mse = mean_squared_error(y_val_original, y_val_pred_original)
print(f'Validation Mean Squared Error: {val_mse}')

# Load the new test data
test_df = pd.read_csv('../scibert-finetuning/data/test_data.csv')

# Vectorize the new test data
X_test_new = vectorizer.transform(test_df['text']).toarray()

# Scale the new test target variable
y_test_new = test_df['assertiveness'].values.reshape(-1, 1)
y_test_new_scaled = scaler.transform(y_test_new)

# Predict on the new test data
y_test_new_pred = model.predict(X_test_new)
y_test_new_pred_original = scaler.inverse_transform(y_test_new_pred.reshape(-1, 1))

# Standardize the assertiveness scores and predictions
assertiveness = torch.tensor(y_test_new.flatten(), dtype=torch.float32)
mean_assert = torch.mean(assertiveness)
std_assert = torch.std(assertiveness)
standardized_assert = (assertiveness - mean_assert) / std_assert

responses = torch.tensor(y_test_new_pred_original.flatten(), dtype=torch.float32)
mean_pred = torch.mean(responses)
std_pred = torch.std(responses)
standardized_pred = (responses - mean_pred) / std_pred

# Calculate MSE loss
mse_loss = torch.nn.MSELoss()
loss = mse_loss(standardized_pred, standardized_assert)
print(f'Test Mean Squared Error on new test data: {loss.item()}')



Validation Mean Squared Error: 1.012815563389228
Test Mean Squared Error on new test data: 1.6976144313812256
