In [1]:
epistemic_verbs = {
    "High Modal Strength": ["bet", "expect", "hope", "know", "mean", "predict", "see", "trust"],
    "Medium Modal Strength": ["assume", "believe", "feel", "find", "guess", "imagine", "presuppose", "presume", "reckon", "suppose", "think", "seem", "appear", "gather", "hypothesize", "take", "understand"],
    "Low Modal Strength": ["doubt", "suspect", "wonder"]
}

epistemic_adjectives = {
    "High Modal Strength": ["sure", "positive"],
    "Medium Modal Strength": ["likely", "probable"],
    "Low Modal Strength": ["doubtful", "possible", "uncertain", "unclear", "unconvinced", "unsure", "unlikely", "improbable"]
}

epistemic_nouns = {
    "High Modal Strength": ["assertion", "belief", "conviction", "fact", "knowledge"],
    "Medium Modal Strength": ["Assumption", "chance", "claim", "hypothesis", "idea", "impression", "feeling", "opinion", "possibility", "suggestion"],
    "Low Modal Strength": ["Doubt"]
}

epistemic_adverbs = {
    "High Modal Strength": ["actually", "assuredly", "certainly", "clearly", "definitely", "indubitably", "ineluctably", "inescapably", "manifestly", "obviously", "really", "surely", "truly", "unarguably", "unavoidably", "undeniably", "undoubtedly", "unquestionably"],
    "Medium Modal Strength": ["apparently", "kind of", "predictably", "probably", "sort of", "supposedly", "allegedly", "reportedly", "evidently"],
    "Low Modal Strength": ["perhaps", "possibly", "conceivably"]
}

In [2]:
epistemic_words = epistemic_verbs["High Modal Strength"] + epistemic_verbs["Medium Modal Strength"] + epistemic_verbs["Low Modal Strength"] + \
                  epistemic_adjectives["High Modal Strength"] + epistemic_adjectives["Medium Modal Strength"] + epistemic_adjectives["Low Modal Strength"] + \
                  epistemic_nouns["High Modal Strength"] + epistemic_nouns["Medium Modal Strength"] + epistemic_nouns["Low Modal Strength"] + \
                  epistemic_adverbs["High Modal Strength"] + epistemic_adverbs["Medium Modal Strength"] + epistemic_adverbs["Low Modal Strength"]

In [6]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import torch

# Load the training data
data = []
with open('sentence-level-certainty.jsonl', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

# Vectorize the text data
vectorizer = TfidfVectorizer(vocabulary=epistemic_words, lowercase=True, stop_words='english')
X = vectorizer.fit_transform(df['finding']).toarray()

# Scale the target variable
y = df['sentence-level-certainty'].values.reshape(-1, 1)
scaler = StandardScaler()
y_scaled = scaler.fit_transform(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train.ravel())

# Validate the model
y_val_pred = model.predict(X_val)
y_val_original = scaler.inverse_transform(y_val)
y_val_pred_original = scaler.inverse_transform(y_val_pred.reshape(-1, 1))
val_mse = mean_squared_error(y_val_original, y_val_pred_original)
print(f'Validation Mean Squared Error: {val_mse}')

# Load the new test data
test_df = pd.read_csv('../scibert-finetuning/data/test_data.csv')

# Vectorize the new test data
X_test_new = vectorizer.transform(test_df['text']).toarray()

# Scale the new test target variable
y_test_new = test_df['assertiveness'].values.reshape(-1, 1)
y_test_new_scaled = scaler.transform(y_test_new)

# Predict on the new test data
y_test_new_pred = model.predict(X_test_new)
y_test_new_pred_original = scaler.inverse_transform(y_test_new_pred.reshape(-1, 1))

# Standardize the assertiveness scores and predictions
assertiveness = torch.tensor(y_test_new.flatten(), dtype=torch.float32)
mean_assert = torch.mean(assertiveness)
std_assert = torch.std(assertiveness)
standardized_assert = (assertiveness - mean_assert) / std_assert

responses = torch.tensor(y_test_new_pred_original.flatten(), dtype=torch.float32)
mean_pred = torch.mean(responses)
std_pred = torch.std(responses)
standardized_pred = (responses - mean_pred) / std_pred

# Calculate MSE loss
mse_loss = torch.nn.MSELoss()
loss = mse_loss(standardized_pred, standardized_assert)
print(f'Test Mean Squared Error on new test data: {loss.item()}')



Validation Mean Squared Error: 1.012815563389228
Test Mean Squared Error on new test data: 1.6976144313812256


Leave one out results:

In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import torch
import os

# List of directories
directories = ["Anthropic", "CMV", "GM", "llama3-8b", "Pei"]

# Base path
base_path = "../scibert-finetuning/data"

# Initialize results dictionary
results = {}

# Function to load data from a directory
def load_data(directory):
    train_file = os.path.join(base_path, directory, 'train_data.csv')
    test_file = os.path.join(base_path, directory, 'test_data.csv')
    
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    
    return train_df, test_df

# Load all data
data = {}
for dir in directories:
    train_df, test_df = load_data(dir)
    data[dir] = {'train': train_df, 'test': test_df}

# Perform leave-one-out cross-validation
for leave_out_dir in directories:
    print(f"Training model, leaving out {leave_out_dir}...")
    
    # Combine training data from all other directories
    train_dfs = [data[dir]['train'] for dir in directories if dir != leave_out_dir]
    combined_train_df = pd.concat(train_dfs, ignore_index=True)
    
    # Load the test data for the left-out directory
    test_df = data[leave_out_dir]['test']
    
    # Vectorize the text data
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    X_train = vectorizer.fit_transform(combined_train_df['text']).toarray()
    X_test = vectorizer.transform(test_df['text']).toarray()
    
    # Scale the target variable
    y_train = combined_train_df['assertiveness'].values.reshape(-1, 1)
    y_test = test_df['assertiveness'].values.reshape(-1, 1)
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(y_train)
    y_test_scaled = scaler.transform(y_test)
    
    # Train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train_scaled.ravel())
    
    # Validate the model
    y_test_pred = model.predict(X_test)
    y_test_original = scaler.inverse_transform(y_test)
    y_test_pred_original = scaler.inverse_transform(y_test_pred.reshape(-1, 1))
    val_mse = mean_squared_error(y_test_original, y_test_pred_original)
    print(f'Validation Mean Squared Error for {leave_out_dir}: {val_mse}')
    
    # Standardize the assertiveness scores and predictions
    assertiveness = torch.tensor(y_test.flatten(), dtype=torch.float32)
    

    responses = torch.tensor(y_test_pred_original.flatten(), dtype=torch.float32)
    mean_pred = torch.mean(responses)
    std_pred = torch.std(responses)
    standardized_pred = (responses - mean_pred) / std_pred

    # Calculate MSE loss
    mse_loss = torch.nn.MSELoss()
    loss = mse_loss(standardized_pred, standardized_assert)
    print(f'Test Mean Squared Error on new test data for {leave_out_dir}: {loss.item()}')
    
    # Store results
    results[leave_out_dir] = {
        'val_mse': val_mse,
        'test_mse': loss.item()
    }

# Save results to a JSON file
with open('leave_one_out_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Leave-one-out cross-validation completed. Results saved to leave_one_out_results.json.")

Training model, leaving out Anthropic...
Validation Mean Squared Error for Anthropic: 51.01981633191915
Test Mean Squared Error on new test data for Anthropic: 0.00043058686424046755
Training model, leaving out CMV...
Validation Mean Squared Error for CMV: 46.013193265990985
Test Mean Squared Error on new test data for CMV: 0.00045037560630589724
Training model, leaving out GM...
Validation Mean Squared Error for GM: 49.54983475441144
Test Mean Squared Error on new test data for GM: 0.000416911905631423
Training model, leaving out llama3-8b...
