Import all necessary libraries to use BERT

In [None]:
import csv
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

import pandas as pd

# Raise the threshold of TensorFlow's Python logger to ERROR so that
# lower-severity messages are not emitted in the cells below.
tf.get_logger().setLevel('ERROR')

We now need to set up the dataset so that it can be used with a TensorFlow model (in this case, BERT).

In [None]:
# Copy train.tsv to traincleaned.tsv line by line, deleting every double
# quote, line feed and carriage return from each line and then terminating
# the line with a single "\n".
strip_table = str.maketrans('', '', '"\n\r')
with open("/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/train.tsv", "r", encoding="utf-8") as infile:
    with open("/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/traincleaned.tsv", "w", encoding="utf-8") as outfile:
        for line in infile:
            outfile.write(line.translate(strip_table) + "\n")


In [None]:
def parse_tsv_line(line):
    """Split one tab-separated record into a ``(features, label)`` pair.

    The record is decoded as 14 columns: eight text columns (0-7), five
    integer count columns (8-12) and one trailing text column (13).  Missing
    text fields default to ``''`` and missing counts default to ``0``.

    Args:
        line: A single line of the TSV file, as passed by ``Dataset.map``.

    Returns:
        A tuple ``(features, label)`` where

        * ``features["string_features"]`` is the list of columns 2-7
          (statement, topic, speaker, speaker's title, state, party)
          followed by the last column (source);
        * ``features["numeric_features"]`` is the five count columns (8-12)
          combined with ``tf.stack``;
        * ``label`` is column 1.
    """
    # Defaults also fix each column's type: '' -> string, 0 -> integer.
    text_default, count_default = '', 0
    column_defaults = [text_default] * 8 + [count_default] * 5 + [text_default]

    fields = tf.io.decode_csv(
        line,
        record_defaults=column_defaults,
        field_delim='\t',
        use_quote_delim=True,
    )

    label = fields[1]
    text_columns = [*fields[2:8], fields[-1]]  # columns 2-7 plus the final column
    count_columns = fields[8:-1]               # the five integer count columns

    features = {
        "string_features": text_columns,
        "numeric_features": tf.stack(count_columns),
    }
    return features, label

def create_tsv_dataset(file_path, batch_size=32, has_header=False, shuffle_buffer_size=1000):
    """Build a shuffled, batched ``tf.data`` pipeline from a TSV file.

    Args:
        file_path: Path of the tab-separated file to read line by line.
        batch_size: Number of parsed records per batch.
        has_header: Set to True when the first line of the file is a header
            row; that line is then skipped instead of being parsed as data.
            Defaults to False, which reads every line as a record.
        shuffle_buffer_size: Size of the shuffle buffer. Defaults to 1000.

    Returns:
        A dataset yielding batches of ``(features, label)`` as produced by
        ``parse_tsv_line``.
    """
    dataset = tf.data.TextLineDataset(file_path)
    if has_header:
        # Drop the header line before any parsing happens.
        dataset = dataset.skip(1)
    # Parse each line into (features, label)
    dataset = dataset.map(parse_tsv_line)
    # Shuffle, batch, and prefetch for performance
    dataset = (
        dataset
        .shuffle(buffer_size=shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return dataset



In [None]:
# Number of records per batch
batch_size = 32

# Cleaned copy of the training split (quotes and stray line breaks removed)
file_path = "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/traincleaned.tsv"

# Build the input pipeline from the cleaned file
dataset = create_tsv_dataset(file_path, batch_size)

# Pull a single batch and show it as a quick sanity check
for features, labels in dataset.take(1):
    print("Features:", features)
    print("Labels:", labels)


Use pandas for some preliminary data analysis

In [None]:
# Load the raw training TSV file into a pandas DataFrame.
# (pandas is already imported as `pd` in the imports cell at the top.)
file_path = "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/train.tsv"
column_names = [
    "json_file", "label", "statement", "topic", "speaker", "speaker_title",
    "state", "party", "barely_true", "false", "half_true",
    "mostly_true", "pants_on_fire", "source"
]
# This file contains stray double quotes (the cleaning cell above strips them
# for the TensorFlow pipeline).  With pandas' default quoting, an unmatched
# opening quote can make the parser treat the following tabs and newlines as
# part of one quoted field, silently merging records.  QUOTE_NONE treats '"'
# as ordinary text so every line stays exactly one record.
df = pd.read_csv(file_path, sep='\t', names=column_names, quoting=csv.QUOTE_NONE)

# Find the most common party
most_common_party = df['party'].value_counts().idxmax()
# Legacy name: the value is a party, not a label. Kept in case a later cell
# still refers to it.
most_common_label = most_common_party
print("Most common party:", most_common_party)


In [None]:
# Group by state and compute the mean of the 'pants_on_fire' count column.
# NOTE(review): the earlier comment/caption said "per label", but the grouping
# key is 'state'; the text now describes what the code computes. If a
# per-label average was intended, change the groupby key to 'label'.
average_pants_on_fire_per_state = df.groupby('state')['pants_on_fire'].mean()
# Legacy name kept in case a later cell still refers to it.
average_truth_count_5_per_label = average_pants_on_fire_per_state
print("Average pants-on-fire count per state:")
print(average_pants_on_fire_per_state)


In [None]:
# Number of rows for every (label, state) combination, as a flat table
# with columns 'label', 'state' and 'count'.
label_state_sizes = df.groupby(['label', 'state']).size()
state_counts = label_state_sizes.reset_index(name='count')


def _five_largest_counts(group):
    """Return the five rows of ``group`` with the highest 'count' value."""
    return group.nlargest(5, 'count')


# Within each label keep only its five most frequent states, then flatten
# the (label, row) index produced by the grouped apply.
per_label_top = state_counts.groupby('label').apply(_five_largest_counts)
top_5_states_by_label = per_label_top.reset_index(drop=True)

print("Top 5 states by label:")
print(top_5_states_by_label)

Preprocess Data For BERT

In [None]:
# Read train data (pandas is already imported as `pd` in the imports cell).
# QUOTE_NONE keeps stray double quotes in the statements as literal text;
# with default quoting an unmatched quote can swallow the following tabs and
# newlines and merge several records into one row.
train = pd.read_csv(
    "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/train.tsv",
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Drop columns by index: everything except column 1 (truth label) and
# column 2 (statement) is removed.
columns_to_remove = [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  # Indices of the columns to remove
train = train.drop(columns=columns_to_remove)

# assign numbers to truth label
label_column = train.columns[0]
# Define the mapping of strings to numbers (0 = most truthful, 5 = least).
# Values missing from this mapping become NaN after .map().
label_mapping = {
    'true': 0,
    'mostly-true': 1,
    'half-true': 2,
    'barely-true': 3,
    'false': 4,
    'pants-fire': 5
}
train[label_column] = train[label_column].map(label_mapping)

# Save the updated DataFrame back to a TSV file (written relative to the
# current working directory, with a header row made of the column indices).
output_path = 'train_update.tsv'  # Replace with your desired output file path
train.to_csv(output_path, sep='\t', index=False)

In [None]:
# read test data
# QUOTE_NONE keeps stray double quotes in the statements as literal text;
# with default quoting an unmatched quote can swallow the following tabs and
# newlines and merge several records into one row.
test = pd.read_csv(
    "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/test.tsv",
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Drop columns by index: everything except column 1 (truth label) and
# column 2 (statement) is removed.
columns_to_remove = [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  # Indices of the columns to remove
test = test.drop(columns=columns_to_remove)

# assign numbers to truth label
label_column = test.columns[0]
# Define the mapping of strings to numbers (0 = most truthful, 5 = least).
# Values missing from this mapping become NaN after .map().
label_mapping = {
    'true': 0,
    'mostly-true': 1,
    'half-true': 2,
    'barely-true': 3,
    'false': 4,
    'pants-fire': 5
}
test[label_column] = test[label_column].map(label_mapping)

# Save the updated DataFrame back to a TSV file (written relative to the
# current working directory, with a header row made of the column indices).
output_path = 'test_update.tsv'  # Replace with your desired output file path
test.to_csv(output_path, sep='\t', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# The *_update.tsv files are produced by DataFrame.to_csv(..., index=False),
# which writes a header row by default.  Reading them with header=None turned
# that header into a bogus first record and forced the label column to
# strings.  header=0 consumes the header line; names=[0, 1] keeps the integer
# column names (0 = numeric label, 1 = statement) used below.
# NOTE(review): the preprocessing cells write 'train_update.tsv' and
# 'test_update.tsv' relative to the working directory, while these are
# absolute paths - confirm both refer to the same files.
train = pd.read_csv(
    "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/train_update.tsv",
    sep='\t',
    header=0,
    names=[0, 1],
)
test = pd.read_csv(
    "/Users/alejc/onedrive/Desktop/CS159/nlp-final-project/data/liar.data/test_update.tsv",
    sep='\t',
    header=0,
    names=[0, 1],
)


# Create dataframes for train: running id, numeric label, a constant filler
# column 'alpha', and the lower-cased statement with line breaks replaced by
# spaces.
train_bert_df = pd.DataFrame({
    'id': range(len(train)),
    'label': train[0],
    'alpha': ['q']*train.shape[0],
    'text': train[1].replace(r'\n', ' ', regex=True).str.lower()
})

# The test frame carries only an id and the normalised statement text.
test_bert_df = pd.DataFrame({
    'id': range(len(test)),
    'text': test[1].replace(r'\n', ' ', regex=True).str.lower()
})

train_bert_df.to_csv('bert_df_train.tsv', sep='\t', index=False, header=False)
test_bert_df.to_csv('bert_df_test.tsv', sep='\t', index=False, header=False)

# Preview the training frame. This must be the last expression of the cell to
# be rendered; in the middle of the cell its result was silently discarded.
train_bert_df.head()


In [None]:
import subprocess
import sys

# Define file paths
run_classifier_path = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\code\bert-master\run_classifier.py"
bert_config_path = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\code\BERT\bert_config.json"
bert_checkpoint_prefix = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\code\BERT\bert_model.ckpt"
vocab_path = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\code\BERT\vocab.txt"
train_data_path = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\data\liar.data\bert_df_train.tsv"
output_dir = r"C:\Users\alejc\OneDrive\Desktop\CS159\nlp-final-project\code\model_output"

# Define the command as a list of arguments (no shell involved).
# NOTE(review): --data_dir is given the path of a single .tsv file; the BERT
# run_classifier.py script presumably expects a directory holding
# train.tsv / dev.tsv / test.tsv - confirm against the script.
# NOTE(review): the labels prepared earlier take six values (0-5); confirm
# that the "cola" task accepts them or switch to a matching task/processor.
command = [
    "python", run_classifier_path,
    "--task_name=cola",  # Update this to match your task if different
    "--do_train=true",  # Include training step if required
    "--do_predict=true",
    f"--data_dir={train_data_path}",
    f"--vocab_file={vocab_path}",
    f"--bert_config_file={bert_config_path}",
    f"--init_checkpoint={bert_checkpoint_prefix}",
    "--max_seq_length=128",
    f"--output_dir={output_dir}"
]

# Run without check=True so the captured output can be shown first: with
# capture_output=True a failing child's traceback lives only in
# result.stderr, and the CalledProcessError message does not include it.
result = subprocess.run(command, capture_output=True, text=True)
print(result.stdout)  # Print the standard output
if result.returncode != 0:
    # Surface the child's error output so the real cause is visible.
    print(result.stderr, file=sys.stderr)
# Raises subprocess.CalledProcessError on a non-zero exit status, exactly as
# check=True would have done.
result.check_returncode()


CalledProcessError: Command '['python', 'C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\code\\bert-master\\run_classifier.py', '--task_name=cola', '--do_train=true', '--do_predict=true', '--data_dir=C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\data\\liar.data\\bert_df_train.tsv', '--vocab_file=C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\code\\BERT\\vocab.txt', '--bert_config_file=C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\code\\BERT\\bert_config.json', '--init_checkpoint=C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\code\\BERT\\bert_model.ckpt', '--max_seq_length=128', '--output_dir=C:\\Users\\alejc\\OneDrive\\Desktop\\CS159\\nlp-final-project\\code\\model_output']' returned non-zero exit status 1.