Import all necessary libraries to use BERT

In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

We now need to set up the dataset so that it can be used with a TensorFlow model (i.e. BERT)

In [2]:
with open("/Users/rorybeals/Desktop/CS159/nlp-final-project-1/data/liar.data/train.tsv", "r") as infile, open("/Users/rorybeals/Desktop/CS159/nlp-final-project-1/data/liar.data/traincleaned.tsv", "w") as outfile:
    for line in infile:
        # Remove problematic quotes and extra line breaks
        clean_line = line.replace('"', '').replace('\n', '').replace('\r', '')
        outfile.write(clean_line + "\n")


In [4]:
def parse_tsv_line(line):
    # Define the column types
    column_defaults = [
    '',  # JSON file name
    '',  # Boolean flag as string
    '',  # Statement
    '',  # Topic
    '',  # Speaker
    '',  # Speaker's title
    '',  # State
    '',  # Party
    0,   # Truth count 1
    0,   # Truth count 2
    0,   # Truth count 3
    0,   # Truth count 4
    0,   # Truth count 5
    ''   # Source
]
    # Decode the line into individual columns
    columns = tf.io.decode_csv(line, record_defaults=column_defaults, field_delim='\t', use_quote_delim=True)
    # Separate features and label
    # Separate features and label (assuming no label for now)
    string_features = columns[2:8] + columns[-1:]  # Select all string columns
    numeric_features = columns[8:-1]  # Select integer columns (truth counts)

    # Combine string and numeric features into a single dictionary
    features = {
        "string_features": string_features,
        "numeric_features": tf.stack(numeric_features)
    }

    # If there's a label column, define it here; otherwise, return features only
    label = columns[1]  # Replace with actual label column index if applicable
    return features, label

def create_tsv_dataset(file_path, batch_size=32):
    # Load the file
    dataset = tf.data.TextLineDataset(file_path)
    # Skip the header if the file has one
    # dataset = dataset.skip(1) <- curret file doesn't have a header but if it did we could skip the top row
    # Parse each line
    dataset = dataset.map(parse_tsv_line)
    # Shuffle, batch, and prefetch for performance
    dataset = dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset



In [5]:
# Path to your .tsv file
file_path = "/Users/rorybeals/Desktop/CS159/nlp-final-project-1/data/liar.data/traincleaned.tsv"

# Create the dataset
batch_size = 32

dataset = create_tsv_dataset(file_path, batch_size)

# Iterate through the dataset
for features, labels in dataset.take(1):
    print("Features:", features)
    print("Labels:", labels)


Features: {'string_features': <tf.Tensor: shape=(32, 7), dtype=string, numpy=
array([[b'On how money from a possible lease of the Ohio Turnpike would be used',
        b'infrastructure,transportation', b'john-kasich',
        b'Governor of Ohio as of Jan. 10, 2011', b'Ohio', b'republican',
        b'an interview'],
       [b'Most of, if not all of, the [DeKalb school construction] projects always came in on or were under budget.',
        b'education', b'thomas-brown', b'', b'Georgia', b'democrat',
        b'an interview'],
       [b'More than 9,000 Rhode Island voters used the Moderate Party master lever mechanism and they didnt have a single Moderate on their ballot.',
        b'elections', b'kenneth-block', b'businessman', b'',
        b'republican', b'a television interview'],
       [b'The hour of sleep you lose during the change to daylight saving time might initially pose some health risks.',
        b'science', b'joel-keehn', b'', b'', b'none', b'a blog'],
       [b'Secretary C

2024-11-27 22:18:40.444018: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Utilize pandas to get some preliminary data analysis

In [27]:
import pandas as pd

# Load the TSV file into a pandas DataFrame
file_path = "/Users/rorybeals/Desktop/CS159/nlp-final-project-1/data/liar.data/train.tsv"
column_names = [
    "json_file", "label", "statement", "topic", "speaker", "speaker_title",
    "state", "party", "barely_true", "false", "half_true",
    "mostly_true", "pants_on_fire", "source"
]
df = pd.read_csv(file_path, sep='\t', names=column_names)

# Find the most common label
most_common_label = df['party'].value_counts().idxmax()
print("Most common label:", most_common_label)


Most common label: republican


In [33]:
# Group by label and compute the mean of truth_count_5
average_truth_count_5_per_label = df.groupby('state')['pants_on_fire'].mean()
print("Average Truth Count 5 per label:")
print(average_truth_count_5_per_label)


AttributeError: 'Series' object has no attribute 'idmax'

In [34]:
# Group by label and state, and count occurrences
state_counts = df.groupby(['label', 'state']).size().reset_index(name='count')

# For each label, find the top 5 states
top_5_states_by_label = state_counts.groupby('label').apply(
    lambda group: group.nlargest(5, 'count')
).reset_index(drop=True)

print("Top 5 states by label:")
print(top_5_states_by_label)

Top 5 states by label:
          label         state  count
0   barely-true         Texas    192
1   barely-true       Florida    188
2   barely-true     Wisconsin    122
3   barely-true      New York     98
4   barely-true      Virginia     68
5         false         Texas    188
6         false     Wisconsin    171
7         false       Florida    167
8         false      New York    166
9         false      Virginia     95
10    half-true       Florida    224
11    half-true         Texas    198
12    half-true     Wisconsin    147
13    half-true      Illinois    137
14    half-true      New York    121
15  mostly-true       Florida    217
16  mostly-true         Texas    182
17  mostly-true      Illinois    149
18  mostly-true     Wisconsin    133
19  mostly-true      New York    118
20   pants-fire         Texas     87
21   pants-fire      New York     66
22   pants-fire     Wisconsin     57
23   pants-fire       Florida     45
24   pants-fire  Rhode Island     30
25         true

  top_5_states_by_label = state_counts.groupby('label').apply(
