In [None]:
# Set up environment and list data files
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Directory holding the assignment data files. It can be overridden without
# editing the notebook by setting the COMP90051_DATA_DIR environment variable;
# the original local path stays as the fallback, so behaviour is unchanged when
# the variable is not set.
data_dir = os.environ.get(
    "COMP90051_DATA_DIR",
    r"C:\Users\brian\Documents\UniMelb\Year 2\Semester 1\COMP90051 Statistical Machine Learning\Assignments\Assignment 2\comp-90051-2025-s-1-project-2",
)

# Fail early with an actionable message rather than a bare chdir error.
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Data directory not found: {data_dir!r}. "
        "Set the COMP90051_DATA_DIR environment variable to the folder "
        "that contains the project data files."
    )

# Later cells open the data files by bare filename, so the data directory
# has to become the working directory.
os.chdir(data_dir)

print("Working directory:", os.getcwd())
print("Files in folder:", os.listdir())

In [None]:
# Read the domain-1 training file, parsed as JSON Lines (one record per line)
domain1_path = 'domain1_train_data.json'
df1 = pd.read_json(domain1_path, lines=True)

# Report the frame's dimensions and the number of rows per label
domain1_label_counts = df1['label'].value_counts()
print("Domain1 shape:", df1.shape)
print("Label distribution in domain1:\n", domain1_label_counts)

# Preview the first rows as the cell output
df1.head()

In [None]:
# Read the domain-2 training file, parsed as JSON Lines (one record per line)
domain2_path = 'domain2_train_data.json'
df2 = pd.read_json(domain2_path, lines=True)

# Report the frame's dimensions and the number of rows per label
domain2_label_counts = df2['label'].value_counts()
print("Domain2 shape:", df2.shape)
print("Label distribution in domain2:\n", domain2_label_counts)

# Preview the first rows as the cell output
df2.head()

In [None]:
# Read the example submission CSV to see the layout a submission should follow
sample = pd.read_csv('sample.csv')

# Show its dimensions and the dtype of every column
sample_shape, sample_dtypes = sample.shape, sample.dtypes
print("Sample submission shape:", sample_shape)
print("\nColumn types:\n", sample_dtypes)

# Preview the first rows as the cell output
sample.head()

In [None]:
# Read the test file, parsed as JSON Lines (one record per line)
test_path = 'test_data.json'
df_test = pd.read_json(test_path, lines=True)

# Report its dimensions, then preview the first rows as the cell output
print("Test set shape:", df_test.shape)
df_test.head()

In [None]:
# Stack both domains into one training frame, tagging every row with the
# domain it came from; the index is renumbered from 0 after concatenation
domain_frames = {'domain1': df1, 'domain2': df2}
train = pd.concat(
    [frame.assign(domain=name) for name, frame in domain_frames.items()],
    ignore_index=True,
)

# Summarise the combined frame: size, label balance, rows per domain
label_counts = train['label'].value_counts()
domain_counts = train['domain'].value_counts()
print("Combined train shape:", train.shape)
print("\nOverall label distribution:\n", label_counts)
print("\nDomain breakdown:\n", domain_counts)

# Show five randomly chosen rows as the cell output
train.sample(5)

In [None]:
# EDA
%matplotlib inline

# Store each example's sequence length (number of entries in `text`)
train['length'] = train['text'].map(len)

# Descriptive statistics of the lengths
length_summary = train['length'].describe()
print("Length stats:\n", length_summary)

# Distribution of the lengths as a 50-bin histogram
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(train['length'], bins=50)
ax.set_xlabel('Sequence length (tokens)')
ax.set_ylabel('Count')
ax.set_title('Training Data Sequence Length Distribution')
plt.show()


In [None]:
# Pad/truncate sequences to a fixed length
# MAX_LEN = 90 was chosen by the author to cover ~95% of the data
# (only a few sequences are longer than 90 tokens).
MAX_LEN = 90
# Value appended to short sequences.
# NOTE(review): assumes token ID 0 is not a real token -- confirm against the data.
PAD_VALUE = 0

def pad_truncate(seq, max_len=MAX_LEN, pad_value=PAD_VALUE):
    """Return ``seq`` as a new list of exactly ``max_len`` items.

    Sequences longer than ``max_len`` keep their first ``max_len`` items;
    shorter ones are right-padded with ``pad_value``.

    Args:
        seq: Any iterable of tokens (list, tuple, range, NumPy array, ...).
        max_len: Target length; must be non-negative.
        pad_value: Filler appended to sequences shorter than ``max_len``.

    Returns:
        A new ``list`` of length ``max_len``; the input is never modified.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        # A negative limit would otherwise silently drop items from the END
        # of the sequence via slicing, which is never the intent here.
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Coerce to a list first: `tuple + list` raises TypeError, and
    # `ndarray + list` is element-wise addition rather than concatenation.
    tokens = list(seq)
    if len(tokens) > max_len:
        return tokens[:max_len]
    return tokens + [pad_value] * (max_len - len(tokens))

# Bring every train and test sequence to the fixed length, then stack the
# resulting equal-length lists into 2-D arrays (one row per example)
X_train = np.array([pad_truncate(tokens) for tokens in train['text']])
X_test  = np.array([pad_truncate(tokens) for tokens in df_test['text']])

print("X_train shape:", X_train.shape)
print("X_test  shape:", X_test.shape)


In [None]:
# Prepare labels, split into train/validation, and compute vocabulary size
# Label vector; rows line up with X_train because both are built from `train`
y = train['label'].values

# Stratified split: 80% train, 20% validation, fixed seed for repeatability
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y, test_size=0.2, stratify=y, random_state=42
)

# Vocab size: highest token ID + 1. The maximum is taken over BOTH the train
# and test matrices, so anything sized by vocab_size (e.g. an embedding or
# count table) can also be indexed by test tokens that never occur in train.
highest_token_id = max(X_train.max(), X_test.max())
vocab_size = int(highest_token_id + 1)

print("Training set:", X_train_split.shape, y_train_split.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Vocabulary size:", vocab_size)


In [None]:
# Convert token sequences into space-delimited strings for scikit-learn:
# every token is rendered with str() and the pieces are joined by one space
for frame in (train, df_test):
    frame['text_str'] = frame['text'].map(
        lambda tokens: ' '.join(str(token) for token in tokens)
    )

# Inspect a few converted examples
train[['id', 'label', 'domain', 'text_str']].head()