In [27]:
# Set up environment and list data files
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

## Load domain 1 data

In [28]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df1 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', lines=True)
print("Domain1 shape:", df1.shape)
print("Label distribution in domain1:\n", df1['label'].value_counts())
df1.head()


Domain1 shape: (1000, 3)
Label distribution in domain1:
 label
0    500
1    500
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...",0,0
1,"[222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...",0,1
2,"[736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...",0,2
3,"[48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...",0,3
4,"[2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...",0,4


## Load domain 2 data 

In [29]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df2 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', lines=True)
print("Domain1 shape:", df2.shape)
print("Label distribution in domain1:\n", df2['label'].value_counts())
df2.head()

Domain1 shape: (5000, 3)
Label distribution in domain1:
 label
1    4750
0     250
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[22, 6065, 76, 119, 13027, 575, 219, 22, 2435,...",0,0
1,"[1275, 1509, 12, 6113, 6287, 327, 411, 1139, 2...",0,1
2,"[575, 2962, 529, 4624, 39, 279, 1012, 277, 76,...",0,2
3,"[12, 6113, 2428, 69, 375, 1025, 2605, 76, 101,...",0,3
4,"[529, 76, 1509, 861, 1, 645, 1, 5013, 237, 3, ...",0,4


## Load the test dataset 


In [30]:
# Load test JSON into a DataFrame and inspect
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
print("Test set shape:", df_test.shape)
df_test.head()

Test set shape: (4000, 2)


Unnamed: 0,text,id
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0
1,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1
2,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2
3,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3
4,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4


## Combine domain 1 and domina 2


In [31]:
# Combine domain1 and domain2 into a single training DataFrame and inspect
train = pd.concat([
    df1.assign(domain='domain1'),
    df2.assign(domain='domain2')
], ignore_index=True)

print("Combined train shape:", train.shape)
print("\nOverall label distribution:\n", train['label'].value_counts())
print("\nDomain breakdown:\n", train['domain'].value_counts())
train.sample(5)

Combined train shape: (6000, 4)

Overall label distribution:
 label
1    5250
0     750
Name: count, dtype: int64

Domain breakdown:
 domain
domain2    5000
domain1    1000
Name: count, dtype: int64


Unnamed: 0,text,label,id,domain
4403,"[199, 22, 2085, 139, 12, 773, 76, 235, 4749, 2...",1,3403,domain2
2792,"[139, 93, 3062, 1, 231, 388, 69, 12, 3830, 101...",1,1792,domain2
5412,"[9327, 180, 469, 1125, 677, 2158, 1679, 105, 1...",1,4412,domain2
2619,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...",1,1619,domain2
786,"[324, 151, 285, 241, 4, 7181, 664, 39, 831, 32...",1,786,domain1


## Method 1: Bag-of-Words + TF–IDF Baseline with Class-Weighted Logistic Regression

In [32]:
## Prepare text strings for TF-IDF input

In [33]:
# Convert token sequences into strings
train['text_str'] = train['text'].apply(lambda seq: ' '.join(map(str, seq)))
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

train[['id', 'label', 'domain', 'text_str']].head() # inspect examples

Unnamed: 0,id,label,domain,text_str
0,0,0,domain1,6 22 34 76 501 977 1 2514 13623 76 31 2085 277...
1,1,0,domain1,222 31 4108 104 132 361 39 2305 12 936 1287 66...
2,2,0,domain1,736 7194 113 12 366 2870 123 101 12 230 403 51...
3,3,0,domain1,48 1 2025 69 361 533 327 237 4150 13 22 2128 1...
4,4,0,domain1,2973 66 1 1493 260 2740 50 1027 50 1 3289 69 5...


In [34]:
# Split the combined DataFrame into df_train and df_val
df_train, df_val = train_test_split(train, test_size=0.2, stratify = train['label'], random_state=42)

# Extract labels
y_train = df_train['label'].values
y_val   = df_val['label'].values

# Fit TF-IDF on the training text_str
vectorizer = TfidfVectorizer(max_features=15000)
X_train_tf = vectorizer.fit_transform(df_train['text_str'])
X_val_tf   = vectorizer.transform(df_val['text_str'])
X_test_tf  = vectorizer.transform(df_test['text_str'])



## Evulauate log-regression model baseline

In [35]:
# Train and evaluate a class-weighted Logistic Regression baseline
clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Fit on training TF-IDF features
clf.fit(X_train_tf, y_train)

# Predict on validation set
y_pred = clf.predict(X_val_tf)

# Evaluate
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred, digits=4))


Validation Accuracy: 0.9250

Classification Report:
              precision    recall  f1-score   support

           0     0.6546    0.8467    0.7384       150
           1     0.9771    0.9362    0.9562      1050

    accuracy                         0.9250      1200
   macro avg     0.8159    0.8914    0.8473      1200
weighted avg     0.9368    0.9250    0.9290      1200



## 5-Fold cross-validation of TF–IDF + logistic regression baseline

In [36]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=35000)),
    ('clf',  LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

# Stratified 5-fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    train['text_str'],
    train['label'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print("CV accuracies per fold:", scores)
print(f"Mean CV accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

CV accuracies per fold: [0.90583333 0.925      0.92       0.91333333 0.92916667]
Mean CV accuracy: 0.9187 ± 0.0083


## Hyperparameter Tuning: GridSearchCV on TF–IDF + LR

In [37]:
# # param_grid for GridSearchCV
# param_grid = {
#     'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1,3), (2,3), (3,3), (1,4)], # define the n-gram range
#     'clf__C': [0.05, 0.1, 2, 3, 8],
#     'clf__penalty': ['l1', 'l2', None], # l1 is for Lasso, l2 is for Ridge
#     'tfidf__max_df': [0.5, 1.0], # max_df is the maximum document frequency
#     'tfidf__min_df': [2, 5], # min_df is the minimum document frequency
# }

# param_grid for RandomizedSearchCV
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3), (1, 4), (2, 4), (3, 4)],
    'tfidf__max_df': uniform(loc=0.5, scale=1),  # samples from 0.5 to 1.5
    'tfidf__min_df': [1, 2, 3, 5],
    'clf__C': uniform(loc=0.01, scale=10),  # continuous range for C
    'clf__penalty': ['l1', 'l2', None],
} 


# # Perform grid search with GridSearchCV
# grid_search = GridSearchCV(
#     estimator= pipeline,
#     param_grid= param_grid,
#     cv= cv,
#     scoring= 'accuracy',
#     n_jobs= -1,
#     verbose= 1
# )

# Perform grid search with RandomizedSearchCV
grid_search = RandomizedSearchCV(
    estimator= pipeline,
    param_distributions= param_grid,
    n_iter = 30,
    cv= cv,
    scoring= 'accuracy',
    n_jobs= -1,
    verbose= 1
)

# # why use tarin['text_str'] and train['label]
# grid_search.fit(train['text_str'], train['label']) 

# # Perform grid search with GridSearchCV
# grid_search.fit(df_train['text_str'], df_train['label'])


# Perform grid search with RandomizedSearchCV
grid_search.fit(df_train['text_str'], df_train['label'])

print("Best CV Accuracy: {:.4f}".format(grid_search.best_score_))
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


85 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/

Best CV Accuracy: 0.9633
Best Parameters: {'clf__C': 9.349625001373763, 'clf__penalty': None, 'tfidf__max_df': 0.7501309216972853, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}




### Re-evaluate model after hyperparameter tuning

In [38]:
# Evaluate tuned model on hold-out validation set
best_pipe = grid_search.best_estimator_
X_val_tuned = best_pipe.named_steps['tfidf'].transform(df_val['text_str'])
y_val_pred = best_pipe.named_steps['clf'].predict(X_val_tuned)

print("Hold-out Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=4))

Hold-out Validation Accuracy: 0.97
              precision    recall  f1-score   support

           0     0.9318    0.8200    0.8723       150
           1     0.9747    0.9914    0.9830      1050

    accuracy                         0.9700      1200
   macro avg     0.9533    0.9057    0.9277      1200
weighted avg     0.9694    0.9700    0.9692      1200



In [39]:
### Threshold tuning for tuned TF–IDF + Logistic Regression

In [40]:
best_pipe = grid_search.best_estimator_

prob_human = best_pipe.predict_proba(df_val['text_str'])[:, 0]

thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision (human) | Recall (human) | F1 (human)")
for t in thresholds:
    preds = np.where(prob_human >= t, 0, 1)
    # there is a higher chance of beinf a human (0) if prob_human >= t
    prec = precision_score(y_val, preds, pos_label=0)
    rec  = recall_score(y_val, preds, pos_label=0)
    f1   = f1_score(y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}         |   {rec:.4f}     |  {f1:.4f}")

Threshold | Precision (human) | Recall (human) | F1 (human)
  0.10    |   0.7414         |   0.8600     |  0.7963
  0.15    |   0.7544         |   0.8600     |  0.8037
  0.20    |   0.7619         |   0.8533     |  0.8050
  0.25    |   0.7619         |   0.8533     |  0.8050
  0.30    |   0.9137         |   0.8467     |  0.8789
  0.35    |   0.9130         |   0.8400     |  0.8750
  0.40    |   0.9197         |   0.8400     |  0.8780
  0.45    |   0.9191         |   0.8333     |  0.8741
  0.50    |   0.9318         |   0.8200     |  0.8723
  0.55    |   0.9313         |   0.8133     |  0.8683
  0.60    |   0.9313         |   0.8133     |  0.8683
  0.65    |   0.9302         |   0.8000     |  0.8602
  0.70    |   0.9302         |   0.8000     |  0.8602
  0.75    |   0.9370         |   0.7933     |  0.8592
  0.80    |   0.9435         |   0.7800     |  0.8540
  0.85    |   0.9583         |   0.7667     |  0.8519
  0.90    |   0.9576         |   0.7533     |  0.8433


In [41]:
best_ngram = grid_search.best_params_['tfidf__ngram_range']
print(f"Best_ngrom: {best_ngram}")

best_C = grid_search.best_params_['clf__C']
print(f"Best_C: {best_C}")

best_clf_penalty = grid_search.best_params_['clf__penalty']
print(f"Best_clf_penalty: {best_clf_penalty}")

# Train a new pipeline with the best parameters
pipeline_tuned = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 32000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C=best_C, penalty=best_clf_penalty, max_iter=1000, random_state=42))
])

# Fit the tuned pipeline on the training set
pipeline_tuned.fit(df_train['text_str'], df_train['label'])

# Predict on the validation set
prob_human = pipeline_tuned.predict_proba(df_val['text_str'])[:, 0]

threshold = 0.70
preds_val = np.where(prob_human >= threshold, 0, 1)

# Evaluate
print(f"Validation Accuracy (threshold = {threshold:.2f}): {accuracy_score(y_val, preds_val):.4f}\n")
print("Classification Report:")
print(classification_report(y_val, preds_val, digits=4))

Best_ngrom: (1, 3)
Best_C: 9.349625001373763
Best_clf_penalty: None
Validation Accuracy (threshold = 0.70): 0.9658

Classification Report:
              precision    recall  f1-score   support

           0     0.9431    0.7733    0.8498       150
           1     0.9684    0.9933    0.9807      1050

    accuracy                         0.9658      1200
   macro avg     0.9558    0.8833    0.9153      1200
weighted avg     0.9653    0.9658    0.9644      1200





## Predict y based on the test file data and export as a csv

In [42]:
# prob_human_test = pipeline_tuned.predict_proba(df_test['text_str'])[:, 0]
# threshold_final = threshold
# preds_test = np.where(prob_human_test >= threshold, 0, 1)

# submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
# submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/tfidf2.csv', index=False)
# submission.head(15)

## Method 2: SMOTE on Domain 2’s Human Class

In [43]:
from scipy.sparse import vstack, csr_matrix
from imblearn.over_sampling import SMOTE

In [44]:
# Extract TF–IDF vectorizer and optimal C from the tuned pipeline
vectorizer = best_pipe.named_steps['tfidf']
C_opt = best_pipe.named_steps['clf'].C

# Transform train/validation into feature matrices
X_train_all = vectorizer.transform(df_train['text_str'])
X_val_tf    = vectorizer.transform(df_val['text_str'])
y_train_all = df_train['label'].values

# Select Domain 2 subset
mask_dom2 = df_train['domain'] == 'domain2'
X_dom2    = X_train_all[mask_dom2].toarray()
y_dom2    = y_train_all[mask_dom2]

# Apply SMOTE on Domain 2 human class (label 0)
smote = SMOTE(random_state=42)
X_dom2_res, y_dom2_res = smote.fit_resample(X_dom2, y_dom2)

# Re-combine with Domain 1 unchanged
mask_dom1    = ~mask_dom2
X_dom1       = X_train_all[mask_dom1]
y_dom1       = y_train_all[mask_dom1]
X_dom2_res_sp = csr_matrix(X_dom2_res)

X_res = vstack([X_dom1, X_dom2_res_sp])
y_res = np.concatenate([y_dom1, y_dom2_res])

# Retrain Logistic Regression on SMOTE-augmented data
clf_smote = LogisticRegression(class_weight='balanced',
                               C=C_opt,
                               max_iter=1000,
                               random_state=42)
clf_smote.fit(X_res, y_res)

# Predict on validation using threshold = 0.70
prob_hum_smote = clf_smote.predict_proba(X_val_tf)[:, 0]
preds_smote    = np.where(prob_hum_smote >= 0.70, 0, 1)

# Evaluate
print("SMOTE (Domain2 Human) Validation Accuracy:", accuracy_score(y_val, preds_smote))
print("\nClassification Report:")
print(classification_report(y_val, preds_smote, digits=4))

SMOTE (Domain2 Human) Validation Accuracy: 0.9325

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.4600    0.6301       150
           1     0.9284    1.0000    0.9629      1050

    accuracy                         0.9325      1200
   macro avg     0.9642    0.7300    0.7965      1200
weighted avg     0.9373    0.9325    0.9213      1200



##### Threshold tuning

In [45]:
X_val_tf = vectorizer.transform(df_val['text_str'])

# Get human-written probabilities from your SMOTE-trained classifier
prob_human_smote = clf_smote.predict_proba(X_val_tf)[:, 0]

# Sweep thresholds and report metrics for the human class
thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision(0) | Recall(0) | F1(0)")
for t in thresholds:
    preds = np.where(prob_human_smote >= t, 0, 1)
    prec  = precision_score(y_val, preds, pos_label=0)
    rec   = recall_score(y_val, preds, pos_label=0)
    f1    = f1_score(y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |  {prec:.4f}   |   {rec:.4f}  |  {f1:.4f}")

Threshold | Precision(0) | Recall(0) | F1(0)
  0.10    |  0.5526   |   0.9800  |  0.7067
  0.15    |  0.6327   |   0.9533  |  0.7606
  0.20    |  0.7136   |   0.9467  |  0.8138
  0.25    |  0.7446   |   0.9133  |  0.8204
  0.30    |  0.7644   |   0.8867  |  0.8210
  0.35    |  0.9270   |   0.8467  |  0.8850
  0.40    |  0.9496   |   0.7533  |  0.8401
  0.45    |  0.9545   |   0.7000  |  0.8077
  0.50    |  0.9608   |   0.6533  |  0.7778
  0.55    |  0.9892   |   0.6133  |  0.7572
  0.60    |  0.9888   |   0.5867  |  0.7364
  0.65    |  1.0000   |   0.5133  |  0.6784
  0.70    |  1.0000   |   0.4600  |  0.6301
  0.75    |  1.0000   |   0.4133  |  0.5849
  0.80    |  1.0000   |   0.3733  |  0.5437
  0.85    |  1.0000   |   0.3333  |  0.5000
  0.90    |  1.0000   |   0.3067  |  0.4694


In [46]:
threshold = 0.35
preds = np.where(prob_human_smote >= threshold, 0, 1)

print(f"SMOTE Model Accuracy (threshold = {threshold:.2f}):",
      accuracy_score(y_val, preds))
print("\nClassification Report:")
print(classification_report(y_val, preds, digits=4))

SMOTE Model Accuracy (threshold = 0.35): 0.9725

Classification Report:
              precision    recall  f1-score   support

           0     0.9270    0.8467    0.8850       150
           1     0.9784    0.9905    0.9844      1050

    accuracy                         0.9725      1200
   macro avg     0.9527    0.9186    0.9347      1200
weighted avg     0.9719    0.9725    0.9720      1200



## Predict y based on the test file data and export as a csv

In [None]:
# X_test_tf  = vectorizer.transform(df_test['text_str'])

# prob_human_smote = clf_smote.predict_proba(X_test_tf)[:, 0]

# preds_test = np.where(prob_human_smote >= 0.35, 0, 1)

# submission_smote = pd.DataFrame({
#     'id':    df_test['id'],
#     'class': preds_test
# })
# submission_smote.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/smote_submission_tfidf2.csv', index=False)
# submission_smote.head(15)


Unnamed: 0,id,class
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,1
9,9,1


## Method 3: Domain-Expert Ensemble

#### Train domain-expert classifiers

In [53]:
# Subset by domain
train_dom1 = df_train[df_train.domain == 'domain1']
train_dom2 = df_train[df_train.domain == 'domain2']


# Define identical function for each domain
pipe_dom1 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 32000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C=best_C, penalty=best_clf_penalty, max_iter=1000, random_state=42))
])

pipe_dom2 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 32000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C=best_C, penalty=best_clf_penalty, max_iter=1000, random_state=42))
])


# Train
pipe_dom1.fit(train_dom1.text_str, train_dom1.label)
pipe_dom2.fit(train_dom2.text_str, train_dom2.label)

print("Domain‐expert models trained.")



Domain‐expert models trained.




##### Evaluate domain-expert ensemble on validation set

In [55]:
# Allocate an array for human-class probabilities
probs = np.zeros(len(df_val))

# Fill in per-domain probabilities
mask_dom1 = df_val['domain'] == 'domain1'
mask_dom2 = ~mask_dom1

probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

threshold = 0.70
preds_ensemble = np.where(probs >= threshold, 0, 1)

# Evaluate
print(f"Ensemble Validation Accuracy (thr={threshold:.2f}):",
      accuracy_score(y_val, preds_ensemble))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensemble, digits=4))

Ensemble Validation Accuracy (thr=0.70): 0.9783333333333334

Classification Report:
              precision    recall  f1-score   support

           0     0.9921    0.8333    0.9058       150
           1     0.9767    0.9990    0.9878      1050

    accuracy                         0.9783      1200
   macro avg     0.9844    0.9162    0.9468      1200
weighted avg     0.9786    0.9783    0.9775      1200



## Threshold sweep for domain-expert ensemble

In [58]:
# Compute ensemble “human” probabilities
probs = np.zeros(len(df_val))
mask_dom1     = df_val['domain'] == 'domain1'
mask_dom2     = ~mask_dom1
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

# Sweep thresholds
thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision(0) | Recall(0) | F1(0)")
for t in thresholds:
    preds = np.where(probs >= t, 0, 1)
    prec  = precision_score(y_val, preds, pos_label=0)
    rec   = recall_score(   y_val, preds, pos_label=0)
    f1    = f1_score(       y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}     |   {rec:.4f}   |  {f1:.4f}")

Threshold | Precision(0) | Recall(0) | F1(0)
  0.10    |   0.9437     |   0.8933   |  0.9178
  0.15    |   0.9640     |   0.8933   |  0.9273
  0.20    |   0.9781     |   0.8933   |  0.9338
  0.25    |   0.9852     |   0.8867   |  0.9333
  0.30    |   0.9852     |   0.8867   |  0.9333
  0.35    |   0.9848     |   0.8667   |  0.9220
  0.40    |   0.9848     |   0.8667   |  0.9220
  0.45    |   0.9846     |   0.8533   |  0.9143
  0.50    |   0.9845     |   0.8467   |  0.9104
  0.55    |   0.9845     |   0.8467   |  0.9104
  0.60    |   0.9845     |   0.8467   |  0.9104
  0.65    |   0.9843     |   0.8333   |  0.9025
  0.70    |   0.9921     |   0.8333   |  0.9058
  0.75    |   0.9921     |   0.8333   |  0.9058
  0.80    |   0.9921     |   0.8333   |  0.9058
  0.85    |   0.9918     |   0.8067   |  0.8897
  0.90    |   1.0000     |   0.7800   |  0.8764


#### Evaluate ensemble based attempts of different thresholds

In [64]:
probs = np.zeros(len(df_val))
mask_dom1 = df_val['domain'] == 'domain1'
mask_dom2 = ~mask_dom1
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

# 2. Apply threshold
threshold = 0.40
preds_ensem = np.where(probs >= threshold, 0, 1)

# 3. Evaluate
print(f"Ensemble Validation Accuracy (threshold = {threshold:.2f}):",
      accuracy_score(y_val, preds_ensem))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensem, digits=4))

Ensemble Validation Accuracy (threshold = 0.40): 0.9816666666666667

Classification Report:
              precision    recall  f1-score   support

           0     0.9848    0.8667    0.9220       150
           1     0.9813    0.9981    0.9896      1050

    accuracy                         0.9817      1200
   macro avg     0.9831    0.9324    0.9558      1200
weighted avg     0.9817    0.9817    0.9812      1200



In [67]:
# Load & prepare test
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

# Get human‐class probs from both domain‐expert models
p1 = pipe_dom1.predict_proba(df_test['text_str'])[:,0]
p2 = pipe_dom2.predict_proba(df_test['text_str'])[:,0]

probs_test = (p1 + p2) / 2

threshold = 0.40
preds_test = np.where(probs_test >= threshold, 0, 1)

submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/smote_submission_tfidf2.csv', index=False)
submission.head(15)

Unnamed: 0,id,class
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,0
9,9,1
