In [500]:
# Set up environment and list data files
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

## Load domain 1 data

In [501]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df1 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', lines=True)
print("Domain1 shape:", df1.shape)
print("Label distribution in domain1:\n", df1['label'].value_counts())
df1.head()


Domain1 shape: (1000, 3)
Label distribution in domain1:
 label
0    500
1    500
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...",0,0
1,"[222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...",0,1
2,"[736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...",0,2
3,"[48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...",0,3
4,"[2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...",0,4


## Load domain 2 data 

In [502]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df2 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', lines=True)
print("Domain1 shape:", df2.shape)
print("Label distribution in domain1:\n", df2['label'].value_counts())
df2.head()

Domain1 shape: (5000, 3)
Label distribution in domain1:
 label
1    4750
0     250
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[22, 6065, 76, 119, 13027, 575, 219, 22, 2435,...",0,0
1,"[1275, 1509, 12, 6113, 6287, 327, 411, 1139, 2...",0,1
2,"[575, 2962, 529, 4624, 39, 279, 1012, 277, 76,...",0,2
3,"[12, 6113, 2428, 69, 375, 1025, 2605, 76, 101,...",0,3
4,"[529, 76, 1509, 861, 1, 645, 1, 5013, 237, 3, ...",0,4


## Load the test dataset 


In [503]:
# Load test JSON into a DataFrame and inspect
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
print("Test set shape:", df_test.shape)
df_test.head()

Test set shape: (4000, 2)


Unnamed: 0,text,id
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0
1,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1
2,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2
3,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3
4,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4


## Combine domain 1 and domina 2


In [504]:
# Combine domain1 and domain2 into a single training DataFrame and inspect
train = pd.concat([
    df1.assign(domain='domain1'), # add a new column 'domain' with value 'domain1'
    df2.assign(domain='domain2') # in the new added column called 'domain', mark the value from domain 2 with value 'domain2'
], ignore_index=True)

print(train.head())

print("Combined train shape:", train.shape)
print("\nOverall label distribution:\n", train['label'].value_counts())
print("\nDomain breakdown:\n", train['domain'].value_counts())
train.sample(5)

                                                text  label  id   domain
0  [6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...      0   0  domain1
1  [222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...      0   1  domain1
2  [736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...      0   2  domain1
3  [48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...      0   3  domain1
4  [2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...      0   4  domain1
Combined train shape: (6000, 4)

Overall label distribution:
 label
1    5250
0     750
Name: count, dtype: int64

Domain breakdown:
 domain
domain2    5000
domain1    1000
Name: count, dtype: int64


Unnamed: 0,text,label,id,domain
4541,"[1, 5255, 377, 12598, 335, 2749, 101, 2239, 52...",1,3541,domain2
5057,"[3882, 12, 5203, 5638, 10463, 206, 301, 12, 16...",1,4057,domain2
807,"[132, 6222, 151, 55, 48, 2436, 366, 13544, 13,...",1,807,domain1
3108,"[353, 2606, 103, 353, 55, 838, 4, 1663, 1, 308...",1,2108,domain2
1741,"[324, 151, 12, 316, 114, 510, 110, 1846, 235, ...",1,741,domain2


## Method 1: Bag-of-Words + TF–IDF Baseline with Class-Weighted Logistic Regression

## Prepare text strings for TF-IDF input

In [505]:
# Convert token sequences into strings
train['text_str'] = train['text'].apply(lambda seq: ' '.join(map(str, seq)))
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

train[['id', 'label', 'domain', 'text_str']].head() # inspect examples

Unnamed: 0,id,label,domain,text_str
0,0,0,domain1,6 22 34 76 501 977 1 2514 13623 76 31 2085 277...
1,1,0,domain1,222 31 4108 104 132 361 39 2305 12 936 1287 66...
2,2,0,domain1,736 7194 113 12 366 2870 123 101 12 230 403 51...
3,3,0,domain1,48 1 2025 69 361 533 327 237 4150 13 22 2128 1...
4,4,0,domain1,2973 66 1 1493 260 2740 50 1027 50 1 3289 69 5...


In [506]:
# Split the combined DataFrame into df_train and df_val
df_train, df_val = train_test_split(train, test_size=0.2, stratify = train['label'], random_state=42)

print(df_train.head())
print(df_train.domain)

# Extract labels
y_train = df_train['label'].values
y_val   = df_val['label'].values

# Fit TF-IDF on the training text_str
vectorizer = TfidfVectorizer(max_features=15000)
X_train_tf = vectorizer.fit_transform(df_train['text_str'])
X_val_tf   = vectorizer.transform(df_val['text_str'])
X_test_tf  = vectorizer.transform(df_test['text_str'])



                                                   text  label    id   domain  \
384   [1, 231, 3252, 22, 595, 4, 133, 101, 4, 1496, ...      0   384  domain1   
3366  [13739, 3355, 101, 1, 73, 13, 22, 238, 706, 49...      1  2366  domain2   
1218  [76, 1509, 55, 305, 497, 12, 1326, 69, 635, 80...      0   218  domain2   
1312  [324, 151, 285, 3050, 1106, 1, 5971, 453, 69, ...      1   312  domain2   
873   [1882, 12, 1773, 507, 36, 64, 12, 630, 39, 168...      1   873  domain1   

                                               text_str  
384   1 231 3252 22 595 4 133 101 4 1496 4613 12 702...  
3366  13739 3355 101 1 73 13 22 238 706 496 12 2547 ...  
1218  76 1509 55 305 497 12 1326 69 635 80 1 279 76 ...  
1312  324 151 285 3050 1106 1 5971 453 69 852 39 143...  
873   1882 12 1773 507 36 64 12 630 39 1685 164 324 ...  
384     domain1
3366    domain2
1218    domain2
1312    domain2
873     domain1
         ...   
2487    domain2
2047    domain2
3718    domain2
3856    domain2
4606 

## Evulauate log-regression model baseline

In [507]:
# Train and evaluate a class-weighted Logistic Regression baseline
clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Fit on training TF-IDF features
clf.fit(X_train_tf, y_train)

# Predict on validation set
y_pred = clf.predict(X_val_tf)

# Evaluate
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred, digits=4))


Validation Accuracy: 0.9250

Classification Report:
              precision    recall  f1-score   support

           0     0.6546    0.8467    0.7384       150
           1     0.9771    0.9362    0.9562      1050

    accuracy                         0.9250      1200
   macro avg     0.8159    0.8914    0.8473      1200
weighted avg     0.9368    0.9250    0.9290      1200



## 5-Fold cross-validation of TF–IDF + logistic regression baseline

In [508]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=35000)),
    ('clf',  LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

# Stratified 5-fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    train['text_str'],
    train['label'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print("CV accuracies per fold:", scores)
print(f"Mean CV accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

CV accuracies per fold: [0.9058 0.925  0.92   0.9133 0.9292]
Mean CV accuracy: 0.9187 ± 0.0083


## Hyperparameter Tuning: GridSearchCV on TF–IDF + LR

In [509]:
# # param_grid for GridSearchCV
# param_grid = {
#     'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1,3), (2,3), (3,3), (1,4)], # define the n-gram range
#     'clf__C': [0.05, 0.1, 2, 3, 8],
#     'clf__penalty': ['l1', 'l2', None], # l1 is for Lasso, l2 is for Ridge
#     'tfidf__max_df': [0.5, 1.0], # max_df is the maximum document frequency
#     'tfidf__min_df': [2, 5], # min_df is the minimum document frequency
# }

# param_grid for RandomizedSearchCV
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3), (1, 4), (2, 4), (3, 4)],
    'tfidf__max_df': uniform(loc=0.5, scale=1.0),  # samples from 0.5 to 1.5
    'tfidf__min_df': [1, 2, 3, 5],
    'clf__C': uniform(loc=0.01, scale=10),  # continuous range for C
    'clf__penalty': ['l1', 'l2', None]
} 


# # Perform grid search with GridSearchCV
# grid_search = GridSearchCV(
#     estimator= pipeline,
#     param_grid= param_grid,
#     cv= cv,
#     scoring= 'accuracy',
#     n_jobs= -1,
#     verbose= 1
# )

# Perform grid search with RandomizedSearchCV
grid_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 30,
    cv= cv,
    scoring= 'accuracy',
    n_jobs= -1,
    verbose= 1
)

# # why use tarin['text_str'] and train['label]
# grid_search.fit(train['text_str'], train['label']) 

# # Perform grid search with GridSearchCV
# grid_search.fit(df_train['text_str'], df_train['label'])


# Perform grid search with RandomizedSearchCV
grid_search.fit(df_train['text_str'], df_train['label'])

print("Best CV Accuracy: {:.4f}".format(grid_search.best_score_))
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


110 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages

Best CV Accuracy: 0.9585
Best Parameters: {'clf__C': 1.3088706332961486, 'clf__penalty': None, 'tfidf__max_df': 0.6381795165788078, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}




### Re-evaluate model after hyperparameter tuning

In [510]:
# Evaluate tuned model on hold-out validation set
best_pipe = grid_search.best_estimator_
X_val_tuned = best_pipe.named_steps['tfidf'].transform(df_val['text_str'])
y_val_pred = best_pipe.named_steps['clf'].predict(X_val_tuned)

print("Hold-out Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=4))

Hold-out Validation Accuracy: 0.9666666666666667
              precision    recall  f1-score   support

           0     0.9435    0.7800    0.8540       150
           1     0.9693    0.9933    0.9812      1050

    accuracy                         0.9667      1200
   macro avg     0.9564    0.8867    0.9176      1200
weighted avg     0.9661    0.9667    0.9653      1200



## Threshold tuning for tuned TF–IDF + Logistic Regression

In [511]:
best_pipe = grid_search.best_estimator_

prob_human = best_pipe.predict_proba(df_val['text_str'])[:, 0]

thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision (human) | Recall (human) | F1 (human)")
for t in thresholds:
    preds = np.where(prob_human >= t, 0, 1)
    # there is a higher chance of beinf a human (0) if prob_human >= t
    prec = precision_score(y_val, preds, pos_label=0)
    rec  = recall_score(y_val, preds, pos_label=0)
    f1   = f1_score(y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}         |   {rec:.4f}     |  {f1:.4f}")

Threshold | Precision (human) | Recall (human) | F1 (human)
  0.10    |   0.7702         |   0.8267     |  0.7974
  0.15    |   0.7707         |   0.8067     |  0.7883
  0.20    |   0.9160         |   0.8000     |  0.8541
  0.25    |   0.9225         |   0.7933     |  0.8530
  0.30    |   0.9360         |   0.7800     |  0.8509
  0.35    |   0.9435         |   0.7800     |  0.8540
  0.40    |   0.9435         |   0.7800     |  0.8540
  0.45    |   0.9435         |   0.7800     |  0.8540
  0.50    |   0.9435         |   0.7800     |  0.8540
  0.55    |   0.9431         |   0.7733     |  0.8498
  0.60    |   0.9431         |   0.7733     |  0.8498
  0.65    |   0.9431         |   0.7733     |  0.8498
  0.70    |   0.9431         |   0.7733     |  0.8498
  0.75    |   0.9431         |   0.7733     |  0.8498
  0.80    |   0.9426         |   0.7667     |  0.8456
  0.85    |   0.9483         |   0.7333     |  0.8271
  0.90    |   0.9558         |   0.7200     |  0.8213


In [512]:
best_ngram = grid_search.best_params_['tfidf__ngram_range']
print(f"Best_ngrom: {best_ngram}")

best_C = grid_search.best_params_['clf__C']
print(f"Best_C: {best_C}")

best_clf_penalty = grid_search.best_params_['clf__penalty']
print(f"Best_clf_penalty: {best_clf_penalty}")


# Train a new pipeline with the best parameters
pipeline_tuned = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 30000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C=best_C, penalty=best_clf_penalty, max_iter=1000, random_state=42))
])

# Fit the tuned pipeline on the training set
pipeline_tuned.fit(df_train['text_str'], df_train['label'])

# Predict on the validation set
prob_human = pipeline_tuned.predict_proba(df_val['text_str'])[:, 0]

print(prob_human[:55])

threshold = 0.65
preds_val = np.where(prob_human >= threshold, 0, 1)
print(preds_val[:55])

# Evaluate
print(f"Validation Accuracy (threshold = {threshold:.2f}): {accuracy_score(y_val, preds_val):.4f}\n")
print("Classification Report:")
print(classification_report(y_val, preds_val, digits=4))

Best_ngrom: (1, 2)
Best_C: 1.3088706332961486
Best_clf_penalty: None
[0.     0.     1.     0.     0.     0.     0.     0.     1.     0.
 0.     1.     0.     1.     0.     0.     0.     0.     0.     0.
 0.     0.     0.     0.     0.     1.     0.     1.     0.     0.
 0.     0.     1.     0.     0.     0.     0.     0.     0.     0.
 0.     0.     0.     0.     0.     0.     0.     0.     0.4088 0.
 0.     0.     0.     0.     0.    ]
[1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Validation Accuracy (threshold = 0.65): 0.9667

Classification Report:
              precision    recall  f1-score   support

           0     0.9583    0.7667    0.8519       150
           1     0.9676    0.9952    0.9812      1050

    accuracy                         0.9667      1200
   macro avg     0.9630    0.8810    0.9165      1200
weighted avg     0.9664    0.9667    0.9650      1200





## Predict y based on the test file data and export as a csv

In [513]:
# prob_human_test = pipeline_tuned.predict_proba(df_test['text_str'])[:, 0]
# threshold_final = threshold
# preds_test = np.where(prob_human_test >= threshold, 0, 1)

# submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
# submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/tfidf2.csv', index=False)
# submission.head(15)

## Method 2: SMOTE on Domain 2’s Human Class

In [514]:
from scipy.sparse import vstack, csr_matrix
from imblearn.over_sampling import SMOTE

In [515]:
# Extract TF–IDF vectorizer and optimal C from the tuned pipeline
vectorizer = best_pipe.named_steps['tfidf']
C_opt = best_pipe.named_steps['clf'].C

# Transform train/validation into feature matrices
X_train_all = vectorizer.transform(df_train['text_str'])
X_val_tf    = vectorizer.transform(df_val['text_str'])
y_train_all = df_train['label'].values

# Select Domain 2 subset
mask_dom2 = df_train['domain'] == 'domain2'
X_dom2    = X_train_all[mask_dom2].toarray()
y_dom2    = y_train_all[mask_dom2]

# Apply SMOTE on Domain 2 human class (label 0)
smote = SMOTE(random_state=42)
X_dom2_res, y_dom2_res = smote.fit_resample(X_dom2, y_dom2)

# Re-combine with Domain 1 unchanged
mask_dom1    = ~mask_dom2
X_dom1       = X_train_all[mask_dom1]
y_dom1       = y_train_all[mask_dom1]
X_dom2_res_sp = csr_matrix(X_dom2_res)

X_res = vstack([X_dom1, X_dom2_res_sp])
y_res = np.concatenate([y_dom1, y_dom2_res])

# Retrain Logistic Regression on SMOTE-augmented data
clf_smote = LogisticRegression(class_weight='balanced',
                               C=C_opt,
                               max_iter=1000,
                               random_state=42)
clf_smote.fit(X_res, y_res)

# Predict on validation using threshold = 0.70
prob_hum_smote = clf_smote.predict_proba(X_val_tf)[:, 0]
preds_smote    = np.where(prob_hum_smote >= 0.70, 0, 1)

# Evaluate
print("SMOTE (Domain2 Human) Validation Accuracy:", accuracy_score(y_val, preds_smote))
print("\nClassification Report:")
print(classification_report(y_val, preds_smote, digits=4))

SMOTE (Domain2 Human) Validation Accuracy: 0.9125

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.3000    0.4615       150
           1     0.9091    1.0000    0.9524      1050

    accuracy                         0.9125      1200
   macro avg     0.9545    0.6500    0.7070      1200
weighted avg     0.9205    0.9125    0.8910      1200



##### Threshold tuning

In [516]:
X_val_tf = vectorizer.transform(df_val['text_str'])

# Get human-written probabilities from your SMOTE-trained classifier
prob_human_smote = clf_smote.predict_proba(X_val_tf)[:, 0]

# Sweep thresholds and report metrics for the human class
thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision(0) | Recall(0) | F1(0)")
for t in thresholds:
    preds = np.where(prob_human_smote >= t, 0, 1)
    prec  = precision_score(y_val, preds, pos_label=0)
    rec   = recall_score(y_val, preds, pos_label=0)
    f1    = f1_score(y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |  {prec:.4f}   |   {rec:.4f}  |  {f1:.4f}")

Threshold | Precision(0) | Recall(0) | F1(0)
  0.10    |  0.2788   |   1.0000  |  0.4360
  0.15    |  0.4233   |   0.9933  |  0.5936
  0.20    |  0.5428   |   0.9733  |  0.6969
  0.25    |  0.6227   |   0.9133  |  0.7405
  0.30    |  0.6904   |   0.9067  |  0.7839
  0.35    |  0.7288   |   0.8600  |  0.7890
  0.40    |  0.8960   |   0.7467  |  0.8145
  0.45    |  0.9479   |   0.6067  |  0.7398
  0.50    |  0.9630   |   0.5200  |  0.6753
  0.55    |  0.9848   |   0.4333  |  0.6019
  0.60    |  0.9833   |   0.3933  |  0.5619
  0.65    |  1.0000   |   0.3467  |  0.5149
  0.70    |  1.0000   |   0.3000  |  0.4615
  0.75    |  1.0000   |   0.2800  |  0.4375
  0.80    |  1.0000   |   0.2533  |  0.4043
  0.85    |  1.0000   |   0.1867  |  0.3146
  0.90    |  1.0000   |   0.1533  |  0.2659


In [517]:
threshold = 0.35
preds = np.where(prob_human_smote >= threshold, 0, 1)

print(f"SMOTE Model Accuracy (threshold = {threshold:.2f}):",
      accuracy_score(y_val, preds))
print("\nClassification Report:")
print(classification_report(y_val, preds, digits=4))

SMOTE Model Accuracy (threshold = 0.35): 0.9425

Classification Report:
              precision    recall  f1-score   support

           0     0.7288    0.8600    0.7890       150
           1     0.9795    0.9543    0.9667      1050

    accuracy                         0.9425      1200
   macro avg     0.8541    0.9071    0.8779      1200
weighted avg     0.9481    0.9425    0.9445      1200



## Predict y based on the test file data and export as a csv

In [518]:
# X_test_tf  = vectorizer.transform(df_test['text_str'])

# prob_human_smote = clf_smote.predict_proba(X_test_tf)[:, 0]

# preds_test = np.where(prob_human_smote >= 0.35, 0, 1)

# submission_smote = pd.DataFrame({
#     'id':    df_test['id'],
#     'class': preds_test
# })
# submission_smote.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/smote_submission_tfidf2.csv', index=False)
# submission_smote.head(15)


## Method 3: Domain-Expert Ensemble

#### Train domain-expert classifiers

In [519]:
# Subset by domain
train_dom1 = df_train[df_train.domain == 'domain1']
train_dom2 = df_train[df_train.domain == 'domain2']


# Define identical function for each domain
pipe_dom1 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 30000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C = best_C, penalty=best_clf_penalty, max_iter= 1100, random_state=42))
])

pipe_dom2 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 30000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C = best_C, penalty=best_clf_penalty, max_iter= 1100, random_state=42))
])


# Train
pipe_dom1.fit(train_dom1.text_str, train_dom1.label)
pipe_dom2.fit(train_dom2.text_str, train_dom2.label)

print("Domain‐expert models trained.")



Domain‐expert models trained.




##### Evaluate domain-expert ensemble on validation set

In [520]:
# Allocate an array for human-class probabilities
probs = np.zeros(len(df_val))

# Fill in per-domain probabilities
mask_dom1 = df_val['domain'] == 'domain1'
mask_dom2 = ~mask_dom1

probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

threshold = 0.70
preds_ensemble = np.where(probs >= threshold, 0, 1)

# Evaluate
print(f"Ensemble Validation Accuracy (thr={threshold:.2f}):",
      accuracy_score(y_val, preds_ensemble))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensemble, digits=4))

Ensemble Validation Accuracy (thr=0.70): 0.9841666666666666

Classification Report:
              precision    recall  f1-score   support

           0     0.9852    0.8867    0.9333       150
           1     0.9840    0.9981    0.9910      1050

    accuracy                         0.9842      1200
   macro avg     0.9846    0.9424    0.9622      1200
weighted avg     0.9842    0.9842    0.9838      1200



## Threshold sweep for domain-expert ensemble

In [521]:
# Compute ensemble “human” probabilities
probs = np.zeros(len(df_val))
mask_dom1     = df_val['domain'] == 'domain1'
mask_dom2     = ~mask_dom1
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

# Sweep thresholds
thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision(0) | Recall(0) | F1(0)")
for t in thresholds:
    preds = np.where(probs >= t, 0, 1)
    prec  = precision_score(y_val, preds, pos_label=0)
    rec   = recall_score(   y_val, preds, pos_label=0)
    f1    = f1_score(       y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}     |   {rec:.4f}   |  {f1:.4f}")

Threshold | Precision(0) | Recall(0) | F1(0)
  0.10    |   0.9375     |   0.9000   |  0.9184
  0.15    |   0.9441     |   0.9000   |  0.9215
  0.20    |   0.9507     |   0.9000   |  0.9247
  0.25    |   0.9574     |   0.9000   |  0.9278
  0.30    |   0.9574     |   0.9000   |  0.9278
  0.35    |   0.9574     |   0.9000   |  0.9278
  0.40    |   0.9643     |   0.9000   |  0.9310
  0.45    |   0.9643     |   0.9000   |  0.9310
  0.50    |   0.9643     |   0.9000   |  0.9310
  0.55    |   0.9643     |   0.9000   |  0.9310
  0.60    |   0.9710     |   0.8933   |  0.9306
  0.65    |   0.9710     |   0.8933   |  0.9306
  0.70    |   0.9852     |   0.8867   |  0.9333
  0.75    |   0.9852     |   0.8867   |  0.9333
  0.80    |   0.9851     |   0.8800   |  0.9296
  0.85    |   0.9850     |   0.8733   |  0.9258
  0.90    |   0.9846     |   0.8533   |  0.9143


#### Evaluate ensemble based attempts of different thresholds

In [522]:
probs = np.zeros(len(df_val)) 
mask_dom1 = df_val['domain'] == 'domain1'
print(mask_dom1)
mask_dom2 = ~mask_dom1
# probs that the data from domain 1 is human-written
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
# probs that the data from domain 2 is human-written
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

print(probs[:55])

# 2. Apply threshold
# when the threshold is too high such as 0.75 and 0.80, even though the precision might be high like 98%\
# but the accuracy will be low, because of the overfitting. Therfore, the threshold should be set to 0.40 or 0.35
threshold = 0.35 
preds_ensem = np.where(probs >= threshold, 0, 1)


# 3. Evaluate
print(f"Ensemble Validation Accuracy (threshold = {threshold:.2f}):",
      accuracy_score(y_val, preds_ensem))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensem, digits=4))

4929    False
958      True
1243    False
1766    False
2509    False
        ...  
5071    False
2746    False
5895    False
2826    False
1669    False
Name: domain, Length: 1200, dtype: bool
[1.1958e-09 0.0000e+00 1.0000e+00 7.7828e-09 4.7758e-07 1.7055e-08
 6.2616e-03 2.5782e-07 1.0000e+00 3.1706e-07 2.0825e-09 1.0000e+00
 3.5381e-09 6.6684e-01 2.2781e-08 9.9012e-09 1.9537e-07 4.6507e-09
 7.7761e-09 9.7300e-13 1.2758e-09 7.3330e-09 4.1300e-13 2.7721e-09
 1.8569e-10 1.0000e+00 5.7490e-05 1.0000e+00 1.7833e-08 1.6078e-08
 2.4763e-08 4.0181e-10 1.0000e+00 3.3183e-10 2.0650e-13 1.5193e-07
 9.2317e-07 3.6859e-11 2.5010e-08 3.1189e-07 0.0000e+00 0.0000e+00
 1.0790e-09 1.8690e-07 1.0299e-06 4.9545e-06 1.6923e-09 2.8826e-12
 1.0000e+00 3.7800e-07 0.0000e+00 1.7771e-11 1.0886e-08 1.0190e-07
 2.1715e-10]
Ensemble Validation Accuracy (threshold = 0.35): 0.9825

Classification Report:
              precision    recall  f1-score   support

           0     0.9574    0.9000    0.9278       150
 

## Check for overfitting

In [523]:
# Compute “human” probabilities on the training split
probs_train = np.zeros(len(df_train))
mask_tr_dom1 = df_train['domain'] == 'domain1'
mask_tr_dom2 = ~mask_tr_dom1

probs_train[mask_tr_dom1] = pipe_dom1.predict_proba(df_train.loc[mask_tr_dom1, 'text_str'])[:, 0]
probs_train[mask_tr_dom2] = pipe_dom2.predict_proba(df_train.loc[mask_tr_dom2, 'text_str'])[:, 0]

threshold = 0.40
preds_train = np.where(probs_train >= threshold, 0, 1)

# Evaluate
print(f"Ensemble Training Accuracy (thr={threshold:.2f}):",
      accuracy_score(df_train['label'], preds_train))
print("\nTraining Classification Report:")
print(classification_report(df_train['label'], preds_train, digits=4))

Ensemble Training Accuracy (thr=0.40): 0.9997916666666666

Training Classification Report:
              precision    recall  f1-score   support

           0     0.9983    1.0000    0.9992       600
           1     1.0000    0.9998    0.9999      4200

    accuracy                         0.9998      4800
   macro avg     0.9992    0.9999    0.9995      4800
weighted avg     0.9998    0.9998    0.9998      4800



In [524]:
# Load & prepare test
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

# Get human‐class probs from both domain‐expert models
p1 = pipe_dom1.predict_proba(df_test['text_str'])[:,0]
p2 = pipe_dom2.predict_proba(df_test['text_str'])[:,0]

probs_test = (p1 + p2) / 2

threshold_final = threshold
preds_test = np.where(probs_test >= threshold, 0, 1)

submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/smote_submission_tfidf2.csv', index=False)
submission.head(15)

Unnamed: 0,id,class
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,0
9,9,1
