In [1620]:
# Set up environment and list data files
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

#### Load domain 1 data

In [1621]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df1 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', lines=True)
print("Domain1 shape:", df1.shape)
print("Label distribution in domain1:\n", df1['label'].value_counts())
df1.head()


Domain1 shape: (1000, 3)
Label distribution in domain1:
 label
0    500
1    500
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...",0,0
1,"[222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...",0,1
2,"[736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...",0,2
3,"[48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...",0,3
4,"[2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...",0,4


#### Load domain 2 data 

In [1622]:
# Load domain1 JSON as newline-delimited JSON (JSON Lines)
df2 = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', lines=True)
print("Domain1 shape:", df2.shape)
print("Label distribution in domain1:\n", df2['label'].value_counts())
df2.head()

Domain1 shape: (5000, 3)
Label distribution in domain1:
 label
1    4750
0     250
Name: count, dtype: int64


Unnamed: 0,text,label,id
0,"[22, 6065, 76, 119, 13027, 575, 219, 22, 2435,...",0,0
1,"[1275, 1509, 12, 6113, 6287, 327, 411, 1139, 2...",0,1
2,"[575, 2962, 529, 4624, 39, 279, 1012, 277, 76,...",0,2
3,"[12, 6113, 2428, 69, 375, 1025, 2605, 76, 101,...",0,3
4,"[529, 76, 1509, 861, 1, 645, 1, 5013, 237, 3, ...",0,4


#### Load the test dataset 


In [1623]:
# Load test JSON into a DataFrame and inspect
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
print("Test set shape:", df_test.shape)
df_test.head()

Test set shape: (4000, 2)


Unnamed: 0,text,id
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0
1,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1
2,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2
3,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3
4,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4


#### Combine domain 1 and domina 2


In [1624]:
# Combine domain1 and domain2 into a single training DataFrame and inspect
train = pd.concat([
    df1.assign(domain='domain1'), # add a new column 'domain' with value 'domain1'
    df2.assign(domain='domain2') # in the new added column called 'domain', mark the value from domain 2 with value 'domain2'
], ignore_index=True)

print(train.head())

print("Combined train shape:", train.shape)
print("\nOverall label distribution:\n", train['label'].value_counts())
print("\nDomain breakdown:\n", train['domain'].value_counts())
train.sample(5)

                                                text  label  id   domain
0  [6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...      0   0  domain1
1  [222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...      0   1  domain1
2  [736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...      0   2  domain1
3  [48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...      0   3  domain1
4  [2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...      0   4  domain1
Combined train shape: (6000, 4)

Overall label distribution:
 label
1    5250
0     750
Name: count, dtype: int64

Domain breakdown:
 domain
domain2    5000
domain1    1000
Name: count, dtype: int64


Unnamed: 0,text,label,id,domain
5962,"[3293, 2487, 101, 12, 996, 69, 2487, 13, 206, ...",1,4962,domain2
2571,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...",1,1571,domain2
1549,"[1, 26, 427, 47, 206, 421, 5989, 1, 2591, 39, ...",1,549,domain2
4480,"[1436, 4552, 101, 307, 48, 12, 316, 114, 635, ...",1,3480,domain2
4602,"[6711, 238, 5410, 529, 222, 151, 12, 2971, 39,...",1,3602,domain2


## Method 1: Bag-of-Words + TF–IDF Baseline with Class-Weighted Logistic Regression

#### Prepare text strings for TF-IDF input

In [1625]:
# Convert token sequences into strings
train['text_str'] = train['text'].apply(lambda seq: ' '.join(map(str, seq)))
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

train[['id', 'label', 'domain', 'text_str']].head() # inspect examples

Unnamed: 0,id,label,domain,text_str
0,0,0,domain1,6 22 34 76 501 977 1 2514 13623 76 31 2085 277...
1,1,0,domain1,222 31 4108 104 132 361 39 2305 12 936 1287 66...
2,2,0,domain1,736 7194 113 12 366 2870 123 101 12 230 403 51...
3,3,0,domain1,48 1 2025 69 361 533 327 237 4150 13 22 2128 1...
4,4,0,domain1,2973 66 1 1493 260 2740 50 1027 50 1 3289 69 5...


In [1626]:
# Split the combined DataFrame into df_train and df_val
df_train, df_val = train_test_split(train, test_size=0.2, stratify = train['label'], random_state=42)

print(df_train.head())
print("\n")
print(df_train.domain)

# Extract labels
y_train = df_train['label'].values
y_val   = df_val['label'].values

# Fit TF-IDF on the training text_str
vectorizer = TfidfVectorizer(max_features=15000)
X_train_tf = vectorizer.fit_transform(df_train['text_str'])
X_val_tf   = vectorizer.transform(df_val['text_str'])
X_test_tf  = vectorizer.transform(df_test['text_str'])



                                                   text  label    id   domain  \
384   [1, 231, 3252, 22, 595, 4, 133, 101, 4, 1496, ...      0   384  domain1   
3366  [13739, 3355, 101, 1, 73, 13, 22, 238, 706, 49...      1  2366  domain2   
1218  [76, 1509, 55, 305, 497, 12, 1326, 69, 635, 80...      0   218  domain2   
1312  [324, 151, 285, 3050, 1106, 1, 5971, 453, 69, ...      1   312  domain2   
873   [1882, 12, 1773, 507, 36, 64, 12, 630, 39, 168...      1   873  domain1   

                                               text_str  
384   1 231 3252 22 595 4 133 101 4 1496 4613 12 702...  
3366  13739 3355 101 1 73 13 22 238 706 496 12 2547 ...  
1218  76 1509 55 305 497 12 1326 69 635 80 1 279 76 ...  
1312  324 151 285 3050 1106 1 5971 453 69 852 39 143...  
873   1882 12 1773 507 36 64 12 630 39 1685 164 324 ...  


384     domain1
3366    domain2
1218    domain2
1312    domain2
873     domain1
         ...   
2487    domain2
2047    domain2
3718    domain2
3856    domain2
460

#### Evulauate log-regression model baseline

In [1627]:
# Train and evaluate a class-weighted Logistic Regression baseline
clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Fit on training TF-IDF features
clf.fit(X_train_tf, y_train)

# Predict on validation set
y_pred = clf.predict(X_val_tf)

# Evaluate
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred, digits=4))


Validation Accuracy: 0.9250

Classification Report:
              precision    recall  f1-score   support

           0     0.6546    0.8467    0.7384       150
           1     0.9771    0.9362    0.9562      1050

    accuracy                         0.9250      1200
   macro avg     0.8159    0.8914    0.8473      1200
weighted avg     0.9368    0.9250    0.9290      1200



#### 5-Fold cross-validation of TF–IDF + logistic regression baseline

In [1628]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=35000)),
    ('clf',  LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

# Stratified 5-fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    train['text_str'],
    train['label'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print("CV accuracies per fold:", scores)
print(f"Mean CV accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

CV accuracies per fold: [0.90583333 0.925      0.92       0.91333333 0.92916667]
Mean CV accuracy: 0.9187 ± 0.0083


#### Hyperparameter Tuning: RandomizedSearchCV on TF–IDF + LR

In [1629]:
# # param_grid for GridSearchCV
# param_grid = {
#     'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1,3), (2,3), (3,3), (1,4)], # define the n-gram range
#     'clf__C': [0.05, 0.1, 2, 3, 8],
#     'clf__penalty': ['l1', 'l2', None], # l1 is for Lasso, l2 is for Ridge
#     'tfidf__max_df': [0.5, 1.0], # max_df is the maximum document frequency
#     'tfidf__min_df': [2, 5], # min_df is the minimum document frequency
# }

# param_grid for RandomizedSearchCV
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3), (1, 4), (2, 4), (3, 4)],
    'tfidf__max_df': uniform(loc=0.5, scale=1.0),  # samples from 0.5 to 1.5
    'tfidf__min_df': [1, 2, 3, 5],
    'clf__C': uniform(loc=0.01, scale=10),  # continuous range for C
    'clf__penalty': ['l1', 'l2', None]
} 


# # Perform grid search with GridSearchCV
# grid_search = GridSearchCV(
#     estimator= pipeline,
#     param_grid= param_grid,
#     cv= cv,
#     scoring= 'accuracy',
#     n_jobs= -1,
#     verbose= 1
# )

# Perform grid search with RandomizedSearchCV
grid_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 30,
    cv= cv,
    scoring= 'accuracy',
    n_jobs= -1,
    verbose= 1
)

# # why use tarin['text_str'] and train['label]
# grid_search.fit(train['text_str'], train['label']) 

# # Perform grid search with GridSearchCV
# grid_search.fit(df_train['text_str'], df_train['label'])


# Perform grid search with RandomizedSearchCV
grid_search.fit(df_train['text_str'], df_train['label'])

print("Best CV Accuracy: {:.4f}".format(grid_search.best_score_))
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


95 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/

Best CV Accuracy: 0.9604
Best Parameters: {'clf__C': 5.696204002320979, 'clf__penalty': None, 'tfidf__max_df': 0.6927494788845082, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 4)}




##### Re-evaluate model after hyperparameter tuning

In [1630]:
# Evaluate tuned model on hold-out validation set
best_pipe = grid_search.best_estimator_
X_val_tuned = best_pipe.named_steps['tfidf'].transform(df_val['text_str'])
y_val_pred = best_pipe.named_steps['clf'].predict(X_val_tuned)

print("Hold-out Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=4))

Hold-out Validation Accuracy: 0.9583333333333334
              precision    recall  f1-score   support

           0     0.9386    0.7133    0.8106       150
           1     0.9604    0.9933    0.9766      1050

    accuracy                         0.9583      1200
   macro avg     0.9495    0.8533    0.8936      1200
weighted avg     0.9577    0.9583    0.9558      1200



#### Threshold tuning for tuned TF–IDF + Logistic Regression

In [1631]:
best_pipe = grid_search.best_estimator_

prob_human = best_pipe.predict_proba(df_val['text_str'])[:, 0]

thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision (human) | Recall (human) | F1 (human)")
for t in thresholds:
    preds = np.where(prob_human >= t, 0, 1)
    # there is a higher chance of beinf a human (0) if prob_human >= t
    prec = precision_score(y_val, preds, pos_label=0)
    rec  = recall_score(y_val, preds, pos_label=0)
    f1   = f1_score(y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}         |   {rec:.4f}     |  {f1:.4f}")

Threshold | Precision (human) | Recall (human) | F1 (human)
  0.10    |   0.9328         |   0.7400     |  0.8253
  0.15    |   0.9316         |   0.7267     |  0.8165
  0.20    |   0.9310         |   0.7200     |  0.8120
  0.25    |   0.9310         |   0.7200     |  0.8120
  0.30    |   0.9391         |   0.7200     |  0.8151
  0.35    |   0.9391         |   0.7200     |  0.8151
  0.40    |   0.9391         |   0.7200     |  0.8151
  0.45    |   0.9391         |   0.7200     |  0.8151
  0.50    |   0.9386         |   0.7133     |  0.8106
  0.55    |   0.9386         |   0.7133     |  0.8106
  0.60    |   0.9381         |   0.7067     |  0.8061
  0.65    |   0.9375         |   0.7000     |  0.8015
  0.70    |   0.9375         |   0.7000     |  0.8015
  0.75    |   0.9369         |   0.6933     |  0.7969
  0.80    |   0.9369         |   0.6933     |  0.7969
  0.85    |   0.9369         |   0.6933     |  0.7969
  0.90    |   0.9364         |   0.6867     |  0.7923


In [1632]:
best_ngram = grid_search.best_params_['tfidf__ngram_range']
print(f"Best_ngrom: {best_ngram}")

best_C = grid_search.best_params_['clf__C']
print(f"Best_C: {best_C}")

best_clf_penalty = grid_search.best_params_['clf__penalty']
print(f"Best_clf_penalty: {best_clf_penalty}")


# Train a new pipeline with the best parameters
pipeline_tuned = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 30000, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C=best_C, penalty=best_clf_penalty, max_iter=1000, random_state=42))
])

# Fit the tuned pipeline on the training set
pipeline_tuned.fit(df_train['text_str'], df_train['label'])

# Predict on the validation set
prob_human = pipeline_tuned.predict_proba(df_val['text_str'])[:, 0]

print(prob_human[:55])

threshold = 0.65
preds_val = np.where(prob_human >= threshold, 0, 1)
print(preds_val[:55])

# Evaluate
print(f"Validation Accuracy (threshold = {threshold:.2f}): {accuracy_score(y_val, preds_val):.4f}\n")
print("Classification Report:")
print(classification_report(y_val, preds_val, digits=4))

Best_ngrom: (1, 4)
Best_C: 5.696204002320979
Best_clf_penalty: None
[0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.86494265e-06 0.00000000e+00
 1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
 0.00000000e+00 9.99999909e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00]
[1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 



#### Predict y based on the test file data and export as a csv

In [1633]:
# prob_human_test = pipeline_tuned.predict_proba(df_test['text_str'])[:, 0]
# threshold_final = threshold
# preds_test = np.where(prob_human_test >= threshold, 0, 1)

# submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
# submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/tfidf2.csv', index=False)
# submission.head(15)

## Method 2: SMOTE on Domain 2’s Human Class

In [1634]:
# from scipy.sparse import vstack, csr_matrix
# from imblearn.over_sampling import SMOTE

In [1635]:
# # Extract TF–IDF vectorizer and optimal C from the tuned pipeline
# vectorizer = best_pipe.named_steps['tfidf']
# C_opt = best_pipe.named_steps['clf'].C

# # Transform train/validation into feature matrices
# X_train_all = vectorizer.transform(df_train['text_str'])
# X_val_tf    = vectorizer.transform(df_val['text_str'])
# y_train_all = df_train['label'].values

# # Select Domain 2 subset
# mask_dom2 = df_train['domain'] == 'domain2'
# X_dom2    = X_train_all[mask_dom2].toarray()
# y_dom2    = y_train_all[mask_dom2]

# # Apply SMOTE on Domain 2 human class (label 0)
# smote = SMOTE(random_state=42)
# X_dom2_res, y_dom2_res = smote.fit_resample(X_dom2, y_dom2)

# # Re-combine with Domain 1 unchanged
# mask_dom1    = ~mask_dom2
# X_dom1       = X_train_all[mask_dom1]
# y_dom1       = y_train_all[mask_dom1]
# X_dom2_res_sp = csr_matrix(X_dom2_res)

# X_res = vstack([X_dom1, X_dom2_res_sp])
# y_res = np.concatenate([y_dom1, y_dom2_res])

# # Retrain Logistic Regression on SMOTE-augmented data
# clf_smote = LogisticRegression(class_weight='balanced',
#                                C=C_opt,
#                                max_iter=1000,
#                                random_state=42)
# clf_smote.fit(X_res, y_res)

# # Predict on validation using threshold = 0.70
# prob_hum_smote = clf_smote.predict_proba(X_val_tf)[:, 0]
# preds_smote    = np.where(prob_hum_smote >= 0.70, 0, 1)

# # Evaluate
# print("SMOTE (Domain2 Human) Validation Accuracy:", accuracy_score(y_val, preds_smote))
# print("\nClassification Report:")
# print(classification_report(y_val, preds_smote, digits=4))

##### Threshold tuning

In [1636]:
# X_val_tf = vectorizer.transform(df_val['text_str'])

# # Get human-written probabilities from your SMOTE-trained classifier
# prob_human_smote = clf_smote.predict_proba(X_val_tf)[:, 0]

# # Sweep thresholds and report metrics for the human class
# thresholds = np.linspace(0.1, 0.9, 17)
# print("Threshold | Precision(0) | Recall(0) | F1(0)")
# for t in thresholds:
#     preds = np.where(prob_human_smote >= t, 0, 1)
#     prec  = precision_score(y_val, preds, pos_label=0)
#     rec   = recall_score(y_val, preds, pos_label=0)
#     f1    = f1_score(y_val, preds, pos_label=0)
#     print(f"  {t:.2f}    |  {prec:.4f}   |   {rec:.4f}  |  {f1:.4f}")

In [1637]:
# threshold = 0.35
# preds = np.where(prob_human_smote >= threshold, 0, 1)

# print(f"SMOTE Model Accuracy (threshold = {threshold:.2f}):",
#       accuracy_score(y_val, preds))
# print("\nClassification Report:")
# print(classification_report(y_val, preds, digits=4))

#### Predict y based on the test file data and export as a csv

In [1638]:
# X_test_tf  = vectorizer.transform(df_test['text_str'])

# prob_human_smote = clf_smote.predict_proba(X_test_tf)[:, 0]

# preds_test = np.where(prob_human_smote >= 0.35, 0, 1)

# submission_smote = pd.DataFrame({
#     'id':    df_test['id'],
#     'class': preds_test
# })
# submission_smote.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/smote_submission_tfidf2.csv', index=False)
# submission_smote.head(15)


## Method 3: Domain-Expert Ensemble

#### Train domain-expert classifiers

In [1639]:
# Subset by domain
train_dom1 = df_train[df_train.domain == 'domain1']
train_dom2 = df_train[df_train.domain == 'domain2']


# Define identical function for each domain
pipe_dom1 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 24855, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C = best_C, penalty=best_clf_penalty, max_iter= 1120, random_state=42))
])

pipe_dom2 = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 24855, ngram_range = best_ngram)),
    ('clf',  LogisticRegression(class_weight='balanced', C = best_C, penalty=best_clf_penalty, max_iter= 1120, random_state=42))
])


# Train
pipe_dom1.fit(train_dom1.text_str, train_dom1.label)
pipe_dom2.fit(train_dom2.text_str, train_dom2.label)

print("Domain‐expert models trained.")



Domain‐expert models trained.




##### Evaluate domain-expert ensemble on validation set

In [1640]:
# Allocate an array for human-class probabilities
probs = np.zeros(len(df_val))

# Fill in per-domain probabilities
mask_dom1 = df_val['domain'] == 'domain1'
mask_dom2 = ~mask_dom1

probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

threshold = 0.30
preds_ensemble = np.where(probs >= threshold, 0, 1)

# Evaluate
print(f"Ensemble Validation Accuracy (thr={threshold:.2f}):",
      accuracy_score(y_val, preds_ensemble))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensemble, digits=4))

Ensemble Validation Accuracy (thr=0.30): 0.9825

Classification Report:
              precision    recall  f1-score   support

           0     0.9708    0.8867    0.9268       150
           1     0.9840    0.9962    0.9901      1050

    accuracy                         0.9825      1200
   macro avg     0.9774    0.9414    0.9584      1200
weighted avg     0.9824    0.9825    0.9822      1200



#### Threshold sweep for domain-expert ensemble

In [1641]:
# Compute ensemble “human” probabilities
probs = np.zeros(len(df_val))
mask_dom1     = df_val['domain'] == 'domain1'
mask_dom2     = ~mask_dom1
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

# Sweep thresholds
thresholds = np.linspace(0.1, 0.9, 17)
print("Threshold | Precision(0) | Recall(0) | F1(0)")
for t in thresholds:
    preds = np.where(probs >= t, 0, 1)
    prec  = precision_score(y_val, preds, pos_label=0)
    rec   = recall_score(   y_val, preds, pos_label=0)
    f1    = f1_score(       y_val, preds, pos_label=0)
    print(f"  {t:.2f}    |   {prec:.4f}     |   {rec:.4f}   |  {f1:.4f}")

Threshold | Precision(0) | Recall(0) | F1(0)
  0.10    |   0.9433     |   0.8867   |  0.9141
  0.15    |   0.9568     |   0.8867   |  0.9204
  0.20    |   0.9638     |   0.8867   |  0.9236
  0.25    |   0.9638     |   0.8867   |  0.9236
  0.30    |   0.9708     |   0.8867   |  0.9268
  0.35    |   0.9708     |   0.8867   |  0.9268
  0.40    |   0.9851     |   0.8800   |  0.9296
  0.45    |   0.9848     |   0.8667   |  0.9220
  0.50    |   0.9847     |   0.8600   |  0.9181
  0.55    |   0.9847     |   0.8600   |  0.9181
  0.60    |   0.9923     |   0.8600   |  0.9214
  0.65    |   0.9922     |   0.8533   |  0.9176
  0.70    |   0.9922     |   0.8533   |  0.9176
  0.75    |   0.9922     |   0.8467   |  0.9137
  0.80    |   0.9922     |   0.8467   |  0.9137
  0.85    |   0.9921     |   0.8333   |  0.9058
  0.90    |   1.0000     |   0.8267   |  0.9051


#### Evaluate ensemble based attempts of different thresholds

In [1642]:
probs = np.zeros(len(df_val)) 
mask_dom1 = df_val['domain'] == 'domain1'
print(mask_dom1)
mask_dom2 = ~mask_dom1
# probs that the data from domain 1 is human-written
probs[mask_dom1] = pipe_dom1.predict_proba(df_val.loc[mask_dom1, 'text_str'])[:, 0]
# probs that the data from domain 2 is human-written
probs[mask_dom2] = pipe_dom2.predict_proba(df_val.loc[mask_dom2, 'text_str'])[:, 0]

# 2. Apply threshold
# when the threshold is too high such as 0.75 and 0.80, even though the precision might be high like 98%\
# but the accuracy will be low, because of the overfitting. Therfore, the threshold should be set to 0.40 or 0.35
threshold = 0.30
preds_ensem = np.where(probs >= threshold, 0, 1)


# 3. Evaluate
print(f"Ensemble Validation Accuracy (threshold = {threshold:.2f}):",
      accuracy_score(y_val, preds_ensem))
print("\nClassification Report:")
print(classification_report(y_val, preds_ensem, digits=4))

4929    False
958      True
1243    False
1766    False
2509    False
        ...  
5071    False
2746    False
5895    False
2826    False
1669    False
Name: domain, Length: 1200, dtype: bool
Ensemble Validation Accuracy (threshold = 0.30): 0.9825

Classification Report:
              precision    recall  f1-score   support

           0     0.9708    0.8867    0.9268       150
           1     0.9840    0.9962    0.9901      1050

    accuracy                         0.9825      1200
   macro avg     0.9774    0.9414    0.9584      1200
weighted avg     0.9824    0.9825    0.9822      1200



#### Check for overfitting

In [1643]:
# Compute “human” probabilities on the training split
probs_train = np.zeros(len(df_train))
mask_tr_dom1 = df_train['domain'] == 'domain1'
mask_tr_dom2 = ~mask_tr_dom1

probs_train[mask_tr_dom1] = pipe_dom1.predict_proba(df_train.loc[mask_tr_dom1, 'text_str'])[:, 0]
probs_train[mask_tr_dom2] = pipe_dom2.predict_proba(df_train.loc[mask_tr_dom2, 'text_str'])[:, 0]

threshold = 0.30
preds_train = np.where(probs_train >= threshold, 0, 1)

# Evaluate
print(f"Ensemble Training Accuracy (thr={threshold:.2f}):",
      accuracy_score(df_train['label'], preds_train))
print("\nTraining Classification Report:")
print(classification_report(df_train['label'], preds_train, digits=4))

Ensemble Training Accuracy (thr=0.30): 0.9997916666666666

Training Classification Report:
              precision    recall  f1-score   support

           0     0.9983    1.0000    0.9992       600
           1     1.0000    0.9998    0.9999      4200

    accuracy                         0.9998      4800
   macro avg     0.9992    0.9999    0.9995      4800
weighted avg     0.9998    0.9998    0.9998      4800



In [1644]:
# Load & prepare test
df_test = pd.read_json('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', lines=True)
df_test['text_str'] = df_test['text'].apply(lambda seq: ' '.join(map(str, seq)))

# Get human‐class probs from both domain‐expert models
p1 = pipe_dom1.predict_proba(df_test['text_str'])[:,0]
p2 = pipe_dom2.predict_proba(df_test['text_str'])[:,0]

probs_test = (p1 + p2) / 2

threshold_final = threshold
preds_test = np.where(probs_test >= threshold, 0, 1)

submission = pd.DataFrame({'id': df_test['id'], 'class': preds_test})
submission.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/ensemble_submission_tfidf2.csv', index=False)
submission.head(15)

Unnamed: 0,id,class
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,0
9,9,1


## Method of Random Forest Model 

In [1645]:
import json
import numpy as np
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

#### Load domain 1 data and split the training and testing sets

In [1646]:
# read the json file for domain 1
d1_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', 'r') as f:
    for line in f:
        d1_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
d1_df = pd.DataFrame(d1_data)
print(f"domain 1 shappe: {d1_df.shape}")

d1_x, d1_y = d1_df['text'], d1_df['label']
d1_x_training, d1_x_testing, d1_y_training, d1_y_testing = train_test_split(d1_x, d1_y, test_size=0.2, random_state=24)

# show the number of label 0 and 1 overall
print(f"domain 1 number of label 0 and 1: {d1_y.value_counts()[0]}, {d1_y.value_counts()[1]}")

domain 1 shappe: (1000, 3)
domain 1 number of label 0 and 1: 500, 500


#### Load domain 2 data and split the training and testing sets

In [1647]:
# read the json file for domain 2
d2_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', 'r') as f:
    for line in f:
        d2_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
d2_df = pd.DataFrame(d2_data)
print(f"domain 2 shappe: {d2_df.shape}")

d2_x, d2_y = d2_df['text'], d2_df['label']
d2_x_training, d2_x_testing, d2_y_training, d2_y_testing = train_test_split(d2_x, d2_y, test_size=0.2, random_state=24)

# show the number of label 0 and 1 overall
print(f"domain 2 number of label 0 and 1: {d2_y.value_counts()[0]}, {d2_y.value_counts()[1]}")

domain 2 shappe: (5000, 3)
domain 2 number of label 0 and 1: 250, 4750


#### Load the test data

In [1648]:
# load test data 
test_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
test_df = pd.DataFrame(test_data)
print(f"test data shappe: {test_df.shape}")
test_df.head()

test_texts = []
test_ids = []

# convert the test data to string
for text in test_df['text']:
    str_text = str(text)
    transfomed_str_text = str_text.replace(",", " ").replace("]", "").replace("[", "")
    test_texts.append(transfomed_str_text)

for id in test_df['id']:
    test_ids.append(id)

test data shappe: (4000, 2)


#### Vectorise the X (text) of traning and testing sets from domain 1 and 2, as well as the X (text) from test file

In [1649]:
total_x_training = pd.concat([d1_x_training, d2_x_training], ignore_index=True)
total_y_training = pd.concat([d1_y_training, d2_y_training], ignore_index=True)

#  Apply CountVectorizer
vectorizer = CountVectorizer()

# Now the total_X_converted is a list of list of int,\
# converted it to as list of string in order to apply the CountVectorizer
total_x_training_str = []
for text in total_x_training:
    text_str = ' '.join(map(str,text))
    total_x_training_str.append(text_str)

total_x_training_str_vec = vectorizer.fit_transform(total_x_training_str)


# vectorize the training data from domain 1  
d1_x_training_str = []
for text in d1_x_training:
    text_str = ' '.join(map(str,text))
    d1_x_training_str.append(text_str)

d1_x_training_str_vec = vectorizer.transform(d1_x_training_str)


# vectorize the training data from domain 2
d2_x_training_str = []
for text in d2_x_training:
    text_str = ' '.join(map(str,text))
    d2_x_training_str.append(text_str)

d2_x_training_str_vec = vectorizer.transform(d2_x_training_str)


# vectorize the testing data from domain 1 
d1_x_testing_str = []
for text in d1_x_testing:
    text_str = ' '.join(map(str,text))
    d1_x_testing_str.append(text_str)

d1_x_testing_str_vec = vectorizer.transform(d1_x_testing_str)


# vectorize the testing data from domain 2
d2_x_testing_str = []
for text in d2_x_testing:
    text_str = ' '.join(map(str,text))
    d2_x_testing_str.append(text_str)

d2_x_testing_str_vec = vectorizer.transform(d2_x_testing_str)

# vectorize the data from test file
test_texts_vec = []
for text in test_texts:
    vec_text = vectorizer.transform([text])
    test_texts_vec.append(vec_text)

    

#### Address and solve the class imbalance problem suing SMOTE

In [1650]:
# fix the issue of imbalanced data by applying SMOTE to domain 2
smote = SMOTE(random_state=24)
d2_x_training_smote, d2_y_training_smote = smote.fit_resample(d2_x_training_str_vec, d2_y_training)

# create marks as 1 if the data comes domain 1 and 2 if the data comes forom domain 2
y_total = [1] * len(d1_x_training) + [2] * len(d2_x_training)
x_total = pd.concat([d1_x_training, d2_x_training], ignore_index=True)

# fit the vectorizer to x_total before transforming
x_total_str = []
for text in x_total:
    text_str = ' '.join(map(str,text))
    x_total_str.append(text_str)
    
# vectorize the X training data from domain 1 and domain 2
x_total_transformed = vectorizer.transform(x_total_str)

# fix the issue of imbalanced data after domain
smote_domain = SMOTE(random_state=24)
x_total_transformed_smote, y_total_smote = smote_domain.fit_resample(x_total_transformed, y_total)


#### Hyperparemater tuning for Random Forrest classifier

In [1651]:
# define paramter grids for GridSearchCV
para_grid_d1 = {
    'n_estimators': randint(50, 201), # more tree, more generalization capacity
    'max_depth': randint(5,21), # higher values have more overfitting risk
    'min_samples_split': randint(1, 11), # higher values reduce overfitting risk 
    'min_samples_leaf': randint(1, 11), # higher values lead to more generalization capacity
}

para_grid_d2 = {
    'n_estimators': randint(50, 151), # more tree, more generalization capacity
    'max_depth': randint(5, 11), # higher values have more overfitting risk
    'min_samples_split': randint(1, 16), # higher values reduce overfitting risk 
    'min_samples_leaf': randint(1, 16), # higher values lead to more generalization capacity
}

para_grid_domain = {
    'n_estimators': randint(50, 201), # more tree, more generalization capacity
    'max_depth': randint(1,21), # higher values have more overfitting risk
    'min_samples_split': randint(1, 11), # higher values reduce overfitting risk 
    'min_samples_leaf': randint(1, 11), # higher values lead to more generalization capacity
}

# define the RandomForestClassifier for domain 1 for classifying 1 or 0 in domain 1
rf_d1 = RandomForestClassifier(random_state=24)
# Search best parameters for domain 1
# gs_d1 = GridSearchCV(estimator = rf_d1, param_grid = para_grid_d1, cv = 5, n_jobs = -1)
gs_d1 = RandomizedSearchCV(estimator = rf_d1, param_distributions = para_grid_d1, n_iter=10, cv=3, n_jobs=-1)
gs_d1.fit(d1_x_training_str_vec, d1_y_training)
d1_best_params = gs_d1.best_params_
print(f"The best fitting parameters for domain 1: {d1_best_params}")


# define the RandomForestClassifier for classifying 1 or 0 in domain 2
rf_d2 = RandomForestClassifier(random_state=24)
# Search best paramters for domain 2
# gs_d2 = GridSearchCV(estimator = rf_d2, param_grid = para_grid_d2, cv = 5, n_jobs = -1)
gs_d2 = RandomizedSearchCV(estimator = rf_d2, param_distributions = para_grid_d2, n_iter=10, cv=3, n_jobs=-1)
gs_d2.fit(d2_x_training_smote, d2_y_training_smote)
d2_best_params = gs_d2.best_params_
print(f"The best fitting parameters for domain 2: {d2_best_params}")


# define the RandomForestClassifier for classifying domain 1 or domain 2
rf_domain = RandomForestClassifier(random_state=24)
# Search best parameters for domain classification 
# gs_domain = GridSearchCV(estimator = rf_domain, param_grid = para_grid_domain, cv = 5, n_jobs = -1)
gs_domain = RandomizedSearchCV(estimator = rf_domain, param_distributions = para_grid_domain, n_iter=10, cv=3, n_jobs=-1)
domain_best_params = gs_domain.fit(x_total_transformed_smote, y_total_smote)
print(f"The best fitting parameters for the domain classifier: {domain_best_params.best_params_}")


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

The best fitting parameters for domain 1: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 88}


3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

The best fitting parameters for domain 2: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 116}


3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

The best fitting parameters for the domain classifier: {'max_depth': 8, 'min_samples_leaf': 7, 'min_samples_split': 6, 'n_estimators': 123}


#### Evaluate the trained model performance on the testing data

In [1652]:
# evaluate the trained model performance for domain 1
print("The performance of domain 1 model: \n")
d1_y_pred = gs_d1.predict(d1_x_testing_str_vec)
evaluate_report_d1 = classification_report(d1_y_testing, d1_y_pred)
print(evaluate_report_d1)

print("The performance of domain 2 model: \n")
# evaluate the trained model performance for domain 2
d2_y_pred = gs_d2.predict(d2_x_testing_str_vec)
evaluate_report_d2 = classification_report(d2_y_testing, d2_y_pred)
print(evaluate_report_d2)

print("The performance of domain classifier: \n")
# evalue the the trained model performance for domain classification
domain_y_pred = gs_domain.predict(x_total_transformed_smote)
evaluate_report_domain = classification_report(y_total_smote, domain_y_pred)
print(evaluate_report_domain)


The performance of domain 1 model: 

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        90
           1       0.90      0.86      0.88       110

    accuracy                           0.88       200
   macro avg       0.87      0.88      0.87       200
weighted avg       0.88      0.88      0.88       200

The performance of domain 2 model: 

              precision    recall  f1-score   support

           0       0.44      0.64      0.52        50
           1       0.98      0.96      0.97       950

    accuracy                           0.94      1000
   macro avg       0.71      0.80      0.74      1000
weighted avg       0.95      0.94      0.95      1000

The performance of domain classifier: 

              precision    recall  f1-score   support

           1       0.94      0.97      0.95      4000
           2       0.97      0.93      0.95      4000

    accuracy                           0.95      8000
   macro avg  

#### Use the trained model to predict the test data


In [1653]:
y_pred_final = []
for text in test_texts:
    # run the domain classifier to distinguish the data from domain 1 or domain 2
    text_vec = vectorizer.transform([text])
    preicted_domain =  gs_domain.predict(text_vec)[0]
    # if data from domain 1
    if preicted_domain == 1:
        # run the domain1 classifier to dishguish 1 or 0 within domain 1
        predicted_label_d1 = gs_d1.predict(text_vec)[0]
        y_pred_final.append(predicted_label_d1)
    # if data from domain 2
    if preicted_domain == 2:
        # run the domain2 classifier to dishguish 1 or 0 within domain 2
        predicted_label_d2 = gs_d2.predict(text_vec)[0]
        y_pred_final.append(predicted_label_d2)

# convert each predicted y to integer
y_predicted_int = []
for i in y_pred_final:
    if i == '0':
        y_predicted_int.append(0)
    else:
        y_predicted_int.append(1)

#### Export output as a csv file

In [1654]:
print(y_pred_final[:10])
print(y_predicted_int[:10])
print(f"length of y_predicted_int: {len(y_predicted_int)}")
print(f"length of test_ids: {len(test_ids)}")

[1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
length of y_predicted_int: 4000
length of test_ids: 4000


In [1655]:
# contruct the output file
final_output = pd.DataFrame({'id': test_ids, 'label': y_pred_final})
# export to csv file
final_output.to_csv('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/rf_output3.csv', index=False)

#### Accuracy rate is 0.7160