In [120]:
import json
import numpy as np
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

## Load domain 1 data and split the training and testing sets

In [121]:
# read the json file for domain 1
d1_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', 'r') as f:
    for line in f:
        d1_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
d1_df = pd.DataFrame(d1_data)
print(f"domain 1 shappe: {d1_df.shape}")

d1_x, d1_y = d1_df['text'], d1_df['label']
d1_x_training, d1_x_testing, d1_y_training, d1_y_testing = train_test_split(d1_x, d1_y, test_size=0.2, random_state=24)

# show the number of label 0 and 1 overall
print(f"domain 1 number of label 0 and 1: {d1_y.value_counts()[0]}, {d1_y.value_counts()[1]}")


domain 1 shappe: (1000, 3)
domain 1 number of label 0 and 1: 500, 500


## Load domain 2 data and split the training and testing sets

In [122]:
# read the json file for domain 2
d2_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', 'r') as f:
    for line in f:
        d2_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
d2_df = pd.DataFrame(d2_data)
print(f"domain 2 shappe: {d2_df.shape}")

d2_x, d2_y = d2_df['text'], d2_df['label']
d2_x_training, d2_x_testing, d2_y_training, d2_y_testing = train_test_split(d2_x, d2_y, test_size=0.2, random_state=24)

# show the number of label 0 and 1 overall
print(f"domain 2 number of label 0 and 1: {d2_y.value_counts()[0]}, {d2_y.value_counts()[1]}")


domain 2 shappe: (5000, 3)
domain 2 number of label 0 and 1: 250, 4750


## Load the test data

In [123]:
# load test data 
test_data = []
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
# convert the data to datareame and show the first 5 records
test_df = pd.DataFrame(test_data)
print(f"test data shappe: {test_df.shape}")
test_df.head()

test data shappe: (4000, 2)


Unnamed: 0,text,id
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0
1,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1
2,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2
3,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3
4,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4


## Vectorise the total X (text) as well as the X (text) of traning and testing sets from domain 1 and 2

In [124]:
total_x_training = pd.concat([d1_x_training, d2_x_training], ignore_index=True)
total_y_training = pd.concat([d1_y_training, d2_y_training], ignore_index=True)

#  Apply CountVectorizer
vectorizer = CountVectorizer()
# Now the total_X_converted is a list of list of int,\
# converted it to as list of string in order to apply the CountVectorizer
total_x_training_str = []
for text in total_x_training:
    text_str = ' '.join(map(str,text))
    total_x_training_str.append(text_str)

total_x_training_str_vec = vectorizer.fit_transform(total_x_training_str)


# vectorize the training data from domain 1  
d1_x_training_str = []
for text in d1_x_training:
    text_str = ' '.join(map(str,text))
    d1_x_training_str.append(text_str)

d1_x_training_str_vec = vectorizer.transform(d1_x_training_str)


# vectorize the training data from domain 2
d2_x_training_str = []
for text in d2_x_training:
    text_str = ' '.join(map(str,text))
    d2_x_training_str.append(text_str)

d2_x_training_str_vec = vectorizer.transform(d2_x_training_str)


# vectorize the testing data from domain 1 
d1_x_testing_str = []
for text in d1_x_testing:
    text_str = ' '.join(map(str,text))
    d1_x_testing_str.append(text_str)

d1_x_testing_str_vec = vectorizer.transform(d1_x_testing_str)


# vectorize the testing data from domain 2
d2_x_testing_str = []
for text in d2_x_testing:
    text_str = ' '.join(map(str,text))
    d2_x_testing_str.append(text_str)

d2_x_testing_str_vec = vectorizer.transform(d2_x_testing_str)
    

## Address and solve the class imbalance problem

In [125]:
# fix the issue of imbalanced data by applying SMOTE to domain 2
smote = SMOTE(random_state=24)
d2_x_training_smote, d2_y_training_smote = smote.fit_resample(d2_x_training_str_vec, d2_y_training)

# create marks as 1 if the data comes domain 1 and 2 if the data comes forom domain 2
y_total = [1] * len(d1_x_training) + [2] * len(d2_x_training)
x_total = pd.concat([d1_x_training, d2_x_training], ignore_index=True)

# fit the vectorizer to x_total before transforming
x_total_str = []
for text in x_total:
    text_str = ' '.join(map(str,text))
    x_total_str.append(text_str)
    
# vectorize the X training data from domain 1 and domain 2
x_total_transformed = vectorizer.transform(x_total_str)

# fix the issue of imbalanced data after domain
smote_domain = SMOTE(random_state=24)
total_X_converted_vec_smote, y_total_smmote = smote_domain.fit_resample(x_total_transformed, y_total)


## Paremater tuning for Random Forrest classifier

In [126]:
# define a paramter grid for RandomizedSearchCV
para_grid = {
    'n_estimators': [50, 100, 150], # more tree, more generalization capacity
    'max_depth': [None, 10, 20, 30], # higher values have more overfitting risk
    'min_samples_split': [2, 5, 10], # higher values reduce overfitting risk 
    'min_samples_leaf': [1, 2, 4], # higher values lead to more generalization capacity
}

# define the RandomForestClassifier for domain 1
rf_d1 = RandomForestClassifier(random_state=24)
# grid search for domain 1
gs_d1 = GridSearchCV(estimator=rf_d1, param_grid=para_grid, cv=5, n_jobs=-1)
gs_d1.fit(d1_x_training_str_vec, d1_y_training)
d1_best_params = gs_d1.best_params_
print(f"The best fitting parameters for domain 1: {d1_best_params}")


# define the RandomForestClassifier for domain 2
rf_d2 = RandomForestClassifier(random_state=24)
# grid search for domain 2, the grid search for domain 2 need to be the data after SMOTE
gs_d2 = GridSearchCV(estimator=rf_d2, param_grid=para_grid, cv=5, n_jobs=-1)
gs_d2.fit(d2_x_training_smote, d2_y_training_smote)
d2_best_params = gs_d2.best_params_
print(f"The best fitting parameters for domain 2: {d2_best_params}")


# define the RandomForestClassifier for domain 1 and 2
rf_domain = RandomForestClassifier(random_state=24)




The best fitting parameters for domain 1: {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
The best fitting parameters for domain 2: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
