In [7]:
import json
import numpy as np
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

## Load domain 1 data

In [8]:
# Load the domain 1 training set from a JSON Lines file (one JSON document per line).
# NOTE(review): this absolute path only resolves on the original author's machine;
# point it at your local copy of the data before re-running (ideally via a shared,
# configurable data directory).
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain1_train_data.json', 'r', encoding='utf-8') as f:
    # Decode explicitly as UTF-8 so parsing does not depend on the platform locale,
    # and skip whitespace-only lines (e.g. a trailing newline at end of file),
    # which json.loads would otherwise reject with a JSONDecodeError.
    d1_data = [json.loads(line) for line in f if line.strip()]
# Convert the records to a DataFrame, report its shape, and preview the first 5 rows
d1_df = pd.DataFrame(d1_data)
print(f"domain 1 shappe: {d1_df.shape}")
d1_df.head()


domain 1 shappe: (1000, 3)


Unnamed: 0,text,label,id
0,"[6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, ...",0,0
1,"[222, 31, 4108, 104, 132, 361, 39, 2305, 12, 9...",0,1
2,"[736, 7194, 113, 12, 366, 2870, 123, 101, 12, ...",0,2
3,"[48, 1, 2025, 69, 361, 533, 327, 237, 4150, 13...",0,3
4,"[2973, 66, 1, 1493, 260, 2740, 50, 1027, 50, 1...",0,4


## Load domain 2 data

In [9]:
# Load the domain 2 training set from a JSON Lines file (one JSON document per line).
# NOTE(review): this absolute path only resolves on the original author's machine;
# point it at your local copy of the data before re-running (ideally via a shared,
# configurable data directory).
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/domain2_train_data.json', 'r', encoding='utf-8') as f:
    # Decode explicitly as UTF-8 so parsing does not depend on the platform locale,
    # and skip whitespace-only lines (e.g. a trailing newline at end of file),
    # which json.loads would otherwise reject with a JSONDecodeError.
    d2_data = [json.loads(line) for line in f if line.strip()]
# Convert the records to a DataFrame, report its shape, and preview the first 5 rows
d2_df = pd.DataFrame(d2_data)
print(f"domain 2 shappe: {d2_df.shape}")
d2_df.head()

domain 2 shappe: (5000, 3)


Unnamed: 0,text,label,id
0,"[22, 6065, 76, 119, 13027, 575, 219, 22, 2435,...",0,0
1,"[1275, 1509, 12, 6113, 6287, 327, 411, 1139, 2...",0,1
2,"[575, 2962, 529, 4624, 39, 279, 1012, 277, 76,...",0,2
3,"[12, 6113, 2428, 69, 375, 1025, 2605, 76, 101,...",0,3
4,"[529, 76, 1509, 861, 1, 645, 1, 5013, 237, 3, ...",0,4


## Load the test data

In [10]:
# Load the test set from a JSON Lines file (one JSON document per line).
# NOTE(review): this absolute path only resolves on the original author's machine;
# point it at your local copy of the data before re-running (ideally via a shared,
# configurable data directory).
with open('/Users/zigeliang/Desktop/All/Data Science 2025S1/COMP90051/A2/comp-90051-2025-s-1-project-2/test_data.json', 'r', encoding='utf-8') as f:
    # Decode explicitly as UTF-8 so parsing does not depend on the platform locale,
    # and skip whitespace-only lines (e.g. a trailing newline at end of file),
    # which json.loads would otherwise reject with a JSONDecodeError.
    test_data = [json.loads(line) for line in f if line.strip()]
# Convert the records to a DataFrame, report its shape, and preview the first 5 rows
test_df = pd.DataFrame(test_data)
print(f"test data shappe: {test_df.shape}")
test_df.head()

test data shappe: (4000, 2)


Unnamed: 0,text,id
0,"[9159, 3048, 238, 276, 162, 286, 305, 22, 36, ...",0
1,"[64, 5039, 1275, 6, 0, 871, 139, 270, 327, 237...",1
2,"[327, 618, 76, 650, 121, 274, 1025, 0, 12207, ...",2
3,"[6, 12, 609, 11905, 4, 879, 677, 78, 13352, 60...",3
4,"[1, 5504, 55, 22, 101, 3783, 139, 2664, 4, 1, ...",4


## Combine the two domains into one DataFrame and convert its columns to Python lists

In [25]:
# Stack the domain 1 rows on top of the domain 2 rows, renumbering the index from 0
combined_df = pd.concat((d1_df, d2_df), ignore_index=True)

# Pull the 'text' and 'label' columns out as plain Python lists
# (one entry per row of the combined frame, in row order)
total_X_converted, total_Y_converted = (
    combined_df[column].tolist() for column in ('text', 'label')
)

# Report the combined size and how many rows carry each label value
print(f"combined data shappe: {combined_df.shape}")
print(combined_df['label'].value_counts())

# Preview the first 5 entries of the extracted texts ...
print(f"total_X_converted: {total_X_converted[:5]}")
# ... and the first 5 entries of the extracted labels
print(f"total_Y_converted: {total_Y_converted[:5]}")

    

combined data shappe: (6000, 3)
label
1    5250
0     750
Name: count, dtype: int64
total_X_converted: [[6, 22, 34, 76, 501, 977, 1, 2514, 13623, 76, 31, 2085, 277, 22, 238, 862, 931, 132, 305, 6, 22, 132, 882, 154, 13623, 66, 1, 6320, 34, 76, 501, 66, 1, 13019, 139, 863, 299, 34, 76, 199, 296, 337, 3077, 360, 498, 1338, 65, 734, 1308, 3212, 69, 1263, 13, 104, 132, 487, 31, 3, 98, 313, 66, 1, 2429, 13, 104, 2664, 4113, 101, 305, 334, 13, 4113, 101, 6337, 69, 104, 132, 487, 76, 31, 137], [222, 31, 4108, 104, 132, 361, 39, 2305, 12, 936, 1287, 66, 104, 2272, 123, 194, 1, 1287, 31, 64, 6311, 139, 104, 132, 361, 39, 1896, 861, 22, 1103, 123, 101, 2172, 1407, 375, 299, 354, 280, 92, 517, 280, 6255, 80, 76, 36, 2344, 4, 593, 98, 222, 31, 896, 6526, 69, 104, 46, 119, 3061, 48, 719, 34, 14242, 117, 123, 215, 104, 46, 39, 104, 132, 2284, 222, 31, 831, 1, 2470, 2075, 69, 132, 222], [736, 7194, 113, 12, 366, 2870, 123, 101, 12, 230, 403, 516, 147, 5549, 335, 1, 16988, 15911, 366, 2870, 3695, 4600