In [1]:
import numpy as np
import pandas as pd
import os
import textwrap
import re 
import sklearn.linear_model
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
from sklearn.metrics import make_scorer, roc_auc_score ,confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping
from skorch.dataset import ValidSplit
import sklearn.pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [12]:
data_dir = 'data_readinglevel'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
y_train_df['stage_encoded'] = y_train_df['Coarse Label'].map({'Key Stage 2-3': 0, 'Key Stage 4-5': 1})
y_train_clean = y_train_df['stage_encoded'].values
N, n_cols = x_train_df.shape
#print("Shape of x_train_df: (%d, %d)" % (N, n_cols))
#print("Shape of y_train_df: %s" % str(y_train_df.shape))
#print(x_train_df.columns)

def load_arr_from_npz(npz_path):
    ''' Load array from npz compressed file given path

    Returns
    -------
    arr : numpy ndarray
    '''
    npz_file_obj = np.load(npz_path)
    arr = npz_file_obj.f.arr_0.copy() # Rely on default name from np.savez
    npz_file_obj.close()
    return arr

xBERT_train_NH = load_arr_from_npz(os.path.join(
        data_dir, 'x_train_BERT_embeddings.npz'))

assert len(xBERT_train_NH) == len(x_train_df)

In [13]:
#tr_text_list = x_train_df['text'].values

def custom_tokenizer(text):
    return re.findall(r"\w+|[^\w\s]", text)


# Set random seeds for reproducibility
random_state = 1543


# Identify categorical columns
# categorical_cols = ['author', 'title']
# print(x_train_df.columns)
# # One-Hot Encode 'author' and 'title'
# categorical_data = x_train_df.loc[:, categorical_cols]  # Explicitly select columns as DataFrame
# # One-Hot Encode 'author' and 'title'
# encoder = OneHotEncoder(handle_unknown='ignore')
# one_hot_encoded = encoder.fit_transform(categorical_data)
# one_hot_encoded = one_hot_encoded.toarray()

# print(type(one_hot_encoded))
# print(f"One-hot encoded shape: {one_hot_encoded.shape}")

x_train_df = x_train_df.drop(columns=['author','title','passage_id'])
other_cols = x_train_df.columns.drop('text').tolist()
# Convert BERT embeddings to DataFrame
# bert_df = pd.DataFrame(xBERT_train_NH, columns=[f'bert_{i}' for i in range(xBERT_train_NH.shape[1])])
# bert_array = bert_df.to_numpy()
# print(f"Bert array has shape: {bert_array.shape }")
# print(f"x_train array has shape: {x_train_df.to_numpy().shape }")

# # Concatenate all features
# X_train_final_np = np.hstack((x_train_df.to_numpy(),bert_array))
#X_train_final_np = np.hstack((one_hot_encoded,x_train_df.to_numpy(),bert_array))


y_train_clean = np.array(y_train_clean)
X_train_val, X_test, y_train_val, y_test = train_test_split(x_train_df, y_train_clean, test_size=0.1, random_state=1543, stratify=y_train_clean)


In [20]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_units, activation='relu'):
        super().__init__()
        self.layers = nn.ModuleList()
        prev_units = input_size
        activation_class = {
            'relu': nn.ReLU,
            'tanh': nn.Tanh,
            'logistic': nn.Sigmoid
        }[activation]
        for units in hidden_units:
            self.layers.append(nn.Linear(prev_units, units))
            self.layers.append(activation_class())
            prev_units = units
        self.output_layer = nn.Linear(prev_units, 1)
    
    def forward(self, X):
        for layer in self.layers:
            X = layer(X)
        logits = self.output_layer(X)
        return logits.squeeze(-1) 
net = NeuralNetClassifier(
    module=MLP,
    module__input_size=x_train_df.shape[1],
    module__hidden_units=(64,),
    module__activation='relu',
    optimizer=optim.Adam,
    criterion=nn.BCEWithLogitsLoss,
    max_epochs=200,
    batch_size=32,
    callbacks=[
        EarlyStopping(patience=5)
    ],
    train_split=ValidSplit(0.1, stratified=True),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    verbose=1
)
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(
        min_df=1, 
        max_df=1,
        ngram_range=(1,1),
        stop_words=None,
        token_pattern=r'(?u)\b[\w-]+\b',
        lowercase=False,
        norm='l2'
    ), 'text'),
    ('numerical', 'passthrough', other_cols)
], sparse_threshold=0)

# 3. Full pipeline with BERT concatenation
full_pipeline = sklearn.pipeline.Pipeline([
    ('preprocessor', preprocessor),
    ('bert_merger', FunctionTransformer(
        lambda X: np.hstack((
            X.toarray() if hasattr(X, "toarray") else X,  # Convert sparse to dense
            xBERT_train_NH
        )), 
        validate=False)),
    ('scaler', StandardScaler()),
    ('net', NeuralNetClassifier(
        MLP,
        module__input_size=preprocessor.fit_transform(X_train_val).shape[1] + xBERT_train_NH.shape[1],
        module__hidden_units=(64,),
        module__activation='relu',
        optimizer=optim.Adam,
        criterion=nn.BCEWithLogitsLoss,
        max_epochs=200,
        batch_size=32,
        callbacks=[EarlyStopping(patience=5)],
        train_split=ValidSplit(0.1, stratified=True),
        device='cuda' if torch.cuda.is_available() else 'cpu',
        verbose=0
    ))
])

# 4. Updated parameter grid focusing on text features
param_grid = {
    'preprocessor__tfidf__min_df': [1, 2, 5],
    'preprocessor__tfidf__max_df': [0.7, 0.8],
    'preprocessor__tfidf__ngram_range': [(1,1), (1,2)],
    'preprocessor__tfidf__token_pattern': [
        r'(?u)\b\w{2,}\b',
        r'(?u)\b[a-zA-Z]+\b'
    ],
    'preprocessor__tfidf__lowercase': [True, False],
    'net__module__hidden_units': [(64,), (128,), (128, 64)],
    'net__optimizer__lr': [0.001, 0.0005],
    'net__module__activation': ['relu', 'tanh', 'logistic'],
    'net__optimizer__weight_decay': [1e-4, 0.001, 0.01],
    'net__batch_size': [32, 64, 128],
}


In [21]:
# 5. Execute search
random_search = RandomizedSearchCV(
    full_pipeline,
    param_distributions=param_grid,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    verbose=2,
    random_state=random_state,
    n_jobs=1
)

# Convert data to float32 for PyTorch


random_search.fit(X_train_val, y_train_val)
best_model = random_search.best_estimator_
X_test_float = X_test.astype(np.float32)

# Predict probabilities
y_test_probs = best_model.predict_proba(X_test_float)[:, 1]

# Calculate metrics
best_test_scores = roc_auc_score(y_test, y_test_probs)
predicted_labels = np.where(y_test_probs >= 0.5, 1, 0)
cm = confusion_matrix(y_test, predicted_labels)
cm_df = pd.DataFrame(cm, 
                     index=['Actual 0', 'Actual 1'], 
                     columns=['Predicted 0', 'Predicted 1'])
print(cm_df)
print(f"Test Set ROC AUC Score: {best_test_scores:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END net__batch_size=32, net__module__activation=relu, net__module__hidden_units=(64,), net__optimizer__lr=0.0005, net__optimizer__weight_decay=0.0001, preprocessor__tfidf__lowercase=True, preprocessor__tfidf__max_df=0.7, preprocessor__tfidf__min_df=5, preprocessor__tfidf__ngram_range=(1, 2), preprocessor__tfidf__token_pattern=(?u)\b[a-zA-Z]+\b; total time=   1.0s
[CV] END net__batch_size=32, net__module__activation=relu, net__module__hidden_units=(64,), net__optimizer__lr=0.0005, net__optimizer__weight_decay=0.0001, preprocessor__tfidf__lowercase=True, preprocessor__tfidf__max_df=0.7, preprocessor__tfidf__min_df=5, preprocessor__tfidf__ngram_range=(1, 2), preprocessor__tfidf__token_pattern=(?u)\b[a-zA-Z]+\b; total time=   0.9s
[CV] END net__batch_size=32, net__module__activation=relu, net__module__hidden_units=(64,), net__optimizer__lr=0.0005, net__optimizer__weight_decay=0.0001, preprocessor__tfidf__lowercase=True, pr

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 252, in transform
    out = self._transform(X, func=self.func, kw_args=self.kw_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 379, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ningn\AppData\Local\Temp\ipykernel_42616\1068932454.py", line 55, in <lambda>
    lambda X: np.hstack((
              ^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\numpy\core\shape_base.py", line 359, in hstack
    return _nx.concatenate(arrs, 1, dtype=dtype, casting=casting)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 4000 and the array at index 1 has size 5557

--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 252, in transform
    out = self._transform(X, func=self.func, kw_args=self.kw_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 379, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ningn\AppData\Local\Temp\ipykernel_42616\1068932454.py", line 55, in <lambda>
    lambda X: np.hstack((
              ^^^^^^^^^^^
  File "c:\Users\ningn\anaconda3\envs\cs135_25s_env\Lib\site-packages\numpy\core\shape_base.py", line 359, in hstack
    return _nx.concatenate(arrs, 1, dtype=dtype, casting=casting)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 4001 and the array at index 1 has size 5557


In [22]:
best_model_overall = random_search.best_estimator_
print(best_model_overall.get_params())

x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))
xBERT_test_NH = load_arr_from_npz(os.path.join(data_dir, 'x_test_BERT_embeddings.npz'))


categorical_data_test = x_test_df.loc[:, ['author', 'title']]
one_hot_encoded_test = encoder.transform(categorical_data_test).toarray()

x_test_clean = x_test_df.drop(columns=['author','title','passage_id','text']).to_numpy()

bert_test_df = pd.DataFrame(xBERT_test_NH, columns=[f'bert_{i}' for i in range(xBERT_test_NH.shape[1])])
X_test_final_np = np.hstack((
    one_hot_encoded_test,
    x_test_clean,
    bert_test_df.to_numpy()
)).astype(np.float32) 

yproba_N2 = best_model_overall.predict_proba(X_test_final_np)
print("Class probabilities (first 5):\n", yproba_N2[:5])

y_proba_N1 = yproba_N2[:, 1]
print("Class 1 probabilities (first 5):", y_proba_N1[:5])
print("Probability array shape:", y_proba_N1.shape)

np.savetxt("yproba1_test.txt", y_proba_N1, fmt="%.6f")
loaded_probs = np.loadtxt('yproba1_test.txt')
print("Loaded probabilities shape:", loaded_probs.shape)

{'module': <class '__main__.MLP'>, 'criterion': <class 'torch.nn.modules.loss.BCEWithLogitsLoss'>, 'optimizer': <class 'torch.optim.adam.Adam'>, 'lr': 0.01, 'max_epochs': 200, 'batch_size': 64, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.ValidSplit object at 0x000001AC5FEC78D0>, 'callbacks': [<skorch.callbacks.training.EarlyStopping object at 0x000001AC5F8B5C50>], 'predict_nonlinearity': 'auto', 'warm_start': False, 'verbose': 1, 'device': 'cpu', 'compile': False, 'use_caching': 'auto', 'torch_load_kwargs': None, '_params_to_validate': {'module__activation', 'module__hidden_units', 'optimizer__weight_decay', 'optimizer__lr', 'module__input_size'}, 'module__input_size': 1235, 'module__hidden_units': (64,), 'module__activation': 'tanh', 'classes': None, 'optimizer__weight_decay': 0.0001, 'optimizer__lr': 0.0005, 'callbacks

In [23]:
import pandas as pd


results_df = pd.DataFrame(random_search.cv_results_)

param_columns = [col for col in results_df.columns if col.startswith("param_")]
for col in param_columns:

    results_df[col] = results_df[col].apply(
        lambda x: 'None' if x is None else str(x) 
    )

results_df = results_df[param_columns + ["mean_test_score"]]
print(results_df)

   param_optimizer__weight_decay param_optimizer__lr  \
0                          0.001              0.0005   
1                         0.0001               0.001   
2                         0.0001               0.001   
3                         0.0001                0.01   
4                         0.0001              0.0005   
..                           ...                 ...   
95                         0.001                0.01   
96                         0.001              0.0005   
97                          0.01                0.01   
98                          0.01               0.001   
99                          0.01              0.0005   

   param_module__hidden_units param_module__activation param_batch_size  \
0              (256, 128, 64)                     relu               64   
1                       (64,)                 logistic              128   
2                  (128, 128)                     relu               64   
3                    (64, 3