In [1]:
import numpy as np
import pandas as pd
import os
import textwrap
import re 
import sklearn.linear_model
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
from sklearn.metrics import make_scorer, roc_auc_score ,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder

In [2]:
data_dir = 'data_readinglevel'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
y_train_df['stage_encoded'] = y_train_df['Coarse Label'].map({'Key Stage 2-3': 0, 'Key Stage 4-5': 1})
y_train_clean = y_train_df['stage_encoded'].values
N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N, n_cols))
print("Shape of y_train_df: %s" % str(y_train_df.shape))
print(x_train_df.columns)
def load_arr_from_npz(npz_path):
    ''' Load array from npz compressed file given path

    Returns
    -------
    arr : numpy ndarray
    '''
    npz_file_obj = np.load(npz_path)
    arr = npz_file_obj.f.arr_0.copy() # Rely on default name from np.savez
    npz_file_obj.close()
    return arr
xBERT_train_NH = load_arr_from_npz(os.path.join(
        data_dir, 'x_train_BERT_embeddings.npz'))
len(xBERT_train_NH) == len(x_train_df)

Shape of x_train_df: (5557, 32)
Shape of y_train_df: (5557, 6)
Index(['author', 'title', 'passage_id', 'text', 'char_count', 'word_count',
       'sentence_count', 'avg_word_length', 'avg_sentence_length',
       'type_token_ratio', 'pronoun_freq', 'function_words_count',
       'punctuation_frequency', 'sentiment_polarity', 'sentiment_subjectivity',
       'readability_Kincaid', 'readability_ARI', 'readability_Coleman-Liau',
       'readability_FleschReadingEase', 'readability_GunningFogIndex',
       'readability_LIX', 'readability_SMOGIndex', 'readability_RIX',
       'readability_DaleChallIndex', 'info_characters_per_word',
       'info_syll_per_word', 'info_words_per_sentence',
       'info_type_token_ratio', 'info_characters', 'info_syllables',
       'info_words', 'info_wordtypes'],
      dtype='object')


True

In [None]:
tr_text_list = x_train_df['text'].values
def custom_tokenizer(text):
    return re.findall(r"\w+|[^\w\s]", text)
random_state=1543
x_train_df = x_train_df.drop(['author', 'title', 'passage_id','text'], axis=1)
y_train_clean = np.array(y_train_clean)


In [3]:
#tr_text_list = x_train_df['text'].values

def custom_tokenizer(text):
    return re.findall(r"\w+|[^\w\s]", text)

random_state=1543

#x_train_df = x_train_df.drop(['author', 'title', 'passage_id','text'], axis=1)
# x_train_df = pd.concat([
#     pd.DataFrame(xBERT_train_NH, columns=['bert']),
#     x_train_df
# ], axis=1)


# Identify categorical columns
categorical_cols = ['author', 'title']
print(x_train_df.columns)
# One-Hot Encode 'author' and 'title'
categorical_data = x_train_df.loc[:, categorical_cols]  # Explicitly select columns as DataFrame

# One-Hot Encode 'author' and 'title'
encoder = OneHotEncoder(handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(categorical_data)
one_hot_encoded = one_hot_encoded.toarray()

print(type(one_hot_encoded))
print(f"One-hot encoded shape: {one_hot_encoded.shape}")


x_train_df = x_train_df.drop(columns=['author','title','passage_id','text'])

# Convert BERT embeddings to DataFrame
bert_df = pd.DataFrame(xBERT_train_NH, columns=[f'bert_{i}' for i in range(xBERT_train_NH.shape[1])])
bert_array = bert_df.to_numpy()
print(f"Bert array has shape: {bert_array.shape }")
print(f"x_train array has shape: {x_train_df.to_numpy().shape }")

# Concatenate all features
X_train_final_np = np.hstack((one_hot_encoded,x_train_df.to_numpy(),bert_array))

# Print final shape
print(f"Final feature matrix shape: {X_train_final_np.shape}")

y_train_clean = np.array(y_train_clean)

Index(['author', 'title', 'passage_id', 'text', 'char_count', 'word_count',
       'sentence_count', 'avg_word_length', 'avg_sentence_length',
       'type_token_ratio', 'pronoun_freq', 'function_words_count',
       'punctuation_frequency', 'sentiment_polarity', 'sentiment_subjectivity',
       'readability_Kincaid', 'readability_ARI', 'readability_Coleman-Liau',
       'readability_FleschReadingEase', 'readability_GunningFogIndex',
       'readability_LIX', 'readability_SMOGIndex', 'readability_RIX',
       'readability_DaleChallIndex', 'info_characters_per_word',
       'info_syll_per_word', 'info_words_per_sentence',
       'info_type_token_ratio', 'info_characters', 'info_syllables',
       'info_words', 'info_wordtypes'],
      dtype='object')
<class 'numpy.ndarray'>
One-hot encoded shape: (5557, 439)
Bert array has shape: (5557, 768)
x_train array has shape: (5557, 28)
Final feature matrix shape: (5557, 1235)


In [4]:
x_train_df = X_train_final_np

In [5]:

numeric_transformer = Pipeline([
    ('passthrough', FunctionTransformer(validate=False)) 
])


text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=1,
        max_df=1,
        ngram_range=(1,1),
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        lowercase=False)),
    ('scaler', MaxAbsScaler())
])


preprocessor = ColumnTransformer([
    ('numeric', numeric_transformer, x_train_df.columns.tolist()),
    ('text', text_transformer, 'text') 
], remainder='drop')

full_pipeline = Pipeline([
    ('prep', preprocessor),
    ('class', MLPClassifier(
        max_iter=200,
        early_stopping=True,
        learning_rate='adaptive',
        solver='sgd',
        random_state=random_state))
])


my_parameter_grid_by_name = {
    'prep__text__tfidf__min_df': [1,2,5,10,30,50] ,
    'prep__text__tfidf__max_df': [0.7,0.75, 0.8,0.85]  ,
    'prep__text__tfidf__ngram_range': [(1,1), (1,2)],
    'prep__text__tfidf__stop_words': [None, 'english'],
    'prep__text__tfidf__token_pattern': [
        r'(?u)\b[\w-]+\b',
        r'(?u)\b\w\w+\b',
        r'(?u)\b(?!\d+\b)\w+\b'],
    'prep__text__tfidf__lowercase': [False, True],
    'class__hidden_layer_sizes': [(64,), (128,)],
    'class__activation': ['relu', 'tanh','logistic'],
    'class__alpha': [0.001, 0.01],
    'class__learning_rate_init': [0.001, 0.01],
    'class__batch_size': [32, 64]
}

my_scoring_metric_name = 'roc_auc'
X_combined = pd.concat([
    pd.DataFrame(tr_text_list, columns=['text']),
    x_train_df
], axis=1)

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined, y_train_clean, test_size=0.1, random_state=1543, stratify=y_train_clean)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:

n_iter = [50]
# valid_indicators_N = np.zeros(N)
# valid_indicators_N[valid_ids] = -1
# my_splitter = sklearn.model_selection.PredefinedSplit(valid_indicators_N)
y_train_clean = list(y_train_clean)
my_best_models = []
best_test_scores = []
for iter in n_iter:
    random_searcher = RandomizedSearchCV(
    full_pipeline,
    my_parameter_grid_by_name,
    scoring=my_scoring_metric_name ,
    cv=sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
    refit=True,
    n_iter=iter,  
    random_state=random_state, 
    return_train_score=True 
    )
    random_searcher.fit(X_train_val, y_train_val)
    # Get the best-trained model (automatically refit on full training data)
    best_model = random_searcher.best_estimator_
    my_best_models.append(best_model)

    # Best hyperparameters
    best_params = random_searcher.best_params_
    print("Best Hyperparameters:", best_params)

    # Best cross-validation score (average accuracy across folds)
    best_score = random_searcher.best_score_
    print(f"Best CV AUROC: for {iter} iterations is {best_score:.4f}")

    #use best model to get predictions on test data
    y_test_probs = best_model.predict_proba(X_test)[:, 1]  

    # Compute ROC AUC score
    roc_auc = roc_auc_score(y_test, y_test_probs)
    best_test_scores.append(roc_auc)
    predicted_labels = np.where(y_test_probs >= 0.5, 1, 0)
    cm = confusion_matrix(y_test,predicted_labels )
    cm_df = pd.DataFrame(cm, 
                     index=['Actual 0', 'Actual 1'], 
                     columns=['Predicted 0', 'Predicted 1'])

    print(cm_df)
    print(f"Test Set ROC AUC Score for {iter}:", roc_auc)



In [None]:
print(n_iter[np.argmax(best_test_scores)])
best_model_overall = my_best_models[np.argmax(best_test_scores)]
print(best_model_overall.get_params())
#print(y_test.shape)

overall_train_roc = roc_auc_score(y_train_clean,best_model_overall.predict_proba(tr_text_list)[:,1])

print(overall_train_roc)
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))
tr_test_list = x_test_df['text'].values.tolist()
#print(tr_test_list[:5])
yproba_N2 = best_model_overall.predict_proba(tr_test_list)
print(yproba_N2[:5])
y_proba_N1 = yproba_N2[:,1]
print(y_proba_N1[:5])
print(y_proba_N1.shape)
np.savetxt("yproba1_test.txt", y_proba_N1, fmt="%.6f")
loaded_probs = np.loadtxt('yproba1_test.txt')
print(loaded_probs.shape)

In [None]:
import pandas as pd


results_df = pd.DataFrame(random_searcher.cv_results_)

param_columns = [col for col in results_df.columns if col.startswith("param_")]
for col in param_columns:

    results_df[col] = results_df[col].apply(
        lambda x: 'None' if x is None else str(x) 
    )

results_df = results_df[param_columns + ["mean_test_score"]]
print(results_df)