In [1]:
%run ./Preamble.ipynb

all_models = PM.check_for_existing_patterns("activations")
# Uncomment below lines to use specific model sets:
# all_models=PM.gpt2xl_models
# all_models=PM.gpt2_models
# all_models=['gpt2','gpt2-untrained_1','gpt2-untrained_1_weight_config_all']
print(all_models)

## load preprocessed data
ys = PM.load_ys(v=1) # , use_cache=False)
Xss = PM.load_Xss(all_models, v=0) # , use_cache=False)

['gpt2-xl', 'gpt2-xl-untrained_1', 'gpt2', 'gpt2-untrained_1', 'gpt2-untrained_2', 'gpt2-untrained_3', 'gpt2-untrained_4', 'gpt2-untrained_5', 'gpt2-untrained_6', 'gpt2-untrained_7', 'gpt2-untrained_8', 'gpt2-untrained_9', 'gpt2-untrained_1_weight_config_all', 'gpt2-untrained_2_weight_config_all', 'gpt2-untrained_3_weight_config_all', 'gpt2-untrained_4_weight_config_all', 'gpt2-untrained_5_weight_config_all', 'gpt2-untrained_6_weight_config_all', 'gpt2-untrained_7_weight_config_all', 'gpt2-untrained_8_weight_config_all', 'gpt2-untrained_9_weight_config_all']


loading mydatadict:   0%|          | 0/22 [00:00<?, ?it/s]

52   = number of possible targets
y shape: (7958,)
Size of ys: 0.004186046 GB

classification_labels: {
  "words": ['side' 'was' 'intertwined' ... 'four' 'years' '.'],
  "words-": ['.' 'side' 'was' ... ',' 'four' 'years'],
  "words+": ['was' 'intertwined' '.' ... 'years' '.' 'side'],
  "word_idx": [ 386  387  388 ... 8341 8342 8343],
  "word_idx-": [8343  386  387 ... 8340 8341 8342],
  "word_idx+": [ 387  388  389 ... 8342 8343  386],
  "predicate_lemmas": ['nan' 'be' 'intertwine' ... 'nan' 'nan' 'nan'],
  "predicate_lemmas-": ['nan' 'nan' 'be' ... 'nan' 'nan' 'nan'],
  "predicate_lemmas+": ['be' 'intertwine' 'nan' ... 'nan' 'nan' 'nan'],
  "predicate_framenet_ids": ['nan' '03' '01' ... 'nan' 'nan' 'nan'],
  "predicate_framenet_ids-": ['nan' 'nan' '03' ... 'nan' 'nan' 'nan'],
  "predicate_framenet_ids+": ['03' '01' 'nan' ... 'nan' 'nan' 'nan'],
  "word_senses": [nan nan nan ... nan nan nan],
  "word_senses-": [nan nan nan ... nan nan nan],
  "word_senses+": [nan nan nan ... nan nan na

loading models:   0%|          | 0/21 [00:00<?, ?it/s]

In [2]:
# Data adjustments

# Cap values for specific target variables (word_idx, tree_depth) to predefined limits
for y in ["word_idx", "word_idx+", "word_idx-"]:
    ys[y] = np.array([x if x <= 34 else 34 for x in ys[y]])  # Cap word index to 34

for y in ["tree_depth", "tree_depth+", "tree_depth-"]:
    ys[y] = np.array([x if x <= 14 else 14 for x in ys[y]])  # Cap tree depth to 14


## scikit-learn

In [3]:
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()  # Apply Intel optimizations to scikit-learn
# https://stackoverflow.com/a/6929403
# https://intel.github.io/scikit-learn-intelex/algorithms.html
# https://github.com/intel/scikit-learn-intelex

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict, cross_validate, KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix as calc_confusion_matrix, f1_score as calc_f1_score, accuracy_score as calc_accuracy_score, make_scorer
from scipy.stats import pearsonr

# Define Pearson correlation scorer for regression tasks
pearson_r_regression_scorer = make_scorer(lambda y1, y2: pearsonr(y1, y2)[0])

# Cross-validation strategies
cv_KFold = KFold(n_splits=5)
cv_StratifiedKFold = StratifiedKFold(n_splits=5)

# Helper function to iterate over models, layers, and targets
def get_input_coordinates(targets=None, models=None, model_layers_filter=None, **kwargs):
    """
    Yield (target, model, layer) combinations based on available models and targets.

    Args:
        targets (list): List of target variables to iterate over. Defaults to regression labels.
        models (list): List of models to iterate over. Defaults to all loaded models.
        model_layers_filter (list): Filter layers if only specific layers should be used.

    Yields:
        tuple: (target, model, layer) combinations.
    """
    global ys, Xss
    targets = targets if targets else globals()['reggression_labels']
    models = models if models else globals()['all_models']
    
    for target in targets:
        for model in models:
            model_layers = list(Xss[model])
            if model_layers_filter:
                model_layers = [x for i, x in enumerate(model_layers) if i in model_layers_filter]
            for layer in model_layers:
                yield (target, model, layer)


# Function to compute scores for all (target, model, layer) combinations
def score_all(target=None, model=None, layer=None, svm_model=None, **kwargs):
    """
    Compute scores for all (target, model, layer) combinations using the provided SVM model.

    Args:
        target, model, layer: Specific combinations to evaluate (defaults to None).
        svm_model (object): SVM model (e.g., LinearRegression, LogisticRegression, etc.).
    
    Returns:
        dict: Scores for all input combinations.
    """
    return {
        (target, model, layer, str(svm_model)): 
        score_and_fit_svm_model(target=target, model=model, layer=layer, svm_model=svm_model, **kwargs)
        for target, model, layer in tqdm(list(get_input_coordinates(**kwargs)))
    }


# Function to fit and score a model using cross-validation
def score_and_fit_svm_model(target=0, model=None, layer=0, max_instances=500, svm_model=None, v=0, time=0, use_cache=False, **kwargs):
    """
    Fit and score an SVM model using cross-validation. Supports both regression and classification.

    Args:
        target (str): Target variable name.
        model (str): Model name.
        layer (str): Model layer name.
        max_instances (int): Maximum number of instances to use for fitting.
        svm_model (object): The SVM model to fit (e.g., LinearRegression, LogisticRegression).
        v (int): Verbosity level for printing (default is 0).
        time (bool): Whether to time the fitting process (default is False).
        use_cache (bool): Whether to use cached results (default is True).

    Returns:
        float: The computed score (e.g., accuracy for classification or Pearson correlation for regression).
    """
    # File paths for saving scores and predictions
    score_save_name = f"{HOME}/data/experiment_1_results/{model},{target},{layer},{max_instances},{str(svm_model)},v2.csv"
    predict_save_name = f"{HOME}/data/experiment_1_results/{model},{target},{layer},{max_instances},{str(svm_model)},v2.npz"
    
    # Try to load from cache if use_cache is enabled
    try:
        if use_cache:
            df = pd.read_csv(score_save_name)
            return df["score"][0]
    except:
        pass

    # Limit the number of instances and extract the data
    X = Xss[model][layer][:max_instances]
    y = ys[target][:max_instances]

    if isinstance(svm_model, LinearRegression):
        # For regression tasks (Pearson correlation)
        scores = cross_val_score(svm_model, X, y, cv=cv_KFold, n_jobs=-1, scoring=pearson_r_regression_scorer)
        pearson_r_score = np.mean(scores)
        pearson_r_error = np.std(scores) / np.sqrt(len(scores))
        print(f"pearson_r_score: {pearson_r_score}, err={pearson_r_error}")

        # Save results
        np.savez(predict_save_name, scores=scores, pearson_r_score=pearson_r_error, pearson_r_error=pearson_r_error)
        score = pearson_r_score
    
    elif isinstance(svm_model, (svm.LinearSVC, svm.SVC, LogisticRegression)):
        # For classification tasks (accuracy score)
        predictions = cross_val_predict(svm_model, X, y, cv=cv_StratifiedKFold, n_jobs=-1)
        confusion_matrix = calc_confusion_matrix(y, predictions)
        print("confusion_matrix:")
        print(confusion_matrix)
        accuracy_score = calc_accuracy_score(y, predictions)
        print(f"accuracy_score: {accuracy_score}")

        # Save results
        np.savez(predict_save_name, predictions=predictions, confusion_matrix=confusion_matrix, accuracy_score=accuracy_score)
        score = accuracy_score
    
    else:
        raise ValueError(f"Unexpected predictor: {svm_model}")
    
    # Verbose output
    if v == 1:
        print(f"score = {score}")
    elif v == 2:
        print(f"target={target}, model={model}, layer={layer}, score={score}")
    
    # Save the score to a CSV file
    df = pd.DataFrame({
        "model": [model],
        "target": [target],
        "layer": [layer],
        "max_instances": [max_instances],
        "v": [v],
        "score": [score]
    })
    df.to_csv(score_save_name, index=False)
    return score


# Models to test
Linear = LinearRegression()
SVC_model = svm.SVC(kernel='linear')
SVR_model = svm.SVR(kernel='linear')
Logistic1 = LogisticRegression(random_state=1, penalty="l1", solver="saga", tol=0.1, max_iter=1000)
Logistic2 = LogisticRegression(random_state=1, tol=0.1, solver="newton-cg", max_iter=1000)
Logistic3 = LogisticRegression(random_state=1, penalty="l1", solver="saga", tol=0.1, warm_start=True)
Logistic4 = LogisticRegression(random_state=1, tol=0.1, solver="newton-cg", warm_start=True)
LinearSVC_model = svm.LinearSVC()

svm_labels = [str(x) for x in [Linear, SVC_model, SVR_model, Logistic1, Logistic2, Logistic3, Logistic4, LinearSVC_model]]

use_cache = True  # Enables the use of cached results

# Score the models on different target variables
res_pos_classification_Logistic2 = score_all(targets=my_classification_targets, max_instances=10000, svm_model=Logistic1, use_cache=use_cache)
res_regression = score_all(targets=reggression_labels, max_instances=10000, svm_model=Linear, use_cache=use_cache)
res_pos_classification_Logistic1 = score_all(targets=pos_labels, max_instances=10000, svm_model=Logistic1, use_cache=use_cache)

    
Linear=LinearRegression()
SVC_model=svm.SVC(kernel='linear')
SVR_model=svm.SVR(kernel='linear')
Logistic1=LogisticRegression(random_state=1,penalty="l1",solver="saga",tol=.1,max_iter=1000)
Logistic2=LogisticRegression(random_state=1,tol=.1,solver="newton-cg",max_iter=1000)
Logistic3=LogisticRegression(random_state=1,penalty="l1",solver="saga",tol=.1,warm_start=True)
Logistic4=LogisticRegression(random_state=1,tol=.1,solver="newton-cg",warm_start=True)
LinearSVC_model=svm.LinearSVC()

svm_labels=[str(x) for x in [Linear,SVC_model,SVR_model,Logistic1,Logistic2,Logistic3,Logistic4,LinearSVC_model]]

use_cache=True
# use_cache=False

res_reggresion = score_all(targets=reggression_labels, max_instances=10000,svm_model=Linear, use_cache=use_cache)
res_pos_classification_Logistic1 = score_all(targets=pos_labels, max_instances=10000,svm_model=Logistic1, use_cache=use_cache)
res_pos_classification_Logistic2 = score_all(targets=my_classification_targets, max_instances=10000,svm_model=Logistic1, use_cache=use_cache)



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


  0%|          | 0/3105 [00:00<?, ?it/s]

  0%|          | 0/4140 [00:00<?, ?it/s]

  0%|          | 0/3105 [00:00<?, ?it/s]

  0%|          | 0/4140 [00:00<?, ?it/s]

  0%|          | 0/3105 [00:00<?, ?it/s]

  0%|          | 0/3105 [00:00<?, ?it/s]