In [2]:
#hide
from LeveragingStructure.core import *

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Leveraging Structure for Improved Classification of Grouped Data


## Create environment and Install Dependencies

`conda create --name structure python=3.9`

`conda activate structure`

`python -m pip install -r requirements.txt`

# Running Experiment on Synthetic Data

## Create Synthetic Dataset

In [None]:
from LeveragingStructure.data.leveragingStructure import SyntheticSetting2

In [None]:
import numpy as np

In [None]:
# Criteria handed to SyntheticSetting2.from_criteria, collected in one place.
# The two partition-size entries are zero-argument callables: every call draws a
# fresh normal sample and rounds it to the nearest whole number (the value stays
# a float, np.round does not cast to int).
syntheticCriteria = dict(
    n_targets=10,
    n_clusters=2,
    dim=2,
    aucRange=[.75, .95],
    irreducibility_range=[.01, .9],
    num_points_labeled_partition=lambda: np.round(np.random.normal(1000, 100)),
    num_points_unlabeled_partition=lambda: np.round(np.random.normal(10000, 1000)),
    timeoutMins=2,
    nTimeouts=3,
)
dataset = SyntheticSetting2.from_criteria(**syntheticCriteria)

### Visualize (Synthetic Dataset Only)

In [None]:
dataset.NMix.plotCIEllipse()

## Split Labeled and Unlabeled Sets

In [None]:
from sklearn.model_selection import GroupShuffleSplit

### Split Labeled Train and Validation Sets

In [None]:
# One record per labeled sample set: whatever s[:] yields (presumably the (X, y)
# pair -- confirm against the sample class), followed by the instance numbers and
# a constant array carrying the sample set's index.
labeledRecords = [
    s[:] + (s.instanceNum, np.full(len(s), sNum, dtype=int))
    for sNum, s in enumerate(dataset.labeledSamples)
]
# Transpose the records into per-field tuples and stack each field into one array.
XLabeled, yLabeled, instanceNumLabeled, bagLabeled = (
    np.concatenate(field) for field in zip(*labeledRecords)
)

# Single shuffle split that uses the instance numbers as the grouping key.
gss = GroupShuffleSplit(n_splits=1)
labeledTrainIndices, labeledValIndices = next(
    gss.split(XLabeled, yLabeled, instanceNumLabeled)
)

# Apply the same index sets to every aligned array.
XLabeledTrain, XLabeledVal = XLabeled[labeledTrainIndices], XLabeled[labeledValIndices]
yLabeledTrain, yLabeledVal = yLabeled[labeledTrainIndices], yLabeled[labeledValIndices]
instanceNumLabeledTrain, instanceNumLabeledVal = (
    instanceNumLabeled[labeledTrainIndices],
    instanceNumLabeled[labeledValIndices],
)
bagLabeledTrain, bagLabeledVal = bagLabeled[labeledTrainIndices], bagLabeled[labeledValIndices]

### Split Unlabeled Train and Test Sets

In [None]:
# One record per unlabeled sample set: the contents of s[:] plus its instance numbers.
unlabeledRecords = [s[:] + (s.instanceNum,) for s in dataset.unlabeledSamples]
XUnlabeled, yUnlabeled, instanceNumUnlabeled = zip(*unlabeledRecords)

# Bag id of every row: the position of its sample set, repeated once per label.
bagUnlabeled = [np.full(len(y), bagNum, dtype=int) for bagNum, y in enumerate(yUnlabeled)]

# Stack the per-set pieces into flat, aligned arrays.
XUnlabeled = np.concatenate(XUnlabeled)
yUnlabeled = np.concatenate(yUnlabeled)
bagUnlabeled = np.concatenate(bagUnlabeled)
instanceNumUnlabeled = np.concatenate(instanceNumUnlabeled)

# Single shuffle split that uses the instance numbers as the grouping key.
unlabeledSplits = GroupShuffleSplit(n_splits=1).split(XUnlabeled, yUnlabeled, instanceNumUnlabeled)
unlabeledTrainIndices, unlabeledTestIndices = next(unlabeledSplits)

# Train portion.
XUnlabeledTrain = XUnlabeled[unlabeledTrainIndices]
yUnlabeledTrain = yUnlabeled[unlabeledTrainIndices]
bagUnlabeledTrain = bagUnlabeled[unlabeledTrainIndices]
instanceNumUnlabeledTrain = instanceNumUnlabeled[unlabeledTrainIndices]

# Test portion.
XUnlabeledTest = XUnlabeled[unlabeledTestIndices]
yUnlabeledTest = yUnlabeled[unlabeledTestIndices]
bagUnlabeledTest = bagUnlabeled[unlabeledTestIndices]
instanceNumUnlabeledTest = instanceNumUnlabeled[unlabeledTestIndices]

## Run Method on Synthetic Data

In [None]:
import os
# Create the output root if needed; exist_ok makes re-running the cell a no-op
# and avoids the check-then-create race of a separate isdir() test.
os.makedirs("experiments", exist_ok=True)

In [None]:
from LeveragingStructure.experiment_utils import Method,GroupAwareGlobal,FrustratinglyEasyDomainAdaptation

In [None]:
# makedirs also creates the parent "experiments" directory when it is missing,
# so this cell no longer depends on the earlier mkdir cell having been run.
os.makedirs("experiments/synthetic_experiment", exist_ok=True)

In [None]:
method = Method("experiments/synthetic_experiment/ourMethod")

In [None]:
method.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,
           cluster_range=np.arange(1,4))

In [None]:
mm1 = Method("experiments/synthetic_experiment/singleClusterAblation")

In [None]:
mm1.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,
       cluster_range=[1])

In [None]:
ag = GroupAwareGlobal(savepath="experiments/synthetic_experiment/groupAwareGlobal")

In [None]:
ag.fit(XLabeledTrain,yLabeledTrain,bagLabeledTrain,
       XLabeledVal,yLabeledVal,bagLabeledVal,
       XUnlabeledTrain,bagUnlabeledTrain,
           cluster_range=np.arange(1,4))

In [None]:
fe = FrustratinglyEasyDomainAdaptation(savepath="experiments/synthetic_experiment/frustratinglyEasy")

In [None]:
fe.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,)

## Evaluate Performances on Synthetic Data

In [None]:
from sklearn.metrics import roc_auc_score

### Our Method

In [None]:
roc_auc_score(yUnlabeledTest,method.predict(XUnlabeledTest,bagUnlabeledTest))

### Cluster Global

In [None]:
roc_auc_score(yUnlabeledTest,method.predict(XUnlabeledTest,bagUnlabeledTest,clusterGlobal=True))

### Label Shift

In [None]:
roc_auc_score(yUnlabeledTest,mm1.predict(XUnlabeledTest,bagUnlabeledTest))

### Global

In [None]:
roc_auc_score(yUnlabeledTest,mm1.predict(XUnlabeledTest,bagUnlabeledTest,clusterGlobal=True))

### Group Aware Global

In [None]:
roc_auc_score(yUnlabeledTest,ag.predict(XUnlabeledTest,bagUnlabeledTest))

### Frustratingly Easy Domain Adaptation

In [None]:
roc_auc_score(yUnlabeledTest,fe.predict(XUnlabeledTest,bagUnlabeledTest))

# Experiment on Real Data

In [None]:
from LeveragingStructure.data.leveragingStructure import ACSLoaderSetting2, HuggingfaceDatasetSetting2

In [None]:
# Keyword arguments for the real-data benchmark; they are passed unchanged to
# the dataset constructor on the last line of this cell.
baseDSKwargs = dict(
    resampleGroupID=False,
    allowDuplicates=False,
    labelProportion=.5,
    minsize=500,
    cluster_range=np.arange(1, 8),
    # Both sample-size callables return the bag size they are given.
    bagLabeledSampleDistribution=lambda bag_size: bag_size,
    bagUnlabeledSampleDistribution=lambda bag_size: bag_size,
    minibatchKMeans=True,
    reassignment_ratio=.001,
    # 2**13 == 8192. The previous `2^13` was bitwise XOR and evaluated to 15.
    # Presumably this is the mini-batch k-means batch size -- confirm in the loader.
    batch_size=2**13,
    verbose=True,
    tol=.01,
)
dataset2 = HuggingfaceDatasetSetting2(**baseDSKwargs)

In [None]:
# One record per labeled sample set: whatever s[:] yields (presumably the (X, y)
# pair -- confirm against the sample class), followed by the instance numbers and
# a constant array carrying the sample set's index.
labeledRecords = [
    s[:] + (s.instanceNum, np.full(len(s), sNum, dtype=int))
    for sNum, s in enumerate(dataset2.labeledSamples)
]
# Transpose the records into per-field tuples and stack each field into one array.
XLabeled, yLabeled, instanceNumLabeled, bagLabeled = (
    np.concatenate(field) for field in zip(*labeledRecords)
)

# Single shuffle split that uses the instance numbers as the grouping key.
gss = GroupShuffleSplit(n_splits=1)
labeledTrainIndices, labeledValIndices = next(
    gss.split(XLabeled, yLabeled, instanceNumLabeled)
)

# Apply the same index sets to every aligned array.
XLabeledTrain, XLabeledVal = XLabeled[labeledTrainIndices], XLabeled[labeledValIndices]
yLabeledTrain, yLabeledVal = yLabeled[labeledTrainIndices], yLabeled[labeledValIndices]
instanceNumLabeledTrain, instanceNumLabeledVal = (
    instanceNumLabeled[labeledTrainIndices],
    instanceNumLabeled[labeledValIndices],
)
bagLabeledTrain, bagLabeledVal = bagLabeled[labeledTrainIndices], bagLabeled[labeledValIndices]

In [None]:
# One record per unlabeled sample set: the contents of s[:] plus its instance numbers.
unlabeledRecords = [s[:] + (s.instanceNum,) for s in dataset2.unlabeledSamples]
XUnlabeled, yUnlabeled, instanceNumUnlabeled = zip(*unlabeledRecords)

# Bag id of every row: the position of its sample set, repeated once per label.
bagUnlabeled = [np.full(len(y), bagNum, dtype=int) for bagNum, y in enumerate(yUnlabeled)]

# Stack the per-set pieces into flat, aligned arrays.
XUnlabeled = np.concatenate(XUnlabeled)
yUnlabeled = np.concatenate(yUnlabeled)
bagUnlabeled = np.concatenate(bagUnlabeled)
instanceNumUnlabeled = np.concatenate(instanceNumUnlabeled)

# Single shuffle split that uses the instance numbers as the grouping key.
unlabeledSplits = GroupShuffleSplit(n_splits=1).split(XUnlabeled, yUnlabeled, instanceNumUnlabeled)
unlabeledTrainIndices, unlabeledTestIndices = next(unlabeledSplits)

# Train portion.
XUnlabeledTrain = XUnlabeled[unlabeledTrainIndices]
yUnlabeledTrain = yUnlabeled[unlabeledTrainIndices]
bagUnlabeledTrain = bagUnlabeled[unlabeledTrainIndices]
instanceNumUnlabeledTrain = instanceNumUnlabeled[unlabeledTrainIndices]

# Test portion.
XUnlabeledTest = XUnlabeled[unlabeledTestIndices]
yUnlabeledTest = yUnlabeled[unlabeledTestIndices]
bagUnlabeledTest = bagUnlabeled[unlabeledTestIndices]
instanceNumUnlabeledTest = instanceNumUnlabeled[unlabeledTestIndices]

In [None]:
import os
# os.mkdir raised FileNotFoundError here whenever the parent "experiments"
# directory was absent (for example when the synthetic section was skipped).
# makedirs creates missing parents, and exist_ok makes re-runs a no-op.
os.makedirs("experiments/amazon_review_all_experiment", exist_ok=True)

In [None]:
method = Method("experiments/amazon_review_all_experiment/ourMethod")

In [None]:
method.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,
           cluster_range=np.arange(1,8))

In [None]:
mm1 = Method("experiments/amazon_review_all_experiment/singleClusterAblation")

In [None]:
mm1.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,
       cluster_range=[1])

In [None]:
ag = GroupAwareGlobal(savepath="experiments/amazon_review_all_experiment/groupAwareGlobal")

In [None]:
# Fit the group-aware global baseline on the labeled train/validation splits
# (with their bag ids) and the unlabeled training split.
# NOTE(review): cluster_range is np.arange(1,4) here, while the dataset kwargs
# and method.fit in this section use np.arange(1,8) -- confirm the narrower
# range is intentional and not a leftover from the synthetic experiment.
ag.fit(XLabeledTrain,yLabeledTrain,bagLabeledTrain,
       XLabeledVal,yLabeledVal,bagLabeledVal,
       XUnlabeledTrain,bagUnlabeledTrain,
           cluster_range=np.arange(1,4))

In [None]:
fe = FrustratinglyEasyDomainAdaptation(savepath="experiments/amazon_review_all_experiment/frustratinglyEasy")

In [None]:
fe.fit(XLabeledTrain,yLabeledTrain,XLabeledVal,yLabeledVal,XUnlabeledTrain,bagUnlabeledTrain,)

## Evaluate Performances on Amazon Reviews

In [4]:
from sklearn.metrics import roc_auc_score

### Our Method

In [None]:
roc_auc_score(yUnlabeledTest,method.predict(XUnlabeledTest,bagUnlabeledTest))

### Cluster Global

In [None]:
roc_auc_score(yUnlabeledTest,method.predict(XUnlabeledTest,bagUnlabeledTest,clusterGlobal=True))

### Label Shift

In [None]:
roc_auc_score(yUnlabeledTest,mm1.predict(XUnlabeledTest,bagUnlabeledTest))

### Global

In [None]:
roc_auc_score(yUnlabeledTest,mm1.predict(XUnlabeledTest,bagUnlabeledTest,clusterGlobal=True))

### Group Aware Global

In [None]:
roc_auc_score(yUnlabeledTest,ag.predict(XUnlabeledTest,bagUnlabeledTest))

### Frustratingly Easy Domain Adaptation

In [None]:
roc_auc_score(yUnlabeledTest,fe.predict(XUnlabeledTest,bagUnlabeledTest))

# Full Results

In [1]:
from LeveragingStructure.experiment_utils import *

2022-10-25 05:41:25.632289: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-25 05:41:25.855189: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/ood.discovery.neu.edu/software/anaconda3/2019.10/lib:/shared/centos7/openblas/0.3.6/lib
2022-10-25 05:41:25.855293: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-25 05:41:25.902004: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin 

In [2]:
def getResults(pth):
    """Load the saved test labels and every method's predictions for one run.

    Parameters
    ----------
    pth : str
        Directory of a single experiment run. It must contain
        ``yUnlabeledTest.npy`` and the ``mm``, ``mm2``, ``ag``, ``fe`` and
        ``mmStar`` sub-directories with their prediction files.

    Returns
    -------
    dict
        Maps ``"y"`` and each prediction name to the array read by ``np.load``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` when an expected file is missing.
    """
    # (result key, path components below pth), in the order the files are read.
    artifacts = (
        ("y", ("yUnlabeledTest.npy",)),
        ("ourPreds", ("mm", "preds.npy")),
        ("clusterGlobalPreds", ("mm", "clusterGlobalPreds.npy")),
        # This file name is spelled with a capital "P", unlike the other preds files.
        ("labelShiftPreds", ("mm2", "Preds.npy")),
        ("globalPreds", ("mm2", "clusterGlobalPreds.npy")),
        ("groupAwareGlobalPreds", ("ag", "preds.npy")),
        ("fe", ("fe", "preds.npy")),
        ("star", ("mmStar", "preds.npy")),
    )
    loaded = {key: np.load(os.path.join(pth, *parts)) for key, parts in artifacts}

    # Return the entries with "star" ahead of "fe"; that is the key order
    # (and therefore the column order) callers receive.
    keyOrder = ("y", "ourPreds", "clusterGlobalPreds", "labelShiftPreds",
                "globalPreds", "groupAwareGlobalPreds", "star", "fe")
    return {key: loaded[key] for key in keyOrder}

In [3]:
import os
import numpy as np

In [11]:
# Directory that holds one "<task>_setting_2_<run>" folder per experiment run.
# The default is the original cluster location; set the
# LEVERAGING_STRUCTURE_RESULTS_ROOT environment variable to read results from
# elsewhere without editing the notebook.
RESULTS_ROOT = os.environ.get(
    "LEVERAGING_STRUCTURE_RESULTS_ROOT",
    "/scratch/zeiberg.d/leveragingStructureResponseExperiments/experiments",
)
# Run indices 0 .. N_RUNS-1 are probed for every task.
N_RUNS = 26

# Per-task list of result dicts, one entry per run that could be loaded.
results = {"income": [],
           "employment": [],
           "income_poverty_ratio": [],
           "amazon_reviews_pca": []}
for i in range(N_RUNS):
    for k in results:
        runDir = os.path.join(RESULTS_ROOT, f"{k}_setting_2_{i}")
        try:
            ri = getResults(runDir)
        except FileNotFoundError:
            # A missing run is reported and skipped instead of aborting the scan.
            print(f"failed to load {k} - {i}")
            continue
        results[k].append(ri)

failed to load amazon_reviews_pca - 0
failed to load amazon_reviews_pca - 1
failed to load amazon_reviews_pca - 2
failed to load amazon_reviews_pca - 3
failed to load amazon_reviews_pca - 4
failed to load amazon_reviews_pca - 5
failed to load amazon_reviews_pca - 6
failed to load income_poverty_ratio - 7
failed to load amazon_reviews_pca - 7
failed to load amazon_reviews_pca - 8
failed to load amazon_reviews_pca - 9
failed to load employment - 10
failed to load amazon_reviews_pca - 10
failed to load amazon_reviews_pca - 11
failed to load amazon_reviews_pca - 12
failed to load amazon_reviews_pca - 13
failed to load amazon_reviews_pca - 14
failed to load amazon_reviews_pca - 15
failed to load amazon_reviews_pca - 16
failed to load amazon_reviews_pca - 17
failed to load amazon_reviews_pca - 18
failed to load amazon_reviews_pca - 19
failed to load amazon_reviews_pca - 20
failed to load employment - 21
failed to load amazon_reviews_pca - 21
failed to load employment - 22
failed to load amaz

In [12]:
import pandas as pd

In [13]:
from sklearn.metrics import roc_auc_score

In [14]:
def formatResults(r):
    """Score every prediction array in ``r`` against the labels in ``r["y"]``.

    Parameters
    ----------
    r : dict
        Holds the labels under ``"y"``; every other entry is passed to
        ``roc_auc_score`` as the scores argument.

    Returns
    -------
    dict
        Maps each non-``"y"`` key of ``r`` to its ``roc_auc_score`` value.
    """
    return {
        name: roc_auc_score(r["y"], scores)
        for name, scores in r.items()
        if name != "y"
    }

### Income

In [16]:
incomedf = pd.DataFrame.from_records([pd.Series(formatResults(ri)) for ri in results["income"]])

In [21]:
incomedf

Unnamed: 0,ourPreds,clusterGlobalPreds,labelShiftPreds,globalPreds,groupAwareGlobalPreds,star,fe
0,0.835291,0.836911,0.83419,0.835589,0.819671,0.886011,0.823223
1,0.87463,0.854199,0.857362,0.85152,0.839524,0.8825,0.837787
2,0.880579,0.867955,0.877876,0.865592,0.85014,0.891639,0.852192
3,0.913335,0.875025,0.89019,0.872177,0.865679,0.919267,0.865157
4,0.909269,0.874964,0.904797,0.873737,0.868067,0.91117,0.854385
5,0.868214,0.864999,0.852909,0.863074,0.855532,0.903685,0.843647
6,0.894004,0.867012,0.890121,0.865853,0.862338,0.900694,0.845994
7,0.879838,0.871216,0.882594,0.869197,0.859881,0.904134,0.849637
8,0.896255,0.866607,0.880135,0.865453,0.858216,0.898444,0.851526
9,0.817952,0.840726,0.814597,0.839636,0.826154,0.872111,0.828671


In [25]:
incomeavg = incomedf.apply(np.mean).sort_values()

In [26]:
incomeavg

fe                       0.845976
groupAwareGlobalPreds    0.846234
globalPreds              0.859261
clusterGlobalPreds       0.861400
labelShiftPreds          0.863601
ourPreds                 0.871357
star                     0.892873
dtype: float64

In [27]:
incomeavg.ourPreds - incomeavg.fe

0.02538090124467518

### Employment

In [18]:
employmentdf = pd.DataFrame.from_records([pd.Series(formatResults(ri)) for ri in results["employment"]])

In [19]:
employmentdf

Unnamed: 0,ourPreds,clusterGlobalPreds,labelShiftPreds,globalPreds,groupAwareGlobalPreds,star,fe
0,0.901696,0.89404,0.892131,0.891598,0.875829,0.924602,0.864504
1,0.902567,0.90036,0.901784,0.89785,0.877122,0.933802,0.867967
2,0.912355,0.898809,0.882411,0.895188,0.886079,0.916151,0.865734
3,0.92506,0.89353,0.912482,0.892941,0.883597,0.924832,0.8718
4,0.895062,0.885249,0.879485,0.882635,0.872283,0.916906,0.835805
5,0.943076,0.905189,0.940106,0.904166,0.886081,0.946594,0.868761
6,0.933788,0.896806,0.912811,0.893548,0.882873,0.935369,0.847739
7,0.947297,0.91452,0.936095,0.914951,0.873068,0.947281,0.850205
8,0.920628,0.898024,0.917588,0.895678,0.881886,0.928783,0.8407
9,0.898538,0.890943,0.895914,0.888545,0.870004,0.919945,0.869872


In [28]:
employmentavg = employmentdf.apply(np.mean).sort_values()

In [29]:
employmentavg

fe                       0.851030
groupAwareGlobalPreds    0.876582
globalPreds              0.896708
clusterGlobalPreds       0.898874
labelShiftPreds          0.907680
ourPreds                 0.919170
star                     0.929105
dtype: float64

In [30]:
employmentavg.ourPreds - employmentavg.fe

0.06813985303567938

### Income-to-Poverty Ratio (IPR)

In [22]:
iprdf = pd.DataFrame.from_records([pd.Series(formatResults(ri)) for ri in results["income_poverty_ratio"]])

In [23]:
iprdf

Unnamed: 0,ourPreds,clusterGlobalPreds,labelShiftPreds,globalPreds,groupAwareGlobalPreds,star,fe
0,0.827332,0.810039,0.823528,0.8018,0.73431,0.841822,0.742381
1,0.86155,0.821813,0.827141,0.802796,0.784734,0.856243,0.766545
2,0.833891,0.80805,0.809373,0.80082,0.732483,0.835207,0.745396
3,0.808455,0.806337,0.785508,0.794522,0.767363,0.858728,0.766021
4,0.804011,0.796474,0.773932,0.783791,0.750455,0.849416,0.759603
5,0.8314,0.804313,0.80252,0.797825,0.759215,0.847082,0.748208
6,0.878165,0.811158,0.847258,0.806849,0.788757,0.878138,0.732386
7,0.853637,0.796755,0.821516,0.790859,0.761396,0.854231,0.74278
8,0.880023,0.818377,0.86555,0.811996,0.777723,0.881093,0.769233
9,0.821906,0.803777,0.819995,0.800729,0.744485,0.850653,0.714615


In [31]:
ipravg = iprdf.apply(np.mean).sort_values()

In [32]:
ipravg

fe                       0.746058
groupAwareGlobalPreds    0.757727
globalPreds              0.797179
clusterGlobalPreds       0.806296
labelShiftPreds          0.813706
ourPreds                 0.837366
star                     0.851629
dtype: float64

In [33]:
ipravg.ourPreds - ipravg.fe

0.09130815408889348

### Amazon Reviews

In [None]:
pd.DataFrame.from_records([pd.Series(formatResults(ri)) for ri in results["amazon_reviews_pca"]])

In [None]:
pd.DataFrame.from_records([pd.Series(formatResults(ri)) for ri in results["amazon_reviews_pca"]]).apply(np.mean).sort_values()