# Leveraging Structure for Improved Classification of Grouped Biased Data

This repository contains the code used to generate the results for "Leveraging Structure for Improved Classification of Grouped Biased Data" presented at AAAI 2023. Please see the [expanded version](files/leveraging_structure.pdf) of the paper for the Appendix containing proofs and additional experiments.

## Create Environment and Install Dependencies

`conda create --name structure python=3.9`

`conda activate structure`

`python -m pip install -r requirements.txt`

In [None]:
#hide
from LeveragingStructure.core import *

In [None]:
import warnings
# Ignore every warning for the rest of the session so that library warnings
# do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Running Experiment on Synthetic Data

## Create Synthetic Dataset

In [None]:
from LeveragingStructure.data.leveragingStructure import SyntheticSetting2

In [None]:
import numpy as np

In [None]:
# Build the synthetic benchmark. The two partition-size callables each draw a
# fresh, rounded sample from a normal distribution every time they are called:
# mean 1000 / sd 100 for labeled partitions, mean 10000 / sd 1000 for
# unlabeled partitions.
dataset = SyntheticSetting2.from_criteria(
    n_targets=10,
    n_clusters=2,
    dim=2,
    aucRange=[.75, .95],
    irreducibility_range=[.01, .9],
    num_points_labeled_partition=lambda: np.round(np.random.normal(1000, 100)),
    num_points_unlabeled_partition=lambda: np.round(np.random.normal(10000, 1000)),
    timeoutMins=2,
    nTimeouts=3,
)

### Visualize (Synthetic Dataset Only)

In [None]:
# Plot the synthetic mixture held in dataset.NMix (presumably confidence-interval
# ellipses of its components, going by the method name -- confirm).
dataset.NMix.plotCIEllipse()

## Split Labeled and Unlabeled Sets

In [None]:
from sklearn.model_selection import GroupShuffleSplit

### Split Labeled Train and Validation Sets

In [None]:
# Every labeled sample contributes the fields of s[:] (presumably an (X, y)
# pair -- confirm against the sample class), its instance numbers, and a
# constant vector holding the sample's index as the bag id. The per-sample
# pieces are then stacked field by field into four flat arrays.
XLabeled, yLabeled, instanceNumLabeled, bagLabeled = (
    np.concatenate(column)
    for column in zip(*(
        s[:] + (s.instanceNum, sNum * np.ones(len(s), dtype=int))
        for sNum, s in enumerate(dataset.labeledSamples)
    ))
)

# One random split grouped by instance number, so that rows sharing an
# instance number never end up on both sides of the split.
gss = GroupShuffleSplit(n_splits=1)
labeledTrainIndices, labeledValIndices = next(
    iter(gss.split(XLabeled, y=yLabeled, groups=instanceNumLabeled))
)

# Training portion.
XLabeledTrain = XLabeled[labeledTrainIndices]
yLabeledTrain = yLabeled[labeledTrainIndices]
instanceNumLabeledTrain = instanceNumLabeled[labeledTrainIndices]
bagLabeledTrain = bagLabeled[labeledTrainIndices]

# Validation portion.
XLabeledVal = XLabeled[labeledValIndices]
yLabeledVal = yLabeled[labeledValIndices]
instanceNumLabeledVal = instanceNumLabeled[labeledValIndices]
bagLabeledVal = bagLabeled[labeledValIndices]

### Split Unlabeled Train and Test Sets

In [None]:
# Collect, per unlabeled sample, the fields of s[:] (presumably (X, y) --
# confirm against the sample class) together with its instance numbers.
XUnlabeled, yUnlabeled, instanceNumUnlabeled = zip(
    *(s[:] + (s.instanceNum,) for s in dataset.unlabeledSamples)
)

# Bag id of every point = index of the sample it came from.
bagUnlabeled = [bagNum * np.ones(len(y), dtype=int) for bagNum, y in enumerate(yUnlabeled)]

# Flatten the per-sample pieces into single arrays.
XUnlabeled = np.concatenate(XUnlabeled)
yUnlabeled = np.concatenate(yUnlabeled)
bagUnlabeled = np.concatenate(bagUnlabeled)
instanceNumUnlabeled = np.concatenate(instanceNumUnlabeled)

# One random train/test split grouped by instance number.
unlabeledTrainIndices, unlabeledTestIndices = next(
    iter(
        GroupShuffleSplit(n_splits=1).split(
            XUnlabeled, y=yUnlabeled, groups=instanceNumUnlabeled
        )
    )
)

# Training portion.
XUnlabeledTrain = XUnlabeled[unlabeledTrainIndices]
yUnlabeledTrain = yUnlabeled[unlabeledTrainIndices]
bagUnlabeledTrain = bagUnlabeled[unlabeledTrainIndices]
instanceNumUnlabeledTrain = instanceNumUnlabeled[unlabeledTrainIndices]

# Held-out test portion.
XUnlabeledTest = XUnlabeled[unlabeledTestIndices]
yUnlabeledTest = yUnlabeled[unlabeledTestIndices]
bagUnlabeledTest = bagUnlabeled[unlabeledTestIndices]
instanceNumUnlabeledTest = instanceNumUnlabeled[unlabeledTestIndices]

## Run Method on Synthetic Data

In [None]:
import os

# Root directory for all experiment output; a no-op when it already exists.
os.makedirs("experiments", exist_ok=True)

In [None]:
from LeveragingStructure.experiment_utils import Method,GroupAwareGlobal,FrustratinglyEasyDomainAdaptation

In [None]:
# Output directory for the synthetic experiment. makedirs also creates the
# parent "experiments" directory when it is missing (os.mkdir would raise
# FileNotFoundError in that case), and exist_ok makes re-running a no-op.
os.makedirs("experiments/synthetic_experiment", exist_ok=True)

In [None]:
# Model evaluated under "Our Method" and "Cluster Global" below. The string is
# presumably the path where the run is saved -- confirm against Method.
method = Method("experiments/synthetic_experiment/ourMethod")

In [None]:
# Fit on the labeled train/validation split plus the unlabeled training points
# and their bag ids, with candidate cluster counts 1, 2 and 3.
method.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=np.arange(1, 4),
)

In [None]:
# Single-cluster ablation of the same Method class; evaluated under
# "Label Shift" and "Global" below.
mm1 = Method("experiments/synthetic_experiment/singleClusterAblation")

In [None]:
# Same inputs as the full method, but the only candidate cluster count is 1.
mm1.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=[1],
)

In [None]:
# "Group Aware Global" baseline, saved under the given savepath.
ag = GroupAwareGlobal(savepath="experiments/synthetic_experiment/groupAwareGlobal")

In [None]:
# Unlike Method.fit, this baseline is also given the bag ids of the labeled
# training and validation points.
ag.fit(
    XLabeledTrain,
    yLabeledTrain,
    bagLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    bagLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=np.arange(1, 4),
)

In [None]:
# "Frustratingly Easy Domain Adaptation" baseline, saved under the given savepath.
fe = FrustratinglyEasyDomainAdaptation(savepath="experiments/synthetic_experiment/frustratinglyEasy")

In [None]:
# Fit the baseline; no cluster_range is passed for this method.
fe.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
)

## Evaluate Performance on Synthetic Data

In [None]:
from sklearn.metrics import roc_auc_score

### Our Method

In [None]:
# ROC AUC of the full method on the held-out unlabeled test split, scored
# against the true labels yUnlabeledTest.
roc_auc_score(
    yUnlabeledTest,
    method.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Cluster Global

In [None]:
# Same fitted model as "Our Method", predicted with clusterGlobal=True.
roc_auc_score(
    yUnlabeledTest,
    method.predict(XUnlabeledTest, bagUnlabeledTest, clusterGlobal=True),
)

### Label Shift

In [None]:
# ROC AUC of the single-cluster ablation on the held-out unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    mm1.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Global

In [None]:
# Single-cluster ablation predicted with clusterGlobal=True.
roc_auc_score(
    yUnlabeledTest,
    mm1.predict(XUnlabeledTest, bagUnlabeledTest, clusterGlobal=True),
)

### Group Aware Global

In [None]:
# ROC AUC of the group-aware global baseline on the held-out unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    ag.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Frustratingly Easy Domain Adaptation

In [None]:
# ROC AUC of the frustratingly-easy domain adaptation baseline on the held-out
# unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    fe.predict(XUnlabeledTest, bagUnlabeledTest),
)

# Experiment on Real Data

In [None]:
from LeveragingStructure.data.leveragingStructure import ACSLoaderSetting2, HuggingfaceDatasetSetting2

In [None]:
# Keyword arguments forwarded to the real-data dataset constructor.
baseDSKwargs = dict(
    resampleGroupID=False,
    allowDuplicates=False,
    labelProportion=.5,
    minsize=500,
    # Candidate cluster counts 1 through 7.
    cluster_range=np.arange(1, 8),
    # Both callables return the bag size they are given, unchanged.
    bagLabeledSampleDistribution=lambda bag_size: bag_size,
    bagUnlabeledSampleDistribution=lambda bag_size: bag_size,
    # The next three look like scikit-learn MiniBatchKMeans settings -- confirm
    # against the dataset class.
    minibatchKMeans=True,
    reassignment_ratio=.001,
    # 2**13 == 8192. The previous `2^13` is bitwise XOR in Python and evaluates
    # to 15, which is almost certainly not the intended batch size.
    batch_size=2**13,
    verbose=True,
    tol=.01,
)
# Build the real-data benchmark from the keyword arguments in baseDSKwargs
# (presumably the Amazon reviews data, going by the experiment directory names
# used below -- confirm against HuggingfaceDatasetSetting2).
dataset2 = HuggingfaceDatasetSetting2(**baseDSKwargs)

In [None]:
# Every labeled sample contributes the fields of s[:] (presumably an (X, y)
# pair -- confirm against the sample class), its instance numbers, and a
# constant vector holding the sample's index as the bag id. The per-sample
# pieces are then stacked field by field into four flat arrays.
XLabeled, yLabeled, instanceNumLabeled, bagLabeled = (
    np.concatenate(column)
    for column in zip(*(
        s[:] + (s.instanceNum, sNum * np.ones(len(s), dtype=int))
        for sNum, s in enumerate(dataset2.labeledSamples)
    ))
)

# One random split grouped by instance number, so that rows sharing an
# instance number never end up on both sides of the split.
gss = GroupShuffleSplit(n_splits=1)
labeledTrainIndices, labeledValIndices = next(
    iter(gss.split(XLabeled, y=yLabeled, groups=instanceNumLabeled))
)

# Training portion.
XLabeledTrain = XLabeled[labeledTrainIndices]
yLabeledTrain = yLabeled[labeledTrainIndices]
instanceNumLabeledTrain = instanceNumLabeled[labeledTrainIndices]
bagLabeledTrain = bagLabeled[labeledTrainIndices]

# Validation portion.
XLabeledVal = XLabeled[labeledValIndices]
yLabeledVal = yLabeled[labeledValIndices]
instanceNumLabeledVal = instanceNumLabeled[labeledValIndices]
bagLabeledVal = bagLabeled[labeledValIndices]

In [None]:
# Collect, per unlabeled sample, the fields of s[:] (presumably (X, y) --
# confirm against the sample class) together with its instance numbers.
XUnlabeled, yUnlabeled, instanceNumUnlabeled = zip(
    *(s[:] + (s.instanceNum,) for s in dataset2.unlabeledSamples)
)

# Bag id of every point = index of the sample it came from.
bagUnlabeled = [bagNum * np.ones(len(y), dtype=int) for bagNum, y in enumerate(yUnlabeled)]

# Flatten the per-sample pieces into single arrays.
XUnlabeled = np.concatenate(XUnlabeled)
yUnlabeled = np.concatenate(yUnlabeled)
bagUnlabeled = np.concatenate(bagUnlabeled)
instanceNumUnlabeled = np.concatenate(instanceNumUnlabeled)

# One random train/test split grouped by instance number.
unlabeledTrainIndices, unlabeledTestIndices = next(
    iter(
        GroupShuffleSplit(n_splits=1).split(
            XUnlabeled, y=yUnlabeled, groups=instanceNumUnlabeled
        )
    )
)

# Training portion.
XUnlabeledTrain = XUnlabeled[unlabeledTrainIndices]
yUnlabeledTrain = yUnlabeled[unlabeledTrainIndices]
bagUnlabeledTrain = bagUnlabeled[unlabeledTrainIndices]
instanceNumUnlabeledTrain = instanceNumUnlabeled[unlabeledTrainIndices]

# Held-out test portion.
XUnlabeledTest = XUnlabeled[unlabeledTestIndices]
yUnlabeledTest = yUnlabeled[unlabeledTestIndices]
bagUnlabeledTest = bagUnlabeled[unlabeledTestIndices]
instanceNumUnlabeledTest = instanceNumUnlabeled[unlabeledTestIndices]

In [None]:
import os

# Output directory for the real-data experiment. makedirs also creates the
# parent "experiments" directory, so this section works even when the
# synthetic-data section (which creates that parent) was not run first;
# os.mkdir would raise FileNotFoundError in that case. exist_ok makes
# re-running the cell a no-op.
os.makedirs("experiments/amazon_review_all_experiment", exist_ok=True)

In [None]:
# Model evaluated under "Our Method" and "Cluster Global" below. The string is
# presumably the path where the run is saved -- confirm against Method.
method = Method("experiments/amazon_review_all_experiment/ourMethod")

In [None]:
# Fit on the labeled train/validation split plus the unlabeled training points
# and their bag ids, with candidate cluster counts 1 through 7.
method.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=np.arange(1, 8),
)

In [None]:
# Single-cluster ablation of the same Method class; evaluated under
# "Label Shift" and "Global" below.
mm1 = Method("experiments/amazon_review_all_experiment/singleClusterAblation")

In [None]:
# Same inputs as the full method, but the only candidate cluster count is 1.
mm1.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=[1],
)

In [None]:
# "Group Aware Global" baseline, saved under the given savepath.
ag = GroupAwareGlobal(savepath="experiments/amazon_review_all_experiment/groupAwareGlobal")

In [None]:
# Unlike Method.fit, this baseline is also given the bag ids of the labeled
# training and validation points.
# NOTE(review): cluster_range here is 1..3, while the dataset kwargs and
# method.fit in this real-data section use np.arange(1, 8). This may be a
# leftover from the synthetic section -- confirm the intended range.
ag.fit(
    XLabeledTrain,
    yLabeledTrain,
    bagLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    bagLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
    cluster_range=np.arange(1, 4),
)

In [None]:
# "Frustratingly Easy Domain Adaptation" baseline, saved under the given savepath.
fe = FrustratinglyEasyDomainAdaptation(savepath="experiments/amazon_review_all_experiment/frustratinglyEasy")

In [None]:
# Fit the baseline; no cluster_range is passed for this method.
fe.fit(
    XLabeledTrain,
    yLabeledTrain,
    XLabeledVal,
    yLabeledVal,
    XUnlabeledTrain,
    bagUnlabeledTrain,
)

## Performance on Amazon Reviews

In [None]:
from sklearn.metrics import roc_auc_score

### Our Method

In [None]:
# ROC AUC of the full method on the held-out unlabeled test split, scored
# against the true labels yUnlabeledTest.
roc_auc_score(
    yUnlabeledTest,
    method.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Cluster Global

In [None]:
# Same fitted model as "Our Method", predicted with clusterGlobal=True.
roc_auc_score(
    yUnlabeledTest,
    method.predict(XUnlabeledTest, bagUnlabeledTest, clusterGlobal=True),
)

### Label Shift

In [None]:
# ROC AUC of the single-cluster ablation on the held-out unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    mm1.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Global

In [None]:
# Single-cluster ablation predicted with clusterGlobal=True.
roc_auc_score(
    yUnlabeledTest,
    mm1.predict(XUnlabeledTest, bagUnlabeledTest, clusterGlobal=True),
)

### Group Aware Global

In [None]:
# ROC AUC of the group-aware global baseline on the held-out unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    ag.predict(XUnlabeledTest, bagUnlabeledTest),
)

### Frustratingly Easy Domain Adaptation

In [None]:
# ROC AUC of the frustratingly-easy domain adaptation baseline on the held-out
# unlabeled test split.
roc_auc_score(
    yUnlabeledTest,
    fe.predict(XUnlabeledTest, bagUnlabeledTest),
)