In [1]:
import pandas as pd 
import pickle
import numpy as np
from copy import deepcopy
from tqdm import tqdm

In [2]:
# set random seet for numpy
np.random.seed(42)

In [3]:
TRAIN_FRAC = 0.1  # use 10 percent of authors for training
TEST_FRAC = 0.9

In [4]:
folder_path = "../out/reddit_chunked/"

In [5]:
with open(folder_path + "reddit_train_pandas_df_20240126_181755.pickle", "rb") as f:
    train_df_embeddings = pickle.load(f)

with open(folder_path + "reddit_test_pandas_df_20240126_181755.pickle", "rb") as f:
    test_df_embeddings = pickle.load(f)

In [6]:
with open(folder_path + "reddit_train_embeddings_20240126_184325.pickle", "rb") as f:
    train_chunks = pickle.load(f)


with open(folder_path + "reddit_test_embeddings_20240126_184325.pickle", "rb") as f:
    test_chunks = pickle.load(f)


all_chunks = train_chunks + test_chunks
all_chunks = all_chunks

In [7]:
text2embedding = {
    elem["text"]: elem["embedding"] for elem in all_chunks
}

In [9]:
df_total = pd.DataFrame(all_chunks)[["metadata", "text"]]
df_total["author"] = df_total["metadata"].apply(lambda x: x["author"])
df = df_total

unique_authors = df["author"].unique()
np.random.shuffle(unique_authors)

train_authors = unique_authors[:int(len(unique_authors) * TRAIN_FRAC)]
test_authors = unique_authors[int(len(unique_authors) * TRAIN_FRAC):]

train_df = df[df["author"].isin(train_authors)].sort_values(by="author")
test_df = df[df["author"].isin(test_authors)].sort_values(by="author")

In [10]:
# test out what happens if the same author can be in either set 
df = df.sample(frac=1, replace=False, random_state=42)
train_df = df.iloc[:int(len(df_total) * TRAIN_FRAC)]
test_df = df.iloc[int(len(df_total) * TRAIN_FRAC):]

In [12]:
print(f"Length of train_df: {len(train_df)}")
print(f"Length of test_df: {len(test_df)}")

Length of train_df: 1000
Length of test_df: 9000


In [13]:
def pair_up_same_author(df):
    # permute entire dataframe 
    old_df = deepcopy(df).sort_values(by="author")
    new_df = deepcopy(df).sample(frac=1, replace=False)

    # sort dataframe again by author to keep random order within author
    new_df = new_df.sort_values(by=["author"], kind = "stable")

    # rename all columns in parnter dataframe to avoid confusion
    new_df.columns = [str(col) + "_partner" for col in new_df.columns]
    
    colnames = list(old_df.columns) + list(new_df.columns)

    # horizontally concatenate the two dataframes ignoring the index
    new_df = pd.concat([old_df.reset_index(drop = True), new_df.reset_index(drop = True)], axis=1, ignore_index=True)
    new_df.columns = colnames

    assert new_df["author"].equals(new_df["author_partner"]) # check if all authors are the same
    
    return new_df


def pair_up_different_author(df, iterations=10):
    df_idx = df[["author"]]
    df_idx["old_idx"] = range(len(df_idx))

    pair_idx_df_list = []
    for _ in range(iterations):
        shuffled_df = df_idx.sample(frac=1, replace = False).reset_index(drop=True)

        shuffled_df.rename(columns={"author": "author_partner", "old_idx": "old_idx_partner"}, inplace=True)

        paired_df = pd.concat([deepcopy(df_idx).reset_index(drop=True), deepcopy(shuffled_df).reset_index(drop=True),], axis=1)

        pair_idx_df_list.append(paired_df)

    pair_idx_df = pd.concat(pair_idx_df_list, axis=0, ignore_index=True)
    
    # remove rows where author is the same
    pair_idx_df = pair_idx_df[pair_idx_df["author"] != pair_idx_df["author_partner"]]

    # drop duplicates for old indices 
    pair_idx_df = pair_idx_df.drop_duplicates(subset=["old_idx"])   
    
    assert len(pair_idx_df) == len(df)
 
    # order rows of origal df according to old_idx
    pair_df = df.iloc[pair_idx_df["old_idx_partner"].values, :]
    pair_df.columns = [str(col) + "_partner" for col in pair_df.columns]

    # concat df and pair_df
    pair_df = pd.concat([df.reset_index(drop=True), pair_df.reset_index(drop=True)], axis=1, ignore_index=False)


    assert pair_df["author"].equals(pair_df["author_partner"]) == False # check if all authors are the same

    return pair_df

def pair_up_different_author_baseline(df):
    # just pair up the dataframe with a random permutation of itself

    # permute entire dataframe
    new_df = deepcopy(df).sample(frac=1, replace=False)

    # rename all columns in parnter dataframe to avoid confusion
    new_df.columns = [str(col) + "_partner" for col in new_df.columns]

    colnames = list(df.columns) + list(new_df.columns)

    # horizontally concatenate the two dataframes ignoring the index
    new_df = pd.concat([df.reset_index(drop = True), new_df.reset_index(drop = True)], axis=1, ignore_index=True)
    new_df.columns = colnames

    return new_df



# Example usage with your dataframe 'df'
# result_df = pair_up_different_author(df)


def create_pair_classification_df(df, clean_columns=True):
    # create dataframe with same author pairs
    same_author_df = pair_up_same_author(df)
    # create dataframe with different author pairs
    different_author_df = pair_up_different_author(df)

    colnames_same_author_df = list(same_author_df.columns)
    colnames_different_partner_df = list(different_author_df.columns)

    # make sure that the columns are all of type string 
    colnames_same_author_df = [str(col) for col in colnames_same_author_df]
    colnames_different_partner_df = [str(col) for col in colnames_different_partner_df]

    assert len(colnames_same_author_df) == len(colnames_different_partner_df)
    assert all([colnames_different_partner_df[i] == colnames_different_partner_df[i] for i in range(len(colnames_same_author_df))])

    # rename columns in partner dataframe to avoid confusion
    
    same_author_df.columns = colnames_same_author_df
    different_author_df.columns = colnames_different_partner_df
    
    #same_author_df["label"] = 1
    #different_author_df["label"] = 0

    pair_classification_df = pd.concat([same_author_df, different_author_df], axis=0, ignore_index=True)
    
    pair_classification_df = pair_classification_df.sample(frac=1, replace=False)

    pair_classification_df["label"] = pair_classification_df["author"] == pair_classification_df["author_partner"]

    if clean_columns:
        pair_classification_df.drop(columns = [
            "author",
            "author_partner",
            "metadata",
            "metadata_partner",
            "text",
            "text_partner"
        ], inplace=True)

    
    return pair_classification_df


def create_pair_classification_df_upsample(df, clean_columns, sample_ratio = 1):
    ### use the craete_pair_classification_df function to create a pair classification dataframe. But stack several of them on top of each other to upsample the different author pairs

    classification_df_lis = []
    for _ in tqdm(range(sample_ratio)):
        classification_df_lis.append(create_pair_classification_df(deepcopy(df), clean_columns=clean_columns))

    classification_df = pd.concat(classification_df_lis, axis=0, ignore_index=False)
    classification_df = classification_df.sample(frac=1, replace=False)

    return classification_df

In [14]:
def add_embeddings_to_df(df, text2embedding):
    embedding_list = [
        text2embedding[text] for text in df["text"]
    ]
    embedding_df = pd.DataFrame(embedding_list)

    embedding_list_partner = [
        text2embedding[text] for text in df["text_partner"]
    ]
    embedding_df_partner = pd.DataFrame(embedding_list_partner)
    embedding_df_partner.columns = [str(col) + "_partner" for col in embedding_df_partner.columns]

    df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True), embedding_df_partner.reset_index(drop=True)], axis=1, ignore_index=False)
    
    return df


In [15]:
train_df_paired = create_pair_classification_df_upsample(train_df, clean_columns = False, sample_ratio = 3)
test_df_paired = create_pair_classification_df_upsample(test_df, clean_columns = False, sample_ratio = 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
100%|██████████| 3/3 [00:00<00:00, 46.46it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_idx["old_idx"] = range(len(df_idx))
100%|██████████| 3/3 [00:00<00:00, 17.16it/s]


In [16]:
train_df = add_embeddings_to_df(train_df_paired, text2embedding)
test_df = add_embeddings_to_df(test_df_paired, text2embedding)

In [17]:
train_df = train_df.iloc[:, 6:]
test_df = test_df.iloc[:, 6:]

In [18]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
predictor = TabularPredictor(label='label')
predictor.fit(train_df, time_limit=60*15, presets = 'medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240127_123536"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels\ag-20240127_123536"
AutoGluon Version:  1.0.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       2.54 GB / 15.71 GB (16.2%)
Disk Space Avail:   11.88 GB / 474.72 GB (2.5%)
Train Data Rows:    6000
Train Data Columns: 3072
Label Column:       label
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [False, True]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  cla

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2bdab0edcc0>

In [25]:
y_train

0      -0.035170
1      -0.006340
2      -0.033977
3      -0.047606
4      -0.006359
          ...   
5995   -0.047155
5996   -0.015223
5997   -0.031893
5998   -0.038155
5999   -0.014170
Name: 1535_partner, Length: 6000, dtype: float64

In [26]:
train_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,1526_partner,1527_partner,1528_partner,1529_partner,1530_partner,1531_partner,1532_partner,1533_partner,1534_partner,1535_partner
0,False,0.006676,-0.021149,0.022263,-0.021941,0.006342,-0.017474,0.001129,-0.004604,0.004863,...,-0.003998,-0.002934,-0.007514,-0.031906,-0.014525,0.011988,-0.009024,-0.016225,-0.003482,-0.035170
1,True,-0.014148,-0.005231,-0.000418,-0.019133,-0.034236,0.015659,-0.004724,-0.018378,0.014784,...,0.013752,0.026534,-0.006965,-0.025878,-0.017088,0.016391,0.008674,-0.012037,-0.021586,-0.006340
2,True,0.001571,-0.002970,0.008940,-0.041818,-0.022341,0.016021,0.003361,-0.018672,-0.008262,...,-0.015367,-0.000937,-0.011893,-0.020796,-0.015569,0.014764,0.002307,-0.004279,0.025608,-0.033977
3,False,-0.042519,-0.029029,0.006739,-0.002219,-0.025803,-0.016742,-0.016562,-0.023372,0.002672,...,0.002832,-0.006415,0.008959,-0.012771,0.013837,0.008128,-0.024967,0.003268,-0.001159,-0.047606
4,True,-0.001586,0.015298,-0.020849,-0.018154,-0.005949,0.014555,-0.030013,-0.029402,-0.021620,...,0.015599,-0.016498,0.012098,-0.016253,-0.018814,0.019659,-0.000748,0.000499,-0.028283,-0.006359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,True,-0.003460,0.002003,0.014805,-0.000302,-0.002699,0.014660,0.001945,-0.010135,-0.011398,...,0.001202,0.004714,-0.003008,-0.027706,-0.008507,-0.006443,0.001912,-0.004733,-0.013396,-0.047155
5996,True,0.016424,-0.019949,0.004093,-0.046820,-0.033572,0.007341,-0.007031,-0.011666,-0.014784,...,0.011453,-0.018839,0.023332,-0.008012,-0.028923,0.021111,-0.028071,0.003186,-0.008716,-0.015223
5997,True,-0.015440,-0.001745,-0.016699,-0.013188,-0.020614,0.013250,-0.045308,-0.029046,-0.013681,...,0.021805,0.002303,0.012785,-0.024337,-0.016467,0.002926,-0.027801,0.003552,0.005616,-0.031893
5998,True,-0.005819,-0.016166,0.007109,-0.004758,-0.023752,0.004835,0.009489,-0.005928,-0.019753,...,0.024626,0.018540,-0.017250,-0.018056,-0.020058,0.009330,0.018486,-0.023779,-0.013321,-0.038155


In [21]:
print(predictor.evaluate(test_df, silent=False, detailed_report =True))

Evaluation: accuracy on test data: 0.5435925925925926
Evaluations on test data:
{
    "accuracy": 0.5435925925925926,
    "balanced_accuracy": 0.5466298325216553,
    "mcc": 0.09737473949373732,
    "roc_auc": 0.5679744988469059,
    "f1": 0.4731509191962377,
    "precision": 0.5759562841530055,
    "recall": 0.40148739343370216
}
Detailed (per-class) classification report:
{
    "False": {
        "precision": 0.5257151070863878,
        "recall": 0.6917722716096085,
        "f1-score": 0.5974191440705652,
        "support": 26435.0
    },
    "True": {
        "precision": 0.5759562841530055,
        "recall": 0.40148739343370216,
        "f1-score": 0.4731509191962377,
        "support": 27565.0
    },
    "accuracy": 0.5435925925925926,
    "macro avg": {
        "precision": 0.5508356956196967,
        "recall": 0.5466298325216553,
        "f1-score": 0.5352850316334015,
        "support": 54000.0
    },
    "weighted avg": {
        "precision": 0.5513613671945603,
        "recal

{'accuracy': 0.5435925925925926, 'balanced_accuracy': 0.5466298325216553, 'mcc': 0.09737473949373732, 'roc_auc': 0.5679744988469059, 'f1': 0.4731509191962377, 'precision': 0.5759562841530055, 'recall': 0.40148739343370216, 'confusion_matrix':        False  True 
False  18287   8148
True   16498  11067, 'classification_report': {'False': {'precision': 0.5257151070863878, 'recall': 0.6917722716096085, 'f1-score': 0.5974191440705652, 'support': 26435.0}, 'True': {'precision': 0.5759562841530055, 'recall': 0.40148739343370216, 'f1-score': 0.4731509191962377, 'support': 27565.0}, 'accuracy': 0.5435925925925926, 'macro avg': {'precision': 0.5508356956196967, 'recall': 0.5466298325216553, 'f1-score': 0.5352850316334015, 'support': 54000.0}, 'weighted avg': {'precision': 0.5513613671945603, 'recall': 0.5435925925925926, 'f1-score': 0.5339848177990683, 'support': 54000.0}}}
