In [52]:
import pandas as pd 
import pickle
import numpy as np
from copy import deepcopy
from tqdm import tqdm

In [53]:
# Set the global NumPy random seed. The np.random.shuffle call and the
# DataFrame.sample calls later in this notebook pass no random_state, so they
# draw from this global state.
np.random.seed(42)

In [54]:
TRAIN_FRAC = 0.5  # fraction of unique authors assigned to the training split (0.5 = 50 percent)
TEST_FRAC = 0.5  # NOTE(review): not referenced by the visible split cell, which gives the test split all remaining authors

In [55]:
# Directory (relative path, with trailing slash) that the pickled chunk files are read from.
folder_path = "../out/reddit_chunked/"

In [56]:
# Load the pickled chunk records for both pre-computed splits.
# NOTE: pickle.load can execute arbitrary code -- only open pickle files that
# were produced by a trusted source.
train_pickle_path = folder_path + "reddit_train_embeddings_20240126_181755.pickle"
with open(train_pickle_path, "rb") as f:
    train_chunks = pickle.load(f)

test_pickle_path = folder_path + "reddit_test_embeddings_20240126_181755.pickle"
with open(test_pickle_path, "rb") as f:
    test_chunks = pickle.load(f)

# Pool both files (train records first) and keep only the first 10,000 records.
# NOTE(review): if the train file alone holds 10,000 or more records, no test
# records survive this cut -- confirm that is intended.
all_chunks = (train_chunks + test_chunks)[:10_000]

In [57]:
# Lookup table from a chunk's text to its embedding.
# If the same text appears in several records, the last record's embedding wins.
text2embedding = dict(
    (chunk["text"], chunk["embedding"]) for chunk in all_chunks
)

In [63]:
# One row per chunk: keep the metadata and text of every record and lift the
# author name out of the metadata entry into its own column.
df_total = pd.DataFrame(all_chunks)[["metadata", "text"]].assign(
    author=lambda frame: frame["metadata"].apply(lambda meta: meta["author"])
)
# `df` is a second name for the same object (no copy is made).
df = df_total

In [64]:

# Author-level split: shuffle the distinct authors, give the first TRAIN_FRAC
# share to training and every remaining author to testing, so that no author
# appears in both splits.
# NOTE(review): the saved value_counts outputs further down list the same
# author names in both splits, which this cell cannot produce -- those outputs
# look like they come from a different (row-level) split; re-run to confirm.
unique_authors = df["author"].unique()
np.random.shuffle(unique_authors)

n_train_authors = int(len(unique_authors) * TRAIN_FRAC)
train_authors = unique_authors[:n_train_authors]
test_authors = unique_authors[n_train_authors:]

is_train_row = df["author"].isin(train_authors)
is_test_row = df["author"].isin(test_authors)
train_df = df[is_train_row].sort_values(by="author")
test_df = df[is_test_row].sort_values(by="author")


'\nunique_authors = df["author"].unique()\nnp.random.shuffle(unique_authors)\n\ntrain_authors = unique_authors[:int(len(unique_authors) * TRAIN_FRAC)]\ntest_authors = unique_authors[int(len(unique_authors) * TRAIN_FRAC):]\n\ntrain_df = df[df["author"].isin(train_authors)].sort_values(by="author")\ntest_df = df[df["author"].isin(test_authors)].sort_values(by="author")\n'

In [65]:
# Disabled alternative, kept as a bare string so it does not execute: a
# row-level random split that shuffles all rows and cuts at TRAIN_FRAC, which
# lets the same author appear in both splits.
# NOTE(review): the saved outputs below (identical author names in both splits)
# look consistent with this variant having been the one that ran -- confirm.
"""
df = df.sample(frac=1, replace=False, random_state=42)
train_df = df.iloc[:int(len(df_total) * TRAIN_FRAC)]
test_df = df.iloc[int(len(df_total) * TRAIN_FRAC):]
"""

In [66]:
# Number of rows per author in the training split.
train_df["author"].value_counts()

author
SelfishThailand        602
Jmtaylor1991           207
RHGOtakuxxx            164
AtLeastIAmNotOnFire    147
fadedblackleggings     127
                      ... 
COB98                    5
janbogi2011              5
wwstewart                4
thewayofxen              4
quietcranberry           3
Name: count, Length: 233, dtype: int64

In [67]:
# Number of rows per author in the test split.
test_df["author"].value_counts()

author
SelfishThailand        600
Jmtaylor1991           231
RHGOtakuxxx            173
AtLeastIAmNotOnFire    161
fadedblackleggings     135
                      ... 
scoofy                   6
firemonkey57             5
1cecream4breakfast       5
Ruludos                  3
aft33                    3
Name: count, Length: 233, dtype: int64

In [68]:
# Report the row count of each split.
print(f"Length of train_df: {len(train_df)}")
print(f"Length of test_df: {len(test_df)}")

Length of train_df: 5000
Length of test_df: 5000


In [69]:
def pair_up_same_author(df):
    """Pair every row with a randomly chosen row written by the same author.

    The frame is sorted by author once to form the left-hand side. A shuffled
    copy is then stably re-sorted by author, which keeps the shuffled order
    within each author, and is placed next to it row by row. A row may end up
    paired with itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "author" column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the same columns with a "_partner"
        suffix, on a fresh 0..n-1 index, ordered by author.

    Raises
    ------
    AssertionError
        If the author columns of the two sides do not line up.
    """
    # Left-hand side: rows grouped by author.
    anchors = df.sort_values(by="author").reset_index(drop=True)

    # Right-hand side: shuffle first, then group by author with a stable sort
    # so the random order inside each author group survives.
    partners = (
        df.sample(frac=1, replace=False)
        .sort_values(by=["author"], kind="stable")
        .reset_index(drop=True)
    )
    # Suffix the partner columns so both sides can live in one frame.
    partners.columns = [f"{col}_partner" for col in partners.columns]

    paired = pd.concat([anchors, partners], axis=1)

    # Both sides are grouped by author, so the author columns must agree row by row.
    assert paired["author"].equals(paired["author_partner"])

    return paired


def pair_up_different_author(df, iterations=10):
    """Pair every row with a random row written by a *different* author.

    The frame is shuffled `iterations` times. For each row, the first shuffle
    whose candidate partner has a different author supplies the partner.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "author" column. It is not modified.
    iterations : int, default 10
        Number of random shuffles to draw candidate partners from. More
        iterations make it less likely that a row finds no valid partner.

    Returns
    -------
    pandas.DataFrame
        The original columns (rows in their original order, on a fresh 0..n-1
        index) followed by the partner's columns with a "_partner" suffix.

    Raises
    ------
    AssertionError
        If some row found no different-author partner in any iteration, or if
        any resulting pair shares an author.
    """
    n_rows = len(df)

    # Positional lookup table: `old_idx` is the row's position in `df`.
    # Built as a fresh frame so no column is assigned onto a slice of `df`.
    df_idx = pd.DataFrame(
        {"author": df["author"].reset_index(drop=True), "old_idx": range(n_rows)}
    )

    pair_idx_df_list = []
    for _ in range(iterations):
        shuffled_df = (
            df_idx.sample(frac=1, replace=False)
            .reset_index(drop=True)
            .rename(columns={"author": "author_partner", "old_idx": "old_idx_partner"})
        )
        # Row i of the original order next to row i of the shuffled order.
        pair_idx_df_list.append(pd.concat([df_idx, shuffled_df], axis=1))

    pair_idx_df = pd.concat(pair_idx_df_list, axis=0, ignore_index=True)

    # remove candidate pairs where the author is the same
    pair_idx_df = pair_idx_df[pair_idx_df["author"] != pair_idx_df["author_partner"]]

    # keep the first valid candidate per original row ...
    pair_idx_df = pair_idx_df.drop_duplicates(subset=["old_idx"])
    # ... and restore the original row order. Rows rescued by later iterations
    # sit at the end after drop_duplicates, and the positional concat below
    # would otherwise attach partners to the wrong rows.
    pair_idx_df = pair_idx_df.sort_values(by="old_idx")

    assert len(pair_idx_df) == len(df), (
        "some rows found no different-author partner; increase `iterations`"
    )

    # fetch the partner rows in the order of the original rows
    partner_df = df.iloc[pair_idx_df["old_idx_partner"].to_numpy(), :].copy()
    partner_df.columns = [str(col) + "_partner" for col in partner_df.columns]

    # concat df and its partners side by side
    pair_df = pd.concat(
        [df.reset_index(drop=True), partner_df.reset_index(drop=True)],
        axis=1,
        ignore_index=False,
    )

    # every single pair must consist of two different authors
    assert (pair_df["author"] != pair_df["author_partner"]).all()

    return pair_df

def pair_up_different_author_baseline(df):
    """Pair every row with the row at the same position of a random permutation.

    No author check is performed, so a pair can consist of two rows by the
    same author (or even of a row and itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pair up. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns (original row order, fresh 0..n-1 index) followed
        by the permuted rows' columns with a "_partner" suffix.
    """
    # Random permutation of all rows.
    shuffled = df.sample(frac=1, replace=False)

    # Suffix the partner columns so both sides can live in one frame.
    shuffled.columns = [f"{col}_partner" for col in shuffled.columns]

    # Drop both indices so the two sides are glued together purely by position.
    left = df.reset_index(drop=True)
    right = shuffled.reset_index(drop=True)
    return pd.concat([left, right], axis=1)



# Example usage with your dataframe 'df'
# result_df = pair_up_different_author(df)


def create_pair_classification_df(df, clean_columns=True):
    """Build a shuffled pair dataset with a boolean same-author label.

    Half of the pairs come from `pair_up_same_author`, the other half from
    `pair_up_different_author_baseline`. Because the baseline is an unchecked
    random permutation, some of its pairs share an author too. The label is
    therefore computed from the actual author columns rather than from which
    half a pair came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "author" column. With `clean_columns=True` it must
        also contain "metadata" and "text".
    clean_columns : bool, default True
        If true, drop the author, metadata and text columns of both sides
        after the label has been computed.

    Returns
    -------
    pandas.DataFrame
        Two rows per input row, in random order, with a boolean "label"
        column that is True when both sides have the same author.

    Raises
    ------
    AssertionError
        If the two pair frames do not have identical column names and order.
    """
    # create dataframe with same author pairs
    same_author_df = pair_up_same_author(df)
    # create dataframe with randomly permuted pairs
    different_author_df = pair_up_different_author_baseline(df)

    # make sure that the column names are all of type string
    same_author_df.columns = [str(col) for col in same_author_df.columns]
    different_author_df.columns = [str(col) for col in different_author_df.columns]

    # Both halves are stacked vertically below, so their columns must agree in
    # name and order.
    assert list(same_author_df.columns) == list(different_author_df.columns), (
        "same-author and different-author pair frames have mismatching columns"
    )

    pair_classification_df = pd.concat(
        [same_author_df, different_author_df], axis=0, ignore_index=True
    )

    # shuffle so the two halves are interleaved
    pair_classification_df = pair_classification_df.sample(frac=1, replace=False)

    pair_classification_df["label"] = (
        pair_classification_df["author"] == pair_classification_df["author_partner"]
    )

    if clean_columns:
        pair_classification_df = pair_classification_df.drop(columns=[
            "author",
            "author_partner",
            "metadata",
            "metadata_partner",
            "text",
            "text_partner"
        ])

    return pair_classification_df


def create_pair_classification_df_upsample(df, clean_columns, sample_ratio = 1):
    """Stack several independently drawn pair datasets and shuffle the result.

    Calls `create_pair_classification_df` `sample_ratio` times on a deep copy
    of `df`, so each round draws fresh random pairs. The rounds are then
    concatenated and the rows shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame handed to `create_pair_classification_df`.
    clean_columns : bool
        Forwarded to `create_pair_classification_df`.
    sample_ratio : int, default 1
        Number of rounds to stack.

    Returns
    -------
    pandas.DataFrame
        All rounds stacked in random row order. Index values of the individual
        rounds are kept, so they repeat across rounds.
    """
    rounds = [
        create_pair_classification_df(deepcopy(df), clean_columns=clean_columns)
        for _ in tqdm(range(sample_ratio))
    ]

    stacked = pd.concat(rounds, axis=0, ignore_index=False)
    return stacked.sample(frac=1, replace=False)

In [70]:
def add_embeddings_to_df(df, text2embedding):
    """Append the embeddings of both texts of every pair as extra columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "text" and "text_partner" columns.
    text2embedding : mapping
        Maps each text to its embedding (a sequence of numbers). A text that
        is missing from the mapping raises KeyError.

    Returns
    -------
    pandas.DataFrame
        `df` on a fresh 0..n-1 index, followed by one column per embedding
        dimension of "text" (labelled 0, 1, ...) and one per dimension of
        "text_partner" (labelled "0_partner", "1_partner", ...).
    """
    own_vectors = pd.DataFrame([text2embedding[t] for t in df["text"]])

    partner_vectors = pd.DataFrame([text2embedding[t] for t in df["text_partner"]])
    partner_vectors.columns = [f"{col}_partner" for col in partner_vectors.columns]

    # Reset every index so the three pieces are aligned purely by position.
    pieces = [
        frame.reset_index(drop=True)
        for frame in (df, own_vectors, partner_vectors)
    ]
    return pd.concat(pieces, axis=1, ignore_index=False)


In [71]:
# Build the labelled pair datasets for both splits. Two pairing rounds are
# stacked per split; the raw text/author/metadata columns are kept
# (clean_columns = False) because the embedding lookup needs the text columns.
train_df_paired = create_pair_classification_df_upsample(train_df, clean_columns = False, sample_ratio = 2)
test_df_paired = create_pair_classification_df_upsample(test_df, clean_columns = False, sample_ratio = 2)

100%|██████████| 2/2 [00:00<00:00, 47.18it/s]
100%|██████████| 2/2 [00:00<00:00, 92.94it/s]


In [72]:
# Attach the embedding columns of both texts to every pair.
# NOTE: this rebinds train_df / test_df from the per-chunk split frames to the
# paired feature frames, so the pairing cell cannot be re-run after this one
# without first re-creating the split.
train_df = add_embeddings_to_df(train_df_paired, text2embedding)
test_df = add_embeddings_to_df(test_df_paired, text2embedding)

In [73]:
# Keep only the label and the embedding features for modelling.
# The six raw pair columns are removed by name instead of by position
# (previously `.iloc[:, 6:]`), so re-running this cell is harmless rather than
# stripping six more columns each time. errors="ignore" makes the drop a no-op
# once the columns are gone.
RAW_PAIR_COLUMNS = [
    "metadata",
    "text",
    "author",
    "metadata_partner",
    "text_partner",
    "author_partner",
]
train_df = train_df.drop(columns=RAW_PAIR_COLUMNS, errors="ignore")
test_df = test_df.drop(columns=RAW_PAIR_COLUMNS, errors="ignore")

In [74]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# use a predictor with a neural network 



In [82]:
# Fit an AutoGluon tabular predictor on the boolean 'label' column with a
# time budget of 60*30 seconds (30 minutes), the 'medium_quality' preset and
# only the 'NN_TORCH' model family configured (default settings).
# NOTE(review): the saved leaderboard output at the end of the notebook lists
# RandomForest/ExtraTrees models, which does not look like it came from this
# configuration -- the outputs may be stale; re-run to confirm.
predictor = TabularPredictor(label='label')
predictor.fit(train_df, time_limit=60*30, 
            presets='medium_quality',
            hyperparameters={'NN_TORCH': {}})


No path specified. Models will be saved in: "AutogluonModels\ag-20240127_153458"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels\ag-20240127_153458"
AutoGluon Version:  1.0.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       2.15 GB / 15.71 GB (13.7%)
Disk Space Avail:   3.23 GB / 474.72 GB (0.7%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Train Data Rows:    20000
Train Data Columns: 3072
Label Column:       label
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [False, True]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1ea1c9e8610>

In [83]:
# Evaluate the fitted predictor on the held-out pair frame and print the
# returned metrics (the detailed report adds per-class figures and a confusion matrix).
print(predictor.evaluate(test_df, silent=False, detailed_report =True))

Evaluation: accuracy on test data: 0.64255
Evaluations on test data:
{
    "accuracy": 0.64255,
    "balanced_accuracy": 0.6451344398138645,
    "mcc": 0.2970781214324763,
    "roc_auc": 0.7112786677473117,
    "f1": 0.6048748134637705,
    "precision": 0.6962717903041099,
    "recall": 0.5346882939222201
}
Detailed (per-class) classification report:
{
    "False": {
        "precision": 0.6077753068116301,
        "recall": 0.7555805857055089,
        "f1-score": 0.6736659515223445,
        "support": 9766.0
    },
    "True": {
        "precision": 0.6962717903041099,
        "recall": 0.5346882939222201,
        "f1-score": 0.6048748134637705,
        "support": 10234.0
    },
    "accuracy": 0.64255,
    "macro avg": {
        "precision": 0.65202354855787,
        "recall": 0.6451344398138645,
        "f1-score": 0.6392703824930575,
        "support": 20000.0
    },
    "weighted avg": {
        "precision": 0.653058957414732,
        "recall": 0.64255,
        "f1-score": 0.63846

{'accuracy': 0.64255, 'balanced_accuracy': 0.6451344398138645, 'mcc': 0.2970781214324763, 'roc_auc': 0.7112786677473117, 'f1': 0.6048748134637705, 'precision': 0.6962717903041099, 'recall': 0.5346882939222201, 'confusion_matrix':        False  True 
False   7379   2387
True    4762   5472, 'classification_report': {'False': {'precision': 0.6077753068116301, 'recall': 0.7555805857055089, 'f1-score': 0.6736659515223445, 'support': 9766.0}, 'True': {'precision': 0.6962717903041099, 'recall': 0.5346882939222201, 'f1-score': 0.6048748134637705, 'support': 10234.0}, 'accuracy': 0.64255, 'macro avg': {'precision': 0.65202354855787, 'recall': 0.6451344398138645, 'f1-score': 0.6392703824930575, 'support': 20000.0}, 'weighted avg': {'precision': 0.653058957414732, 'recall': 0.64255, 'f1-score': 0.6384655261777722, 'support': 20000.0}}}


In [None]:
# Per-model scores on the test frame.
# NOTE(review): this cell has no execution count and the saved table reports a
# test accuracy (0.7246) that differs from the evaluate() output above
# (0.64255), so the table may stem from an earlier run.
leaderboard = predictor.leaderboard(test_df)

In [None]:
# Display the leaderboard table.
leaderboard

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetTorch,0.7246,0.7035,accuracy,1.886754,0.287154,62.391248,1.886754,0.287154,62.391248,1,True,5
1,WeightedEnsemble_L2,0.7246,0.7035,accuracy,1.890753,0.29126,62.820122,0.003999,0.004106,0.428874,2,True,6
2,RandomForestEntr,0.62805,0.5055,accuracy,0.958664,0.153063,238.239205,0.958664,0.153063,238.239205,1,True,2
3,RandomForestGini,0.61535,0.513,accuracy,1.223486,0.163531,185.544983,1.223486,0.163531,185.544983,1,True,1
4,ExtraTreesEntr,0.5953,0.494,accuracy,1.090229,0.14811,24.281408,1.090229,0.14811,24.281408,1,True,4
5,ExtraTreesGini,0.5895,0.481,accuracy,1.136361,0.143613,24.101726,1.136361,0.143613,24.101726,1,True,3
