In [29]:
import pandas as pd
import re
import os
import shutil
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import classification_report
from autogluon.multimodal import MultiModalPredictor


In [99]:
# Directory that holds the tab-separated train/test splits. The default keeps
# the original machine-specific location; set POLITICAL_AFFILIATION_DATA_DIR
# to run the notebook elsewhere without editing this cell.
csv_path = os.environ.get(
    "POLITICAL_AFFILIATION_DATA_DIR",
    "C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\",
)
print(f"Loading dataset from '{csv_path}'...")

# The TSV files are read with header=None, so the column names are supplied here.
columns = [
    "id",
    "label",
    "statement",
    "subjects",
    "speaker",
    "speaker_job",
    "state",
    "party",
    "barely_true_count",
    "false_count",
    "half_true_count",
    "mostly_true_count",
    "pants_on_fire_count",
    "context"
]

# Only rows whose `party` is one of these two values are kept (binary task).
TARGET_PARTIES = ['democrat', 'republican']


def load_party_statements(tsv_name):
    """Read one TSV split and keep only what the classifiers need.

    Parameters
    ----------
    tsv_name : str
        File name of the split inside ``csv_path`` (e.g. ``"train.tsv"``).

    Returns
    -------
    pandas.DataFrame
        The ``statement`` and ``party`` columns of the rows whose ``party``
        is listed in ``TARGET_PARTIES``; the original row index is preserved.
    """
    # os.path.join tolerates a directory given with or without a trailing separator.
    raw = pd.read_csv(
        os.path.join(csv_path, tsv_name),
        sep="\t",
        header=None,
        names=columns,
    )
    return raw.loc[raw['party'].isin(TARGET_PARTIES), ['statement', 'party']]


df_train = load_party_statements("train.tsv")
df_test = load_party_statements("test.tsv")

Loading dataset from 'C:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\'...


In [108]:
save_path = 'AutogluonModels/politicalAffiliation_tabular'

# Delete whatever a previous run left at `save_path` before a new predictor
# is created there.
stale_model_dir_found = os.path.exists(save_path)
if stale_model_dir_found:
    print(f"Removing existing model directory: {save_path}")
    shutil.rmtree(save_path)

Removing existing model directory: AutogluonModels/politicalAffiliation_tabular


In [109]:
# Binary classifier that predicts the `party` column, scored by accuracy;
# its artifacts are written under `save_path`.
tabular_predictor_options = dict(
    label='party',
    problem_type='binary',
    eval_metric='accuracy',
    path=save_path,
)
predictor = TabularPredictor(**tabular_predictor_options)

In [110]:
# Model families to train, selected through the `hyperparameters` argument.
# Empty dicts mean no per-model overrides are passed for that family.
simple_model_hyperparameters = {
    'LR': {},    # linear model (listed as "LinearModel" in the leaderboard)
    'RF': {},    # Random Forest
    'XGB': {},   # XGBoost
    'XT': {},    # Extra Trees
}

predictor.fit(
    train_data=df_train,
    time_limit=1200,  # seconds, i.e. 20 minutes
    # The training log reports this alias as mapping to 'medium_quality'.
    presets='medium_quality_faster_train',
    hyperparameters=simple_model_hyperparameters,
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       16.12 GB / 31.74 GB (50.8%)
Disk Space Avail:   39.40 GB / 464.32 GB (8.5%)
Presets specified: ['medium_quality_faster_train']
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "c:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\AutogluonModels\politicalAffiliation_tabular"
Train Data Rows:    7833
Train Data Columns: 1
Label Column:       party
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = republican, class 0 = democrat
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (republican) vs negative (democrat) class.
	To explicitly set the positive_clas

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2310bc4d0d0>

In [111]:
# Print the section banner, then score the trained models on the test split
# (`df_test`) and print the resulting table as plain text.
leaderboard_banner = "\n--- Model Leaderboard (All Trained Models) ---"
print(leaderboard_banner)

leaderboard = predictor.leaderboard(data=df_test)
print(leaderboard)


--- Model Leaderboard (All Trained Models) ---
                 model  score_test  score_val eval_metric  pred_time_test  \
0  WeightedEnsemble_L2    0.639713   0.649235    accuracy        0.389026   
1           ExtraTrees    0.638690   0.623724    accuracy        0.148378   
2         RandomForest    0.637666   0.646684    accuracy        0.117628   
3          LinearModel    0.621290   0.608418    accuracy        0.205231   
4              XGBoost    0.609007   0.628827    accuracy        0.093277   

   pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0       0.120861   3.663019                 0.029744                0.000000   
1       0.046329   1.105969                 0.148378                0.046329   
2       0.056001   1.030432                 0.117628                0.056001   
3       0.142308  18.753004                 0.205231                0.142308   
4       0.018531   1.505826                 0.093277                0.018531   

   fit_t

In [None]:
save_path2 = 'AutogluonModels/politicalAffiliation_multimodal'

# Same task definition as the tabular predictor (binary `party` label, scored
# by accuracy), but trained with MultiModalPredictor and saved to `save_path2`.
multimodal_options = {
    "label": "party",
    "problem_type": "binary",
    "eval_metric": "accuracy",
    "path": save_path2,
}
predictor2 = MultiModalPredictor(**multimodal_options)

# time_limit is in seconds: 1200 s = 20 minutes.
predictor2.fit(train_data=df_train, time_limit=1200)

AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Pytorch Version:    2.7.1+cpu
CUDA Version:       CUDA is not available
GPU Count:          0
Memory Avail:       14.54 GB / 31.74 GB (45.8%)
Disk Space Avail:   39.17 GB / 464.32 GB (8.4%)


ValueError: Unknown preset type: medium_quality_faster_train

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       12.55 GB / 31.74 GB (39.5%)
Disk Space Avail:   39.40 GB / 464.32 GB (8.5%)
Presets specified: ['medium_quality_faster_train']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "c:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\AutogluonModels\politicalAffiliation_tabular"
Train Data Rows:    69168
Train Data Columns: 1
Label Column:       Party
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = Republican, class 0 = Democrat
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (Republican) vs negative (Democrat) class.
	To explicitly set the positive_clas

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2310921fc20>


--- Model Leaderboard (All Trained Models) ---
                 model  score_test  score_val eval_metric  pred_time_test  \
0         RandomForest    0.928927     0.6956    accuracy        1.003100   
1              XGBoost    0.840793     0.7092    accuracy        0.685651   
2  WeightedEnsemble_L2    0.828649     0.7104    accuracy        2.700826   
3          LinearModel    0.710849     0.6928    accuracy        1.993018   

   pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0       0.097378   53.654550                 1.003100                0.097378   
1       0.084582   26.034742                 0.685651                0.084582   
2       0.594190  288.330255                 0.022157                0.001021   
3       0.508588  262.275979                 1.993018                0.508588   

   fit_time_marginal  stack_level  can_infer  fit_order  
0          53.654550            1       True          1  
1          26.034742            1       True

In [None]:
# Multimodal predictor trained with an explicit preset.
#
# Fixes relative to the previous version of this cell:
#   * the model directory is passed as `path` (not `output_directory`),
#     matching the other MultiModalPredictor cell in this notebook;
#   * the cell trains on `df_train` with label column "party", the only
#     training frame and label defined in this notebook (`train_data` and a
#     "label" column are not defined in any visible cell);
#   * the preset is a MultiModalPredictor preset name -- tabular preset names
#     are rejected with "ValueError: Unknown preset type".
# NOTE(review): 'medium_quality' was chosen because the captured environment
# reports no GPU; switch to 'high_quality' or 'best_quality' if training time
# allows -- confirm the intended quality level.
save_path2 = 'AutogluonModels/politicalAffiliation_multimodal'
predictor2 = MultiModalPredictor(
    label="party",
    eval_metric="accuracy",
    problem_type="binary",
    path=save_path2,
)

predictor2.fit(
    df_train,
    time_limit=1200,  # seconds, i.e. 20 minutes
    presets='medium_quality',
)

In [65]:
# Show the 'Tweet' text of the second row (position 1).
# NOTE(review): `df` is not defined in any cell visible in this notebook --
# confirm where it is loaded, or remove this cell.
df['Tweet'].iloc[1]

'rt USER : winter haven resident / alta vista teacher is one of several recognized by USER for national teacher appreciaâ€¦'