In [29]:
import os
import re
import shutil

import pandas as pd
from autogluon.multimodal import MultiModalPredictor
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


In [99]:
# Directory containing `train.tsv` and `test.tsv`. The original hard-coded
# location stays the default so existing runs are unaffected; set the
# POLITICAL_AFFILIATION_DATA_DIR environment variable to point elsewhere.
csv_path = os.environ.get(
    "POLITICAL_AFFILIATION_DATA_DIR",
    "C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\",
)
print(f"Loading dataset from '{csv_path}'...")

# The TSV files are read with header=None, so the column names are supplied here.
columns = [
    "id",
    "label",
    "statement",
    "subjects",
    "speaker",
    "speaker_job",
    "state",
    "party",
    "barely_true_count",
    "false_count",
    "half_true_count",
    "mostly_true_count",
    "pants_on_fire_count",
    "context"
]

# Only rows whose `party` is one of these values are kept.
TARGET_PARTIES = ['democrat', 'republican']


def load_party_statements(tsv_name, data_dir):
    """Read one split and return its (`statement`, `party`) rows.

    Args:
        tsv_name: File name of the tab-separated split, e.g. "train.tsv".
        data_dir: Directory that contains the file (with or without a
            trailing path separator).

    Returns:
        A DataFrame with the columns `statement` and `party`, restricted to
        rows whose party is in TARGET_PARTIES. The original row index is kept.
    """
    # os.path.join works whether or not data_dir ends with a separator,
    # unlike plain string concatenation.
    frame = pd.read_csv(
        os.path.join(data_dir, tsv_name),
        sep="\t",
        header=None,
        names=columns,
    )
    keep = frame['party'].isin(TARGET_PARTIES)
    return frame.loc[keep, ['statement', 'party']].copy()


df_train = load_party_statements("train.tsv", csv_path)
df_test = load_party_statements("test.tsv", csv_path)

Loading dataset from 'C:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\'...


In [108]:
# Where the tabular predictor's artifacts are written.
save_path = 'AutogluonModels/politicalAffiliation_tabular'

# Remove any directory left over from a previous run before a new predictor
# is created at the same location.
stale_model_dir_present = os.path.exists(save_path)
if stale_model_dir_present:
    print(f"Removing existing model directory: {save_path}")
    shutil.rmtree(save_path)

Removing existing model directory: AutogluonModels/politicalAffiliation_tabular


In [109]:
# Settings for the tabular predictor: predict the `party` column as a binary
# problem, score models by accuracy, and store artifacts under `save_path`.
tabular_predictor_config = dict(
    label='party',
    eval_metric='accuracy',
    problem_type='binary',
    path=save_path,
)

# Construction only; training happens in the following cell.
predictor = TabularPredictor(**tabular_predictor_config)

In [110]:
# Model families to train, selected through the `hyperparameters` argument.
# Each key is given an empty config dict.
simple_model_hyperparameters = {
    'LR': {},   # Logistic Regression
    'RF': {},   # Random Forest
    'XGB': {},  # XGBoost
    'XT': {},   # Extra Trees
}

# Kept as the cell's final expression so the fitted predictor is displayed.
predictor.fit(
    train_data=df_train,
    time_limit=1200,  # seconds, i.e. a 20-minute training budget
    presets='medium_quality_faster_train',  # preset name passed through to AutoGluon
    hyperparameters=simple_model_hyperparameters,
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       16.12 GB / 31.74 GB (50.8%)
Disk Space Avail:   39.40 GB / 464.32 GB (8.5%)
Presets specified: ['medium_quality_faster_train']
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "c:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\AutogluonModels\politicalAffiliation_tabular"
Train Data Rows:    7833
Train Data Columns: 1
Label Column:       party
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = republican, class 0 = democrat
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (republican) vs negative (democrat) class.
	To explicitly set the positive_clas

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2310bc4d0d0>

In [111]:
# Print a header, then score the trained models on the held-out test frame
# and print the resulting table as plain text.
print("\n--- Model Leaderboard (All Trained Models) ---")
leaderboard = predictor.leaderboard(data=df_test)
print(leaderboard)


--- Model Leaderboard (All Trained Models) ---
                 model  score_test  score_val eval_metric  pred_time_test  \
0  WeightedEnsemble_L2    0.639713   0.649235    accuracy        0.389026   
1           ExtraTrees    0.638690   0.623724    accuracy        0.148378   
2         RandomForest    0.637666   0.646684    accuracy        0.117628   
3          LinearModel    0.621290   0.608418    accuracy        0.205231   
4              XGBoost    0.609007   0.628827    accuracy        0.093277   

   pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0       0.120861   3.663019                 0.029744                0.000000   
1       0.046329   1.105969                 0.148378                0.046329   
2       0.056001   1.030432                 0.117628                0.056001   
3       0.142308  18.753004                 0.205231                0.142308   
4       0.018531   1.505826                 0.093277                0.018531   

   fit_t

In [115]:
# Where the multimodal predictor's checkpoints and artifacts are written.
save_path2 = 'AutogluonModels/politicalAffiliation_multimodal'

# Same target, metric and problem type as the tabular run, different predictor.
multimodal_predictor_config = {
    "label": "party",
    "eval_metric": "accuracy",
    "problem_type": "binary",
    "path": save_path2,
}
predictor2 = MultiModalPredictor(**multimodal_predictor_config)

# One-hour (3600 s) training budget. Kept as the final expression so the
# fitted predictor is displayed as the cell output.
predictor2.fit(train_data=df_train, time_limit=3600)

AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Pytorch Version:    2.7.1+cpu
CUDA Version:       CUDA is not available
GPU Count:          0
Memory Avail:       14.50 GB / 31.74 GB (45.7%)
Disk Space Avail:   39.17 GB / 464.32 GB (8.4%)

AutoMM starts to create your model. ✨✨✨

To track the learning progress, you can open a terminal and launch Tensorboard:
    ```shell
    # Assume you have installed tensorboard
    tensorboard --logdir c:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\AutogluonModels\politicalAffiliation_multimodal
    ```

Seed set to 0
GPU Count: 0
GPU Count to be Used: 0

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name              | Type                         | Params | Mode 
---------------------------------------------------------------------------
0 | model             | HFAutoModelForTextPr

Epoch 0:  50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 441/882 [05:21<05:21,  1.37it/s]                 

Epoch 0, global step 27: 'val_accuracy' reached 0.54337 (best 0.54337), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=0-step=27.ckpt' as top 3


Epoch 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 882/882 [10:21<00:00,  1.42it/s]

Epoch 0, global step 56: 'val_accuracy' reached 0.58163 (best 0.58163), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=0-step=56.ckpt' as top 3


Epoch 1:  50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 441/882 [05:25<05:25,  1.36it/s]

Epoch 1, global step 83: 'val_accuracy' reached 0.58036 (best 0.58163), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=1-step=83.ckpt' as top 3


Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 882/882 [10:42<00:00,  1.37it/s]

Epoch 1, global step 112: 'val_accuracy' reached 0.62117 (best 0.62117), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=1-step=112.ckpt' as top 3


Epoch 2:  50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 441/882 [04:30<04:30,  1.63it/s]

Epoch 2, global step 139: 'val_accuracy' reached 0.65434 (best 0.65434), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=2-step=139.ckpt' as top 3


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 882/882 [10:27<00:00,  1.40it/s]

Epoch 2, global step 168: 'val_accuracy' was not in top 3


Epoch 3:  50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 441/882 [13:32<13:32,  0.54it/s]

Epoch 3, global step 195: 'val_accuracy' reached 0.68878 (best 0.68878), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=3-step=195.ckpt' as top 3


Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 882/882 [18:32<00:00,  0.79it/s]

Epoch 3, global step 224: 'val_accuracy' reached 0.63393 (best 0.68878), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=3-step=224.ckpt' as top 3


Epoch 4:  50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 441/882 [04:06<04:06,  1.79it/s]

Epoch 4, global step 251: 'val_accuracy' was not in top 3


Epoch 4:  99%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‰| 876/882 [08:28<00:03,  1.72it/s]

Time limit reached. Elapsed time is 1:00:00. Signaling Trainer to stop.


Epoch 4:  99%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‰| 877/882 [09:34<00:03,  1.53it/s]

Epoch 4, global step 278: 'val_accuracy' reached 0.67730 (best 0.68878), saving model to 'C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\political_affiliation\\AutogluonModels\\politicalAffiliation_multimodal\\epoch=4-step=278.ckpt' as top 3


Epoch 4:  99%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‰| 877/882 [10:11<00:03,  1.43it/s]


Start to fuse 3 checkpoints via the greedy soup algorithm.
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25/25 [00:36<00:00,  0.69it/s]


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25/25 [00:15<00:00,  1.56it/s]


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25/25 [00:47<00:00,  0.52it/s]


AutoMM has created your model. 🎉🎉🎉

To load the model, use the code below:
    ```python
    from autogluon.multimodal import MultiModalPredictor
    predictor = MultiModalPredictor.load("c:\Users\kevin\Documents\GitHub\open-vRAG\political_affiliation\AutogluonModels\politicalAffiliation_multimodal")
    ```

If you are not satisfied with the model, try to increase the training time, 
adjust the hyperparameters (https://auto.gluon.ai/stable/tutorials/multimodal/advanced_topics/customization.html),
or post issues on GitHub (https://github.com/autogluon/autogluon/issues).




<autogluon.multimodal.predictor.MultiModalPredictor at 0x23108797d70>

In [120]:
# Evaluate the multimodal predictor on the held-out test frame.
# accuracy_score, f1_score, confusion_matrix and classification_report come
# from the notebook's import cell.

# Fixed label order so the confusion matrix layout does not depend on
# implicit sorting of the label values.
party_labels = ["democrat", "republican"]

y_true = df_test['party']

# Predict labels for the test set
y_pred = predictor2.predict(df_test)

# Predict probabilities for each class (not used by the metrics below)
y_pred_proba = predictor2.predict_proba(df_test)

# Accuracy
acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)

# F1 score for the "democrat" class only, because pos_label selects it.
# Use average="macro" instead for a multi-class problem.
f1 = f1_score(y_true, y_pred, pos_label="democrat")
print("F1 score:", f1)

# Detailed per-class precision / recall / F1
print(classification_report(y_true, y_pred))

# Confusion matrix: rows are true labels and columns are predicted labels,
# both in `party_labels` order.
print(confusion_matrix(y_true, y_pred, labels=party_labels))



💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 31/31 [00:10<00:00,  3.07it/s]


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 31/31 [00:12<00:00,  2.42it/s]
Accuracy: 0.6949846468781986
F1 score: 0.6227848101265823
              precision    recall  f1-score   support

    democrat       0.64      0.61      0.62       406
  republican       0.73      0.76      0.74       571

    accuracy                           0.69       977
   macro avg       0.69      0.68      0.68       977
weighted avg       0.69      0.69      0.69       977

[[246 160]
 [138 433]]



--- Model Leaderboard (All Trained Models) ---
                 model  score_test  score_val eval_metric  pred_time_test  \
0         RandomForest    0.928927     0.6956    accuracy        1.003100   
1              XGBoost    0.840793     0.7092    accuracy        0.685651   
2  WeightedEnsemble_L2    0.828649     0.7104    accuracy        2.700826   
3          LinearModel    0.710849     0.6928    accuracy        1.993018   

   pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0       0.097378   53.654550                 1.003100                0.097378   
1       0.084582   26.034742                 0.685651                0.084582   
2       0.594190  288.330255                 0.022157                0.001021   
3       0.508588  262.275979                 1.993018                0.508588   

   fit_time_marginal  stack_level  can_infer  fit_order  
0          53.654550            1       True          1  
1          26.034742            1       True

In [None]:
# NOTE(review): this cell has no execution count, and `train_data` is not
# defined in the visible part of the notebook - confirm which frame it is
# meant to be and that its target column is really named "label".
# NOTE(review): it reuses `save_path2` / `predictor2` and the same directory
# as the multimodal model trained earlier - confirm that overwriting or
# colliding with that model is intended.
save_path2 = 'AutogluonModels/politicalAffiliation_multimodal'
predictor2 = MultiModalPredictor(
    label="label",
    eval_metric="accuracy",
    problem_type="binary",
    # The constructor takes the save location as `path`, the same keyword the
    # executed multimodal cell uses; `output_directory` is not accepted.
    path=save_path2
)

# 20-minute (1200 s) training budget.
# NOTE(review): 'good_quality' may not be a valid MultiModalPredictor preset
# name - verify against the AutoMM presets list.
predictor2.fit(
    train_data,
    time_limit=1200,
    presets='good_quality',
)

In [65]:
# Show the text of the second row's `Tweet` column.
df['Tweet'].iloc[1]

'rt USER : winter haven resident / alta vista teacher is one of several recognized by USER for national teacher apprecia…'