In [14]:
# %pip install "autogluon.tabular==1.4.0"  # 1.4.0 is the version reported in the recorded training log

import pandas as pd
import re
import os
import shutil
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import classification_report
import emoji

# --- 1. Load Data ---
# NOTE: machine-specific absolute path; point this at your local copy of processed.csv.
csv_path = "C:\\Users\\kevin\\Documents\\GitHub\\open-vRAG\\sentiment-analysis\\processed.csv"
if not os.path.exists(csv_path):
    # Raise instead of calling a bare sys.exit(): sys.exit() without an argument
    # terminates with a "success" status and appears in a notebook only as an
    # opaque SystemExit, hiding the real cause from Run-All / batch execution.
    raise FileNotFoundError(
        f"File '{csv_path}' not found. Please run 'preprocess.py' first."
    )

print(f"Loading dataset from '{csv_path}'...")
df = pd.read_csv(csv_path)

# Name of the column holding the raw text; change to your actual column name.
text_col = 'text'
if text_col not in df.columns:
    # Fail here with a clear message rather than later, in the cleaning step.
    raise KeyError(
        f"Column '{text_col}' not found in '{csv_path}'. "
        f"Available columns: {list(df.columns)}"
    )

# --- Full cleaning function ---
def clean_text(text):
    """Normalise a single text value.

    Non-string inputs (for example NaN or None) are returned unchanged.
    For strings the following steps run in this order:

    1. Runs of whitespace-separated ``USER`` tokens collapse to one ``USER``.
    2. Substrings starting with ``http`` or ``www.`` are removed up to the
       next whitespace.
    3. Emojis are removed via ``emoji.replace_emoji``.
    4. Every character other than ASCII letters, digits, whitespace and
       ``. , ! ?`` is dropped.
    5. Whitespace runs become a single space; the ends are trimmed.
    6. The result is lower-cased.
    """
    # Guard clause: leave anything that is not a string exactly as it came in.
    if not isinstance(text, str):
        return text

    mentions_collapsed = re.sub(r'\bUSER\b(?:\s+\bUSER\b)+', 'USER', text)
    urls_removed = re.sub(r'http\S+|www\.\S+', '', mentions_collapsed)
    emoji_removed = emoji.replace_emoji(urls_removed, replace='')
    allowed_chars_only = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', emoji_removed)
    single_spaced = re.sub(r'\s+', ' ', allowed_chars_only).strip()

    return single_spaced.lower()

# --- Apply cleaning ---
# Overwrites the text column in place; clean_text returns non-string cells unchanged.
df[text_col] = df[text_col].apply(clean_text)


# --- 2. Define Label ---
# Despite the "3class" column name, the recorded run below shows two classes (-1 and 1).
label_col = 'sentiment_3class'
print(f"Unique classes: {df[label_col].unique()}")

# `data` is just another name for `df` (no copy), so it already holds the cleaned text.
data = df

# --- 3. Split Data ---
# 80/20 split with a fixed seed, stratified so both parts keep the same label proportions.
train_df, test_df = train_test_split(
    data,
    stratify=data[label_col],
    test_size=0.2,
    random_state=0,
)

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")


Loading dataset from 'C:\Users\kevin\Documents\GitHub\open-vRAG\sentiment-analysis\processed.csv'...
Unique classes: [-1  1]
Training set size: 1280000
Testing set size: 320000


In [17]:
# Write the cleaned frame next to the input file instead of repeating a second
# machine-specific absolute path; with the default `csv_path` this resolves to
# the same "...\sentiment-analysis\processed_cleaned.csv" location as before.
cleaned_csv_path = os.path.join(os.path.dirname(csv_path), "processed_cleaned.csv")

# `df` has had its text column cleaned in place by this point.
df.to_csv(cleaned_csv_path, index=False)  # index=False avoids saving row numbers

In [22]:
save_path = 'AutogluonModels/sentiment-analysis_tabular'

# Start from an empty model directory: any models saved by a previous run
# under `save_path` are deleted before the new predictor is created.
if os.path.exists(save_path):
    print(f"Removing existing model directory: {save_path}")
    shutil.rmtree(save_path)

predictor = TabularPredictor(
    # Reuse `label_col` (the column the train/test split was stratified on)
    # rather than repeating the literal, so the two cannot drift apart.
    label=label_col,
    eval_metric='accuracy',
    problem_type='binary',
    path=save_path
)

Removing existing model directory: AutogluonModels/sentiment-analysis_tabular


In [23]:
predictor.fit(
    train_data=train_df,
    # Per the training log, this preset name is an alias that maps to 'medium_quality'.
    presets='medium_quality_faster_train',
    time_limit=20 * 60,  # seconds (1200 s = 20 minutes)
    # Restrict training to these model families via `hyperparameters`;
    # an empty dict means "use that model's default settings".
    hyperparameters={
        'LR': {},   # Logistic Regression
        'RF': {},   # Random Forest
        'XGB': {},  # XGBoost
        'XT': {},   # Extra Trees
    },
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       10.62 GB / 31.74 GB (33.4%)
Disk Space Avail:   22.58 GB / 464.32 GB (4.9%)
Presets specified: ['medium_quality_faster_train']
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "c:\Users\kevin\Documents\GitHub\open-vRAG\sentiment-analysis\AutogluonModels\sentiment-analysis_tabular"
Train Data Rows:    1280000
Train Data Columns: 1
Label Column:       sentiment_3class
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = -1
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (1) vs negative (-1) class.
	To explicitly set the positive_class, either rename clas

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2a83c988ec0>

In [25]:
print("\n--- Model Leaderboard (All Trained Models) ---")
# Score every trained model on the held-out test split.
leaderboard = predictor.leaderboard(test_df)
# End the cell with the frame itself (not print) so the notebook renders it as
# one table; print() wraps this wide frame into several column chunks.
leaderboard


--- Model Leaderboard (All Trained Models) ---
                 model  score_test  score_val eval_metric  pred_time_test  \
0              XGBoost    0.751788   0.756172    accuracy        9.816408   
1  WeightedEnsemble_L2    0.751788   0.756172    accuracy        9.853835   
2           ExtraTrees    0.742425   0.744766    accuracy        2.992769   
3         RandomForest    0.740112   0.743906    accuracy        1.795156   

   pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0       0.149039   77.180781                 9.816408                0.149039   
1       0.152046   77.208922                 0.037427                0.003008   
2       0.165528  405.739315                 2.992769                0.165528   
3       0.079916  293.299355                 1.795156                0.079916   

   fit_time_marginal  stack_level  can_infer  fit_order  
0          77.180781            1       True          3  
1           0.028141            2       True