In [1]:
import os
import numpy as np
import pandas as pd
import torch
import joblib
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from dataloading import get_hasib18_fns

# Ensure NLTK resources are downloaded
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

# Text preprocessing function (same as in training)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def load_dataset():
    """Load the original dataset used for training"""
    train_df, test_df = get_hasib18_fns()
    
    # Preprocess text
    train_df['processed_text'] = train_df['text'].apply(preprocess_text)
    test_df['processed_text'] = test_df['text'].apply(preprocess_text)
    
    # Concatenate for full dataset
    full_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
    
    return full_df

# Load dataset only once
print("Loading dataset...")
dataset = load_dataset()
print(f"Dataset loaded with {len(dataset)} entries.")


  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Dataset loaded with 72134 entries.


In [None]:
def run_inference(model_type, index=None):
    """
    Run inference on a specific entry using the preloaded dataset.
    
    Args:
        model_type: 'gru', 'lstm', or 'svm'
        index: Specific index to run inference on (optional)
    """
    model_dir = 'models'
    
    if index is not None:
        if index < 0 or index >= len(dataset):
            print(f"Error: Index {index} is out of bounds. Dataset has {len(dataset)} entries.")
            return
    
    if model_type.lower() == 'svm':
        try:
            model_package = joblib.load(os.path.join(model_dir, 'gpu_svm_model.pkl'))
            vectorizer = model_package['vectorizer']
            model = model_package['model']
            print("Loaded GPU SVM model")
        except:
            model = joblib.load(os.path.join(model_dir, 'svm_model.pkl'))
            print("Loaded CPU SVM model")
        
        label_encoder = joblib.load(os.path.join(model_dir, 'label_encoder.pkl'))
        
        def get_prediction(text):
            processed = preprocess_text(text)
            features = vectorizer.transform([processed])
            pred_idx = model.predict(features)[0]
            return label_encoder.inverse_transform([pred_idx])[0]
    
    elif model_type.lower() in ['gru', 'lstm']:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")
        
        model_name = model_type.lower()
        label_encoder = joblib.load(os.path.join(model_dir, f'label_encoder{"_lstm" if model_name == "lstm" else ""}.pkl'))
        vocab = joblib.load(os.path.join(model_dir, f'vocab{"_lstm" if model_name == "lstm" else ""}.pkl'))
        
        try:
            model = torch.load(os.path.join(model_dir, f'complete_{model_name}_model.pt'))
            print(f"Loaded complete {model_name.upper()} model")
        except:
            config = joblib.load(os.path.join(model_dir, f'{model_name}_config.pkl'))
            from torch import nn
            
            class GRUClassifier(nn.Module):
                def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1, dropout=0.5):
                    super().__init__()
                    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                    self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, batch_first=True, dropout=dropout if n_layers > 1 else 0)
                    self.fc = nn.Linear(hidden_dim * 2, output_dim)
                    self.dropout = nn.Dropout(dropout)
                
                def forward(self, text, text_lengths):
                    embedded = self.dropout(self.embedding(text))
                    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
                    packed_output, hidden = self.gru(packed_embedded)
                    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                    return self.fc(hidden)
            
            class LSTMClassifier(nn.Module):
                def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1, dropout=0.5):
                    super().__init__()
                    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, batch_first=True, dropout=dropout if n_layers > 1 else 0)
                    self.fc = nn.Linear(hidden_dim * 2, output_dim)
                    self.dropout = nn.Dropout(dropout)
                
                def forward(self, text, text_lengths):
                    embedded = self.dropout(self.embedding(text))
                    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
                    packed_output, (hidden, cell) = self.lstm(packed_embedded)
                    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                    return self.fc(hidden)
            
            model_class = GRUClassifier if model_name == 'gru' else LSTMClassifier
            model = model_class(config['INPUT_DIM'], config['EMBEDDING_DIM'], config['HIDDEN_DIM'], config['OUTPUT_DIM'], config['N_LAYERS'], config['DROPOUT'])
            model.load_state_dict(torch.load(os.path.join(model_dir, f'best_{model_name}_model.pt')))
            print(f"Loaded {model_name.upper()} model from config and weights")
        
        model = model.to(device)
        model.eval()
        
        def get_prediction(text):
            processed = preprocess_text(text)
            tokens = word_tokenize(processed)
            indices = [vocab.get(token, 1) for token in tokens]
            text_tensor = torch.tensor(indices).unsqueeze(0).to(device)
            length_tensor = torch.tensor([len(indices)]).to(device)
            
            with torch.no_grad():
                output = model(text_tensor, length_tensor)
                predicted_idx = torch.argmax(output, dim=1).item()
            
            return label_encoder.inverse_transform([predicted_idx])[0]
    
    else:
        print(f"Error: Unknown model type '{model_type}'. Use 'gru', 'lstm', or 'svm'.")
        return
    
    if index is not None:
        entry = dataset.iloc[index]
        prediction = get_prediction(entry['text'])
        print("\n" + "="*80)
        print(f"Entry #{index}:")
        print("="*80)
        print(f"Text: {entry['text']}")
        print("-"*80)
        print(f"Actual label: {entry['label']}")
        print(f"Predicted label: {prediction}")
        print(f"Prediction {'correct' if prediction == entry['label'] else 'incorrect'}")
        print("="*80)
    else:
        print("\nRunning inference on first 5 entries:")
        for i in range(min(5, len(dataset))):
            entry = dataset.iloc[i]
            prediction = get_prediction(entry['text'])
            print(f"\nEntry #{i}: {entry['text'][:100]}...")
            print(f"Actual: {entry['label']}, Predicted: {prediction}")
            print("-" * 80)



In [4]:
if __name__ == "__main__":
    model = 'lstm'  # 'lstm'/'svm'
    index = 30      #  Can be none
    
    # Run inference with the specified model and index
    run_inference(model, index)

Using device: cuda


  model = torch.load(os.path.join(model_dir, f'complete_{model_name}_model.pt'))
  model.load_state_dict(torch.load(os.path.join(model_dir, f'best_{model_name}_model.pt')))


Loaded LSTM model from config and weights

Entry #30:
Text: U.S. senator says panel could take up Russia sanctions bill this summer WASHINGTON (Reuters) - The Republican chairman of the U.S. Senate Foreign Relations Committee said on Thursday the panel could take up a bill as soon as this summer to impose new sanctions on Russia over its alleged interference in the 2016 U.S. presidential election. Senator Bob Corker said the panel could move forward on sanctions after hearing from U.S. Secretary of State Rex Tillerson. Some members of the committee, particularly Democrats, had wanted to act more quickly on sanctions over Russian activities that U.S. intelligence agencies concluded were intended to help get Republican Donald Trump elected. Corker said he expects Tillerson to report to the committee within weeks about the Trump administration’s policy toward Russia and the situation in Syria. Corker said he would be willing to consider a sanctions bill soon after that if, as he expected,