# AutoGluon: Named Entity Recognition (NER)

## Objective
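This notebook demonstrates **Named Entity Recognition (NER)** using AutoGluon. NER is a token classification task that identifies and categorizes named entities (persons, organizations, locations, etc.) in text. In this notebook each token is one row of a table and AutoGluon's `TabularPredictor` predicts the entity tag for that row.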

## Use Case
NER is useful for:
- Information extraction from documents
- Customer data extraction (names, addresses, phone numbers)
- Medical entity recognition (diseases, medications, symptoms)
- Legal document analysis (parties, dates, amounts)
- Resume parsing (skills, education, experience)

## Common Entity Types
- PER: Person names
- ORG: Organizations
- LOC: Locations
- DATE: Dates and times
- MISC: Miscellaneous entities

In [None]:
!pip install -q torch torchvision torchaudio
!pip install -q autogluon

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import os
import shutil

In [None]:
# Load dataset
# TODO: Upload your NER dataset or use URL
# NER datasets typically have format:
# - Token-level annotations: Each row is a token with its label
#   Columns: 'token', 'label' (e.g., B-PER, I-PER, O)
# - Or sentence-level: 'text' column with list of tokens and 'labels' with list of tags

# Example: train_data = TabularDataset('https://your-ner-dataset-url.csv')

# Example placeholder - replace with your actual data
# train_data = TabularDataset('path/to/your/ner_data.csv')
# test_data = TabularDataset('path/to/your/test_data.csv')

train_data = None  # Replace with your data
test_data = None   # Replace with your data

if train_data is None:
    # Nothing has been loaded yet, so do not report success.
    print("No dataset loaded yet. Set train_data (and test_data) above before continuing.")
else:
    print("Dataset loaded successfully!")
    print(f"Training data shape: {train_data.shape}")
    print("\nSample data:")
    print(train_data.head(10))
    print("\nUnique entity labels:")
    # The tag column is commonly called 'label' or 'tag'; show counts for
    # the first of those names that is actually present.
    label_column = next(
        (name for name in ('label', 'tag') if name in train_data.columns),
        None,
    )
    if label_column is not None:
        print(train_data[label_column].value_counts())

In [None]:
# Name of the column that holds the value to predict.
# TODO: Replace with your label column name (e.g., 'tag', 'entity')
LABEL = "label"

In [None]:
# Auto-detect problem type based on label
# NER is a token classification task, which is a type of multi-class classification
def _infer_problem_type(frame, label):
    """Return ``(problem_type, eval_metric)`` for column ``label`` of ``frame``.

    The returned problem type is one of the names TabularPredictor accepts
    ('binary', 'multiclass', 'regression').

    * No data, an empty frame, or a missing label column falls back to
      multi-class classification, the usual shape of an NER tag set.
    * A numeric label where more than 5% of the rows hold distinct values is
      treated as regression and scored with RMSE.
    * Any other label is classification: 'binary' for exactly two distinct
      values, otherwise 'multiclass'. Both are scored with macro F1.
    """
    if frame is None or label not in frame.columns or len(frame) == 0:
        # Default to classification for NER tasks. The len() check also
        # avoids dividing by zero in the ratio below.
        return 'multiclass', 'f1_macro'

    labels = frame[label]
    n_unique = labels.nunique()

    if pd.api.types.is_numeric_dtype(labels):
        # More than 5% unique values suggests a continuous target.
        if n_unique / len(frame) > 0.05:
            return 'regression', 'rmse'

    if n_unique == 2:
        return 'binary', 'f1_macro'
    return 'multiclass', 'f1_macro'


problem_type, eval_metric = _infer_problem_type(train_data, LABEL)

print(f"Problem Type: {problem_type}")
print(f"Evaluation Metric: {eval_metric}")
print("\nNote: NER is a token classification task (sequence labeling)")

In [None]:
# Train the model
# TabularPredictor treats every row of train_data as an independent example,
# so this is per-token classification on the table's columns rather than a
# dedicated sequence-labeling model.

# Fail early with a clear message instead of an opaque error from inside fit().
if train_data is None:
    raise ValueError(
        "train_data is not set. Load your NER dataset before training."
    )
if LABEL not in train_data.columns:
    raise ValueError(
        f"Label column {LABEL!r} not found in train_data columns: "
        f"{list(train_data.columns)}"
    )

predictor = TabularPredictor(
    label=LABEL,
    problem_type=problem_type,
    eval_metric=eval_metric,
    path='./autogluon-ner-model'  # directory where the fitted models are stored
).fit(
    train_data=train_data,
    presets='medium_quality',
    time_limit=900  # training budget in seconds
)

print("Model training completed!")
print("The model can now recognize named entities in text.")

In [None]:
# Build the model leaderboard for test_data, show it, and keep a CSV copy.
leaderboard = predictor.leaderboard(data=test_data, silent=True)
print("\nModel Leaderboard:", leaderboard, sep="\n")

# Persist the table without the DataFrame index.
leaderboard.to_csv('leaderboard.csv', index=False)
print("\nLeaderboard saved to leaderboard.csv")

In [None]:
# Report feature importance on test_data and keep a CSV copy.
# This step is best-effort: any failure is printed rather than raised, so the
# rest of the notebook can still run.
try:
    feature_importance = predictor.feature_importance(data=test_data)
    print("\nFeature Importance:", feature_importance, sep="\n")

    # The index is written to the CSV (no index=False here).
    feature_importance.to_csv('feature_importance.csv')
    print("\nFeature importance saved to feature_importance.csv")
except Exception as err:
    print(f"Could not compute feature importance: {err}")

In [None]:
# Make predictions
if test_data is not None:
    predictions = predictor.predict(test_data)
    print("\nPredictions (Entity Tags):")
    print(predictions.head(20))

    # Probabilities only exist for classification. Ask the fitted predictor
    # for its problem type rather than relying on a notebook variable.
    if predictor.problem_type in ('binary', 'multiclass'):
        pred_probs = predictor.predict_proba(test_data)
        print("\nPrediction Probabilities:")
        print(pred_probs.head())

    # Example: Display recognized entities in context
    print("\nExample entity recognition:")
    print("(Replace with your own text for testing)")
    # example_text = pd.DataFrame({
    #     'token': ['John', 'works', 'at', 'Google', 'in', 'New', 'York']
    # })
    # predictions = predictor.predict(example_text)
    # for token, label in zip(example_text['token'], predictions):
    #     print(f"{token}: {label}")
else:
    # Say why nothing was printed instead of skipping silently.
    print("No test data provided; skipping predictions.")

In [None]:
# Bundle the trained model directory into a single zip archive.
model_path = './autogluon-ner-model'
zip_filename = 'autogluon_ner_model'  # archive base name; '.zip' is appended

if not os.path.exists(model_path):
    print("Model path not found. Train the model first.")
else:
    shutil.make_archive(base_name=zip_filename, format='zip', root_dir=model_path)
    print(f"\nModel artifacts saved to {zip_filename}.zip")