# Claim Text Mining

Use topic modeling and text classification on synthetic adjuster narratives to prioritize investigative queues.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from faker import Faker
import seaborn as sns
import matplotlib.pyplot as plt

# Robust stopword handling: prefer the NLTK English list, otherwise fall back to the
# string 'english', which tells scikit-learn's vectorizers to use their built-in list.
stop_words = 'english'
try:
    import nltk
    from nltk.corpus import stopwords

    try:
        # Assign directly instead of going through a throwaway `_`: binding `_` at the
        # top level of a notebook shadows IPython's "last output" variable.
        stop_words = stopwords.words('english')
    except LookupError:
        # Corpus not present locally: try a one-off download, then read it again.
        nltk.download('stopwords', quiet=True, raise_on_error=True)
        stop_words = stopwords.words('english')
except Exception:
    # Deliberate best-effort: a missing NLTK install, a failed download or an unreadable
    # corpus must not stop the notebook, so any failure falls back to the built-in list.
    stop_words = 'english'

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Global plotting defaults for every figure drawn later in the notebook:
# the matplotlib style sheet named "seaborn-v0_8" and seaborn's "talk" plotting context.
plt.style.use("seaborn-v0_8")
sns.set_context("talk")


## 1. Generate synthetic claim narratives

In [None]:
# Build a synthetic claims dataset: one free-text adjuster note per claim, plus the
# incident type, resolution path and severity score that were used to write that note.
faker = Faker()
# Seed Faker as well as NumPy so the generated sentences (and therefore the saved CSV)
# are reproducible from run to run, in line with the fixed seed used everywhere else.
faker.seed_instance(55)
rng = np.random.default_rng(55)
n_samples = 400

incident_types = ["Wind", "Water", "Fire", "Liability", "Theft"]
resolution_codes = ["Fast-Track", "Desk Review", "Field Inspection", "SIU Referral"]

narratives = []
incidents = []
categories = []
severities = []

for i in range(n_samples):
    incident = str(rng.choice(incident_types, p=[0.25, 0.2, 0.18, 0.2, 0.17]))
    resolution = str(rng.choice(resolution_codes, p=[0.4, 0.3, 0.2, 0.1]))
    # Severity is centred on the incident's 1-based position in incident_types
    # and clipped to the range [0, 5].
    severity_score = np.clip(rng.normal(loc=incident_types.index(incident) + 1, scale=0.8), 0, 5)
    policyholder_sentence = faker.sentence(nb_words=12)
    adjuster_observation = faker.sentence(nb_words=15)
    remediation_step = faker.sentence(nb_words=10)

    # NOTE: the narrative states both the incident type and the resolution path verbatim.
    narrative = (
        f"Incident type: {incident}. {policyholder_sentence} {adjuster_observation} "
        f"Recommended action: {remediation_step} Resolution path: {resolution}."
    )

    narratives.append(narrative)
    incidents.append(incident)
    categories.append(resolution)
    severities.append(severity_score)

text_df = pd.DataFrame(
    {
        "claim_id": np.arange(1, n_samples + 1),
        # Use the incident drawn for each narrative (previously this column was re-sampled
        # independently, so it disagreed with both the note text and the severity score).
        "incident_type": incidents,
        "resolution_path": categories,
        "severity_score": severities,
        "adjuster_notes": narratives,
    }
)

# Persist the dataset under ./data so it can be re-read from disk later.
Path("data").mkdir(parents=True, exist_ok=True)
data_path = Path("data/claim_texts.csv")
text_df.to_csv(data_path, index=False)
print(f"Dataset saved to {data_path.resolve()}")
text_df.head()

## 2. Topic modeling with LDA

In [None]:
# Reload the saved claims and turn the adjuster notes into a unigram + bigram count matrix.
df = pd.read_csv("data/claim_texts.csv")
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    min_df=5,    # drop terms that occur in fewer than 5 notes
    max_df=0.9,  # drop terms that occur in more than 90% of notes
)
doc_term = vectorizer.fit_transform(df["adjuster_notes"])

# Fit a 4-topic LDA model; topic_matrix is the per-document output of fit_transform.
lda = LatentDirichletAllocation(
    n_components=4,
    learning_method="batch",
    random_state=55,
)
topic_matrix = lda.fit_transform(doc_term)

def display_topics(model, feature_names, n_top_words=10):
    """Summarize each topic of a fitted model by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, iterated as one weight vector per topic.
        feature_names: Indexable of term names aligned with the weight-vector positions.
        n_top_words: Number of terms to list for each topic.

    Returns:
        A pandas DataFrame with one row per topic and two columns: "Topic"
        ("Topic 1", "Topic 2", ...) and "Top Terms" (comma-separated terms,
        heaviest weight first).
    """
    rows = []
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading n_top_words positions.
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = ", ".join(feature_names[pos] for pos in ranked)
        rows.append({"Topic": f"Topic {number}", "Top Terms": terms})
    return pd.DataFrame(rows)

# Show the top terms of each fitted LDA topic, using the vectorizer's vocabulary as labels.
display_topics(lda, vectorizer.get_feature_names_out())

In [None]:
# Assign each note to its highest-scoring topic column, then tabulate how each topic's
# notes split across resolution paths (row-normalised shares, rounded to 2 decimals).
topic_assignments = topic_matrix.argmax(axis=1)
topic_summary = pd.crosstab(
    index=topic_assignments,
    columns=df["resolution_path"],
    normalize="index",
).round(2)
topic_summary

## 3. Text classification to predict resolution path

In [None]:
# Hold out 20% of the claims, stratified on the label so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    df["adjuster_notes"],
    df["resolution_path"],
    test_size=0.2,
    random_state=55,
    stratify=df["resolution_path"],
)

# The TF-IDF vocabulary is learned from the training notes only; the test notes are
# projected onto that same vocabulary.
# NOTE(review): the synthetic notes written earlier in this notebook end with
# "Resolution path: <label>.", so the features contain the target verbatim and the
# scores printed below reflect label leakage rather than genuine predictive signal.
tfidf = TfidfVectorizer(min_df=3, ngram_range=(1, 2), stop_words=stop_words)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
classifier = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

print(classification_report(y_test, y_pred))

In [None]:
# Plot the 15 largest-magnitude TF-IDF coefficients for the "Fast-Track" class.
# classifier.coef_ holds one row per entry of classifier.classes_, and classes_ is in
# sorted label order, so row 0 belongs to "Desk Review" rather than "Fast-Track".
# Look the row up by label so the bars match the plot title.
target_class = "Fast-Track"
class_idx = list(classifier.classes_).index(target_class)

feature_to_coeff = zip(tfidf.get_feature_names_out(), classifier.coef_[class_idx])
# Rank by absolute value so strongly negative tokens are kept alongside positive ones.
top_tokens = sorted(feature_to_coeff, key=lambda x: abs(x[1]), reverse=True)[:15]
token_df = pd.DataFrame(top_tokens, columns=["token", "coefficient"])

plt.figure(figsize=(8, 6))
sns.barplot(data=token_df, x="coefficient", y="token", palette="rocket")
plt.title(f"Influential Tokens for {target_class} vs Other Resolutions")
plt.tight_layout()
plt.show()