In [8]:
from datasets import load_dataset
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
import re
from transformers import pipeline
import torch
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import joblib
import sys
import os
sys.path.append('../app')
from text_cleaning_module import TextCleaner, load_clean_filtered_dataset

In [10]:
# Load the train/validation/test frames, then give each one a `clean_case`
# column by running a single shared TextCleaner over its `full_case` column.
# The recorded run shows this step taking several minutes per full pass.
df_train, df_val, df_test = load_clean_filtered_dataset()
cleaner = TextCleaner()
for split_frame in (df_train, df_val, df_test):
    split_frame["clean_case"] = cleaner.transform(split_frame["full_case"])

100%|██████████| 3175/3175 [08:44<00:00,  6.06it/s] 
100%|██████████| 453/453 [01:25<00:00,  5.31it/s]
100%|██████████| 908/908 [02:12<00:00,  6.86it/s]


## Class Action Sought Model

In [14]:
# Coarse case groups, each listing the fine-grained `case_type` labels it absorbs.
# Group order and label order are significant only in that they fix the
# insertion order of `group_map` below.
_case_types_by_group = {
    "Criminal Justice": [
        "Immigration and/or the Border",
        "Prison Conditions",
        "Jail Conditions",
        "Policing",
        "National Security",
        "Criminal Justice (Other)",
    ],
    "Civil Rights": [
        "Equal Employment",
        "Fair Housing/Lending/Insurance",
        "Disability Rights-Pub. Accom.",
        "Speech and Religious Freedom",
        "Election/Voting Rights",
    ],
    "Social Welfare": [
        "Public Benefits / Government Services",
        "Public Accomm./Contracting",
        "Public Housing",
        "Child Welfare",
        "Nursing Home Conditions",
    ],
}

# Invert the table into the flat lookup used for mapping: case_type -> group.
group_map = {
    case_type: group_name
    for group_name, case_types in _case_types_by_group.items()
    for case_type in case_types
}


# Attach a coarse `case_group` label to every split. Any `case_type` that
# `group_map` does not cover maps to a missing value, which is then filled
# with the catch-all label "Other".
for df in (df_train, df_val, df_test):
    grouped_labels = df["case_type"].map(group_map)
    df["case_group"] = grouped_labels.fillna("Other")

# Show how many training rows fall into each group.
train_group_counts = df_train["case_group"].value_counts()
print(train_group_counts)


# Binary target: "Yes"/"No" in `class_action_sought` encoded as 1/0
# (any other value would map to a missing value).
label_map = {"Yes": 1, "No": 0}
y_train_sought, y_val_sought, y_test_sought = (
    split["class_action_sought"].map(label_map)
    for split in (df_train, df_val, df_test)
)

# Multi-class target: the coarse case group of each row.
y_train_group, y_val_group, y_test_group = (
    split["case_group"] for split in (df_train, df_val, df_test)
)

# Model input: the cleaned case text of each split.
X_train, X_val, X_test = (
    split["clean_case"] for split in (df_train, df_val, df_test)
)

case_group
Civil Rights        1438
Criminal Justice     892
Other                588
Social Welfare       257
Name: count, dtype: int64


In [24]:
# Registers tqdm's progress_* helpers on pandas objects; nothing in this cell
# calls them, the call is kept so the notebook state is unchanged.
tqdm.pandas()

# Fixed hyperparameters for the class-action-sought classifier, keyed the way
# a sklearn Pipeline names step parameters ("<step>__<param>").
# NOTE(review): presumably the winners of an earlier grid search - confirm.
best_params = {
    "tfidf__max_features": 5000,
    "tfidf__ngram_range": (1, 2),
    "clf__C": 10
}

# Rebuild the pipeline: TF-IDF features feeding a logistic-regression classifier
final_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=best_params["tfidf__max_features"],
        ngram_range=best_params["tfidf__ngram_range"]
    )),
    ("clf", LogisticRegression(
        C=best_params["clf__C"],
        max_iter=1000,
        verbose=1
    ))
])

# Fit on full training set
final_pipe.fit(X_train, y_train_sought)

# Predict on the held-out test set (the validation split is not used in this cell)
y_test_pred = final_pipe.predict(X_test)

# Label encoding is No=0 / Yes=1, so target_names is listed in that order.
print("Test Accuracy:", accuracy_score(y_test_sought, y_test_pred))
print(classification_report(y_test_sought, y_test_pred, target_names=["No", "Yes"]))

# joblib.dump does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
os.makedirs("models", exist_ok=True)
joblib.dump(final_pipe, "models/best_case_action_sought.joblib")


Test Accuracy: 0.9174008810572687
              precision    recall  f1-score   support

          No       0.92      0.96      0.94       619
         Yes       0.91      0.82      0.86       289

    accuracy                           0.92       908
   macro avg       0.92      0.89      0.90       908
weighted avg       0.92      0.92      0.92       908



['models/best_case_action_sought.joblib']

## Case Type Modeling

In [29]:
# Case-group classifier: unigram TF-IDF features feeding a logistic regression.
# class_weight='balanced' reweights classes inversely to their frequency,
# which matters here because the groups are unevenly sized.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 1),
    )),
    ("clf", LogisticRegression(C=10, max_iter=1500, class_weight='balanced', verbose=2))
])


# Fit on the training split and predict the held-out test split.
pipe.fit(X_train, y_train_group)
y_pred = pipe.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test_group, y_pred))
print(classification_report(y_test_group, y_pred))

# joblib.dump does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
os.makedirs("models", exist_ok=True)
joblib.dump(pipe, "models/best_case_type_model.joblib")

Accuracy: 0.8931718061674009
                  precision    recall  f1-score   support

    Civil Rights       0.94      0.92      0.93       421
Criminal Justice       0.86      0.95      0.90       249
           Other       0.87      0.80      0.83       170
  Social Welfare       0.77      0.75      0.76        68

        accuracy                           0.89       908
       macro avg       0.86      0.85      0.86       908
    weighted avg       0.89      0.89      0.89       908



['models/best_case_type_model.joblib']