# FEATURE ENGINEERING PIPELINE

## Imports & Dataset

### Setup

In [1]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import sys

# Add the folder path where rouge is installed (change this to your actual path)
# NOTE(review): this is a hard-coded, machine-specific Windows path, while the
# dataset cell reads from "/content/..."; adjust or remove it when running on
# another machine.
sys.path.append(r"C:\Users\Ait El Mouddene\miniconda3\envs\py311-env\Lib\site-packages")

In [15]:
import json
import spacy

In [3]:
import csv
import pickle
from math import ceil

import pandas as pd
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
import nltk
import lexnlp
from lexnlp.extract.en.entities.nltk_re import get_companies
from lexnlp.extract.en import conditions, acts, durations, dates, constraints

In [None]:
# Fetch the NLTK resources used by this notebook (presumably required by the
# LexNLP extractors imported above - confirm before trimming the list).
for resource in (
    'averaged_perceptron_tagger',
    'punkt',
    'wordnet',
    'stopwords',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

### Dataset

In [6]:
# Load the preprocessed corpus. The cells below read the columns 'sentences',
# 'sentences_tf_idf' and 'summary/long' from this frame.
df = pd.read_parquet("/content/preprocessed_legal_data.parquet")

## Utils

In [14]:
def load_legal_terms(csv_path='legal_terms.csv'):
    """
    Load legal terms from a CSV file.

    The first row is treated as a header and skipped. Every remaining row
    contributes the value of its third column, stripped of surrounding
    whitespace and lower-cased. Rows with fewer than three columns, or with
    a blank third column, are ignored.

    Args:
        csv_path: Path to the CSV file containing legal terms

    Returns:
        A set of lower-cased legal terms (empty when the file has no data rows)
    """
    # newline='' is the mode the csv module expects (quoted fields may contain
    # line breaks); the explicit encoding avoids platform-dependent defaults.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        legal_terms = {
            row[2].strip().lower()  # Term is in the third column
            for row in reader
            if len(row) >= 3 and row[2].strip()
        }

    print(f"Loaded {len(legal_terms)} legal terms from {csv_path}")
    return legal_terms

## Entities Extraction

In [7]:
def calculate_entity_features(df):
    """
    Count LexNLP entity mentions for every sentence of every document.

    Args:
        df: DataFrame whose 'sentences' column holds, per row, the iterable
            of sentences belonging to that document.

    Returns:
        A list with one entry per document; each entry is a list holding one
        dict of entity counts per sentence.
    """
    print("Calculating entity recognition features with LexNLP...")

    # (feature name, extractor) pairs, listed in the order they are emitted
    extractors = (
        ('count_act_mention', acts.get_acts),
        ('count_company_mentions', get_companies),
        ('count_conditions', conditions.get_conditions),
        ('count_durations', durations.get_durations),
        ('count_constraints', constraints.get_constraints),
        ('count_num_dates', dates.get_dates),
    )

    def count_entities(sentence):
        # Each extractor's result is materialised with list() so it can be counted
        return {name: len(list(extract(sentence))) for name, extract in extractors}

    progress = tqdm(df.iterrows(), total=len(df), desc="Processing documents for entity extraction")
    return [
        [count_entities(sentence) for sentence in record['sentences']]
        for _, record in progress
    ]

## POS Features

In [8]:
# Calculate POS tag features for each sentence
def calculate_pos_features(df):
    """
    Compute part-of-speech ratios and a legal-term count for each sentence.

    Every sentence in df['sentences'] is parsed with the spaCy model
    'en_core_web_sm'. Each ratio is the number of tokens carrying the given
    coarse POS tag divided by the total number of tokens in the sentence.
    The legal-term count is the number of tokens whose lower-cased text is
    found in the set returned by load_legal_terms(), so only single-token
    terms can match. Whitespace-only sentences and sentences that yield no
    tokens receive all-zero features.

    Returns:
        A list with one entry per document; each entry is a list holding one
        feature dict per sentence.
    """
    print("Calculating POS tag features...")
    nlp = spacy.load('en_core_web_sm')
    # Basic legal-term lookup - could be expanded
    legal_terms = load_legal_terms()

    # (feature name, spaCy coarse POS tag) pairs, in output order
    ratio_tags = (
        ('noun_ratio', 'NOUN'),
        ('verb_ratio', 'VERB'),
        ('adj_ratio', 'ADJ'),
        ('adv_ratio', 'ADV'),
        ('prop_noun_ratio', 'PROPN'),
        ('num_ratio', 'NUM'),
    )

    def zero_features():
        # Fresh dict on every call so the entries are never shared
        features = {name: 0.0 for name, _ in ratio_tags}
        features['legal_term_count'] = 0
        return features

    def sentence_features(sentence):
        # Skip empty sentences without invoking the parser
        if not sentence.strip():
            return zero_features()

        doc = nlp(sentence)
        n_tokens = len(doc)
        if n_tokens == 0:
            return zero_features()

        # Tally tokens per POS tag
        tag_counts = {}
        for token in doc:
            tag_counts[token.pos_] = tag_counts.get(token.pos_, 0) + 1

        features = {name: tag_counts.get(tag, 0) / n_tokens for name, tag in ratio_tags}
        features['legal_term_count'] = sum(token.text.lower() in legal_terms for token in doc)
        return features

    all_pos_features = []
    for _, record in tqdm(df.iterrows(), total=len(df), desc="Processing POS tags"):
        # Use original sentences for POS analysis
        all_pos_features.append([sentence_features(s) for s in record['sentences']])

    return all_pos_features

## Position Features

In [9]:
# 3. Position features and sentence length
def calculate_position_features(df):
    """
    Compute position and length features for each sentence.

    For the sentence at index i of a document with n sentences:
    - rel_position: i / n
    - is_first_quarter / is_last_quarter: 1 when rel_position is <= 0.25 /
      >= 0.75, else 0
    - is_first / is_last: 1 for the first / last sentence, else 0
    - sentence_length: number of whitespace-separated tokens
    - sentence_parity: i % 2 (0 for even, 1 for odd positions)

    Returns:
        A list with one entry per document; each entry is a list holding one
        feature dict per sentence of df['sentences'].
    """
    print("Calculating position and length features...")

    def describe(index, sentence, n_sentences):
        rel_position = index / n_sentences if n_sentences > 0 else 0
        return {
            'rel_position': rel_position,
            'is_first_quarter': int(rel_position <= 0.25),
            'is_last_quarter': int(rel_position >= 0.75),
            'is_first': int(index == 0),
            'is_last': int(index == n_sentences - 1),
            'sentence_length': len(sentence.split()),
            'sentence_parity': index % 2,
        }

    all_position_features = []
    for _, record in tqdm(df.iterrows(), total=len(df), desc="Processing documents"):
        sentences = record['sentences']
        n_sentences = len(sentences)
        all_position_features.append(
            [describe(i, s, n_sentences) for i, s in enumerate(sentences)]
        )

    return all_position_features

## ROUGE Scores

In [10]:
# 2. ROUGE-based labeling
def calculate_rouge_scores(df):
    """
    Score every sentence against its document's reference summary.

    Each sentence in df['sentences'] is compared with the same row's
    'summary/long' value, and the F-measures of ROUGE-1, ROUGE-2 and ROUGE-L
    are recorded. Any exception raised while scoring a sentence (e.g. an
    empty sentence) results in all-zero scores for that sentence.

    Returns:
        A list with one entry per document; each entry is a list holding one
        dict with keys 'rouge1_f', 'rouge2_f' and 'rougeL_f' per sentence.
    """
    print("Calculating ROUGE scores for each sentence...")
    rouge = Rouge()

    # (output key, key in the scorer's result) pairs
    metric_keys = (
        ('rouge1_f', 'rouge-1'),
        ('rouge2_f', 'rouge-2'),
        ('rougeL_f', 'rouge-l'),
    )

    def score_sentence(sentence, reference):
        try:
            result = rouge.get_scores(sentence, reference)[0]
            return {out_key: result[src_key]['f'] for out_key, src_key in metric_keys}
        except Exception:
            # Best effort: a sentence that cannot be scored gets zeros
            return {out_key: 0.0 for out_key, _ in metric_keys}

    all_sentence_scores = []
    for _, record in tqdm(df.iterrows(), total=len(df), desc="Processing documents"):
        sentences = record['sentences']
        reference = record['summary/long']
        all_sentence_scores.append([score_sentence(s, reference) for s in sentences])

    return all_sentence_scores

## TF-IDF & Cosine Similarity Features

In [11]:
def fit_and_save_tfidf(df, save_path="tfidf_data.pkl", max_features=5000):
    """
    Fit a TF-IDF vectorizer on all sentences of df and pickle the result.

    Args:
        df: Object whose 'sentences_tf_idf' column holds one sequence of
            sentences per document.
        save_path: Destination of the pickle file.
        max_features: Vocabulary size limit passed to TfidfVectorizer.

    Returns:
        (tfidf_vectorizer, tfidf_matrix): the fitted vectorizer and the
        matrix with one row per sentence, in document order.
    """
    print("Fitting TF-IDF on corpus...")

    # Flatten the per-document sentence lists into a single corpus
    all_sentences = []
    for sentence_list in df['sentences_tf_idf']:
        all_sentences.extend(sentence_list)

    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_sentences)

    # The sentences are stored too so matrix rows can be mapped back to text
    payload = {
        "vectorizer": tfidf_vectorizer,
        "matrix": tfidf_matrix,
        "sentences": all_sentences,
    }
    with open(save_path, "wb") as f:
        pickle.dump(payload, f)

    print(f"TF-IDF vectorizer and matrix saved to {save_path}")

    return tfidf_vectorizer, tfidf_matrix

In [12]:
def transform_and_save_tfidf(df, tfidf_vectorizer, save_path="tfidf_data.pkl", max_features=5000):
    """
    Vectorize the sentences of df with an already fitted TF-IDF vectorizer
    and pickle the result.

    The vectorizer is applied with `transform`, not `fit_transform`: it is
    left untouched, so the vocabulary and IDF weights learned on the
    training data are reused and the resulting columns line up with the
    training matrix.

    Args:
        df: Object whose 'sentences_tf_idf' column holds one sequence of
            sentences per document.
        tfidf_vectorizer: A fitted vectorizer exposing `transform`.
        save_path: Destination of the pickle file. The default is the same
            as fit_and_save_tfidf's, so pass a different path to keep both
            files.
        max_features: Unused; kept only for backward compatibility of the
            signature (the vocabulary is fixed by the fitted vectorizer).

    Returns:
        (tfidf_vectorizer, tfidf_matrix): the vectorizer that was passed in
        and the matrix with one row per sentence, in document order.
    """
    print("Transforming corpus with fitted TF-IDF vectorizer...")

    # Flatten all sentences
    all_sentences = [s for sentence_list in df['sentences_tf_idf'] for s in sentence_list]
    tfidf_matrix = tfidf_vectorizer.transform(all_sentences)

    # Save both vectorizer and matrix
    with open(save_path, "wb") as f:
        pickle.dump({
            "vectorizer": tfidf_vectorizer,
            "matrix": tfidf_matrix,
            "sentences": all_sentences  # Needed to map matrix rows back
        }, f)

    print(f"TF-IDF vectorizer and matrix saved to {save_path}")
    return tfidf_vectorizer, tfidf_matrix

In [13]:
def calculate_tfidf_features(df, tfidf_matrix):
    """
    Calculates sentence-level features using a precomputed TF-IDF matrix:
    - avg_tfidf
    - max_tfidf
    - sum_tfidf
    - centroid_similarity (cosine similarity to the document centroid)

    Rows of `tfidf_matrix` are consumed in order: the first document of
    df['sentences_tf_idf'] owns the first len(document) rows, and so on, so
    the matrix must have been built from the same sentences in the same
    order.

    Args:
        df: Object whose 'sentences_tf_idf' column holds one sequence of
            sentences per document.
        tfidf_matrix: Sparse matrix with one row per sentence (it must
            support row slicing and `.toarray()`).

    Returns:
        A list with one entry per document; each entry is a list holding one
        feature dict per sentence. Documents without sentences yield an
        empty list. Values are plain Python floats.

    Raises:
        IndexError: if the documents contain more sentences than the matrix
            has rows.
    """
    print("Calculating TF-IDF features using precomputed matrix...")

    n_rows = tfidf_matrix.shape[0]
    sentence_features = []
    row_start = 0

    for sentence_list in tqdm(df['sentences_tf_idf'], desc="Processing documents"):
        # len() rather than truthiness: the column may hold array-like
        # values, for which bool() is ambiguous.
        n_sentences = len(sentence_list)
        if n_sentences == 0:
            # No rows to read and no centroid to compute for an empty document
            sentence_features.append([])
            continue

        row_end = row_start + n_sentences
        if row_end > n_rows:
            # Slicing would silently truncate, so fail loudly instead
            raise IndexError(
                f"TF-IDF matrix has {n_rows} rows but at least {row_end} "
                "sentences were requested; was it built from a different corpus?"
            )

        # Densify only this document's rows, in one slice
        doc_vectors = tfidf_matrix[row_start:row_end].toarray()
        row_start = row_end

        centroid = doc_vectors.mean(axis=0).reshape(1, -1)
        centroid_similarities = cosine_similarity(doc_vectors, centroid).ravel()

        # float() turns numpy scalars into plain floats so the features can
        # be serialised (e.g. to JSON) regardless of the matrix dtype.
        sentence_features.append([
            {
                'avg_tfidf': float(vec.mean()),
                'max_tfidf': float(vec.max()),
                'sum_tfidf': float(vec.sum()),
                'centroid_similarity': float(similarity)
            }
            for vec, similarity in zip(doc_vectors, centroid_similarities)
        ])

    return sentence_features

## Combine & save Features

### Generate Features

LexNLP requires Python 3.6, so `calculate_entity_features` was run in a separate environment; here we simply load its output from a JSON file.

In [16]:
# Entity features are not recomputed here; the call is kept for reference:
# ner_features = calculate_entity_features(df)

# Load the precomputed entity counts instead. The file is expected to hold one
# list per document containing one dict of counts per sentence (the shape
# calculate_entity_features returns) - TODO confirm against the exporting run.
with open('ner_features.json', 'r') as file:
    ner_features = json.load(file)

In [None]:
# ROUGE-1/2/L F-scores of every sentence against its row's 'summary/long'
rouge_features = calculate_rouge_scores(df)

In [None]:
# Relative position, first/last flags, length and parity of every sentence
position_features = calculate_position_features(df)

In [None]:
# POS-tag ratios and legal-term counts of every sentence (reads legal_terms.csv)
pos_features = calculate_pos_features(df)

split data for tf-idf

In [None]:
# Positional 80/20 split without shuffling: the first 80% of rows form the
# training part. Row order is preserved, which the later concatenation
# `tfidf_train + tfidf_test` relies on to stay aligned with df.
split_idx = int(len(df) * 0.8)
X_train = df.iloc[:split_idx]
X_test = df.iloc[split_idx:]

tfidf for train

In [None]:
# Fit the vectorizer on the training sentences only (saved to the default
# "tfidf_data.pkl")
vectorizer, tfidf_matrix = fit_and_save_tfidf(X_train)

In [None]:
# Per-sentence TF-IDF statistics and centroid similarity for the training documents
tfidf_train = calculate_tfidf_features(X_train, tfidf_matrix)

tfidf for test

In [None]:
# Vectorize the held-out documents with the vectorizer obtained from the
# training split. An explicit save_path is passed because the default
# ("tfidf_data.pkl") is the file fit_and_save_tfidf wrote for the training
# split, which would otherwise be overwritten.
vectorizer, tfidf_matrix_test = transform_and_save_tfidf(
    X_test, vectorizer, save_path="tfidf_data_test.pkl"
)

In [None]:
# Per-sentence TF-IDF statistics and centroid similarity for the held-out documents
tfidf_test = calculate_tfidf_features(X_test, tfidf_matrix_test)

combine tfidf

In [None]:
# Train-then-test concatenation restores the original row order of df, because
# X_train and X_test are consecutive positional slices of it.
tfidf_features = tfidf_train + tfidf_test

### Build Features Matrix

In [None]:
def build_enhanced_feature_matrix_dynamic(df,
                                          tfidf_features,
                                          rouge_scores,
                                          position_features,
                                          pos_features,
                                          ner_features,
                                          top_pct=0.2):
    """
    Build feature matrices & dynamic labels where the top `top_pct` fraction of
    sentences (by rougeL_f) in each document are labeled 1, the rest 0.

    Args:
        df: Only its length is used, as the number of documents to process.
        tfidf_features, rouge_scores, position_features, pos_features,
        ner_features: Parallel per-document lists, each holding one dict per
            sentence.
        top_pct: Fraction of a document's sentences labeled 1; at least one
            sentence per document is always selected.

    Returns:
        (all_features, all_labels): per-document lists of merged feature
        dicts and of 0/1 labels, aligned with each other.

    Notes:
        - The top-k ranking is computed over ALL sentences of the document,
          before filtering, so a document can end up with fewer than k
          positive labels if a top-ranked sentence is filtered out.
        - Sentences with `sentence_length` <= 2 are dropped.
        - Sentences missing from any of the feature sources (including the
          NER features) are dropped rather than raising IndexError.
        - Ties in rougeL_f keep their original sentence order.
    """
    all_features = []
    all_labels = []

    for doc_idx in range(len(df)):
        # per-doc arrays
        doc_tfidf = tfidf_features[doc_idx]
        doc_rouge = rouge_scores[doc_idx]
        doc_position = position_features[doc_idx]
        doc_pos = pos_features[doc_idx]
        doc_ner = ner_features[doc_idx]
        num_sentences = len(doc_rouge)

        # Determine how many to pick
        k = max(1, ceil(top_pct * num_sentences))

        # Get indices sorted by rougeL_f descending
        sorted_idxs = sorted(
            range(num_sentences),
            key=lambda i: doc_rouge[i]['rougeL_f'],
            reverse=True
        )
        topk_idxs = set(sorted_idxs[:k])

        # Only sentences present in every feature source can be assembled
        usable = min(num_sentences, len(doc_tfidf), len(doc_position),
                     len(doc_pos), len(doc_ner))

        doc_feats = []
        doc_lbls = []
        for i in range(usable):
            # skip very short sentences
            if doc_position[i]['sentence_length'] <= 2:
                continue

            # assemble features; on duplicate keys later sources win
            doc_feats.append({
                **doc_tfidf[i],
                **doc_rouge[i],
                **doc_position[i],
                **doc_pos[i],
                **doc_ner[i]
            })
            # dynamic label
            doc_lbls.append(1 if i in topk_idxs else 0)

        all_features.append(doc_feats)
        all_labels.append(doc_lbls)

    return all_features, all_labels

In [None]:
def remove_rouge_features(all_features):
    """
    Return a copy of the nested feature structure without the ROUGE scores.

    Args:
        all_features: List of documents, each a list of per-sentence dicts.

    Returns:
        A new list of lists of new dicts in which the keys 'rouge1_f',
        'rouge2_f' and 'rougeL_f' are absent; the input is not modified.
    """
    rouge_keys = ('rouge1_f', 'rouge2_f', 'rougeL_f')

    def without_rouge(sentence_feats):
        return {
            name: value
            for name, value in sentence_feats.items()
            if name not in rouge_keys
        }

    return [
        [without_rouge(sentence_feats) for sentence_feats in document]
        for document in all_features
    ]

In [None]:
# Merge all feature sources per sentence and derive the 0/1 labels, then strip
# the ROUGE scores from the features (presumably because the labels are
# derived from rougeL_f, so keeping them would leak the target - confirm).
all_features, all_labels = build_enhanced_feature_matrix_dynamic(df,tfidf_features, rouge_features, position_features, pos_features, ner_features)
all_features_cleaned = remove_rouge_features(all_features)

### Save Combined Features as JSON

In [None]:
# Bundle the ROUGE-free features with their labels in one JSON-serialisable dict
all_features_combined = {
    "all_features": all_features_cleaned,
    "all_labels" : all_labels
}


# Save to file
with open("all_features_combined.json", "w") as f:
    json.dump(all_features_combined, f)