In [26]:
!pip install textacy



In [27]:
import numpy as np 
import pandas as pd 
import re
import torch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from textacy.datasets.supreme_court import SupremeCourt

In [28]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Function to preprocess text
def preprocess_text(text):
    """Normalise a raw document into a lowercase, stopword-free string.

    Steps, in order: lowercase, strip URLs, delete punctuation, delete
    digits, drop tokens found in ``ENGLISH_STOP_WORDS``, then re-join the
    surviving tokens with single spaces.

    Args:
        text: The raw document. Non-string values (for example ``None`` or
            the float ``NaN`` that pandas uses for missing cells) are
            treated as empty documents instead of raising
            ``AttributeError``.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or when
        nothing survives the cleaning.
    """
    # Missing cells arrive as None / float NaN, which have no .lower().
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()
    # Remove URLs (anything starting with http/https/www up to the next whitespace)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove punctuation: every character that is neither a word character
    # nor whitespace is deleted (underscores count as word characters and stay).
    # NOTE(review): characters are deleted, not replaced by a space, so
    # "Mr.Claude" becomes "mrclaude" — confirm this token merging is intended.
    text = re.sub(r"[^\w\s]", "", text)
    # Remove numbers (digits inside a token are removed too, e.g. "b2b" -> "bb")
    text = re.sub(r"\d+", "", text)
    # Remove stopwords; split() also collapses any run of whitespace
    words = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    # Rejoin words
    return " ".join(words)

In [29]:
# Load SupremeCourt dataset
# `SupremeCourt` is imported from textacy.datasets.supreme_court; the `sc`
# instance is reused later in the notebook for its `issue_area_codes` mapping.
sc = SupremeCourt()
# Presumably fetches the dataset files if they are not already available
# locally — TODO confirm against the textacy documentation.
sc.download()

Total labels in labels_sc.txt: 8419
Total valid records: 8417
                                                text  label
0  [ Halliburton Oil Well Cementing Co. v. Walker...      8
1  Rehearing Denied Dec. 16, 1946. See . Mr.Claud...      1
2  Rehearing Denied Dec. 16, 1946\n See .\n Appea...      8
3  Mr.\nWalter J. Cummings, Jr., of Washington, D...      2
4  Mr.A. Devitt Vaneck, of Washington, D.C., for ...      8
Label Distribution: Counter({1: 1924, 8: 1667, 2: 1359, 9: 1148, 3: 658, 10: 367, 7: 346, 4: 335, 12: 304, 5: 110, 6: 98, 11: 58, 0: 23, 13: 19, 14: 1})


In [31]:
import pandas as pd

# Read the labelled corpus from the Kaggle input directory
data = pd.read_csv("/kaggle/input/labels-web-of-law/15_labels_data.csv")

# Pull out the two columns used below (the CSV is expected to provide both)
texts = data['text']
labels = data['label']

# Translate every numeric code into its description via sc.issue_area_codes.
# Code 0, and any code missing from the mapping, becomes the string "None".
mapped_labels = [
    "None" if code == 0 else sc.issue_area_codes.get(code, "None")
    for code in labels
]

# Rebuild the frame so that `label` holds descriptions instead of codes
data = pd.DataFrame(dict(text=texts, label=mapped_labels))

# Optionally, exclude rows with 'None' labels
# data = data[data['label'] != "None"].reset_index(drop=True)

print(len(data))
data.head()

8417


Unnamed: 0,text,label
0,halliburton oil cementing v walker mrearl babc...,Economic Activity
1,rehearing denied dec mrclaude t barnes salt la...,Criminal Procedure
2,rehearing denied dec appeal district court uni...,Economic Activity
3,mr walter j cummings jr washington dc petition...,Civil Rights
4,mra devitt vaneck washington dc petitioner mr ...,Economic Activity


In [32]:
# Read the Llama-classified dataset, discard rows whose text is missing,
# and renumber the surviving rows from zero — all in one chain.
llama_data = (
    pd.read_csv("/kaggle/input/labels-web-of-law/classified_llama3_one_shot_n")
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

print(len(llama_data))
llama_data.head()

8417


Unnamed: 0.1,Unnamed: 0,text,class_name
0,0,[ Halliburton Oil Well Cementing Co. v. Walker...,EconomicActivity
1,1,"Rehearing Denied Dec. 16, 1946. See . Mr.Claud...",JudicialPower
2,2,"Rehearing Denied Dec. 16, 1946\n See .\n Appea...",EconomicActivity
3,3,"Mr.\nWalter J. Cummings, Jr., of Washington, D...",Federalism
4,4,"Mr.A. Devitt Vaneck, of Washington, D.C., for ...",EconomicActivity


In [34]:
import pandas as pd

# The comparison below is row-by-row, so both frames need the same number of rows.
# NOTE(review): rows are paired purely by position; nothing here checks that
# row i of `data` and row i of `llama_data` describe the same document.
if len(data) != len(llama_data):
    print("Warning: The datasets have different lengths. Ensure they are aligned before comparison.")
    # Fall back to the leading rows that both frames have
    min_length = min(len(data), len(llama_data))
    data = data.head(min_length).reset_index(drop=True)
    llama_data = llama_data.head(min_length).reset_index(drop=True)

# Normalise both label columns (lowercase, spaces removed) so that
# "Economic Activity" and "EconomicActivity" compare equal
data['processed_label'] = data['label'].str.lower().str.replace(" ", "")
llama_data['processed_class_name'] = llama_data['class_name'].str.lower().str.replace(" ", "")

# Element-wise equality of the normalised labels
comparison = data['processed_label'] == llama_data['processed_class_name']

# Put the original (un-normalised) columns next to the match flag:
# text and label come from `data`, class_name from `llama_data`
result = data[['text', 'label']].assign(
    class_name=llama_data['class_name'],
    match=comparison,  # True if label matches class_name, False otherwise
)

# Display the mismatches (optional)
# mismatches = result[~result['match']]
# print("Mismatches:")
# print(mismatches)

# Persist the row-level comparison for later inspection
result.to_csv("/kaggle/working/comparison_results.csv", index=False)

# Preview the first few rows
result.head()


Unnamed: 0,text,label,class_name,match
0,halliburton oil cementing v walker mrearl babc...,Economic Activity,EconomicActivity,True
1,rehearing denied dec mrclaude t barnes salt la...,Criminal Procedure,JudicialPower,False
2,rehearing denied dec appeal district court uni...,Economic Activity,EconomicActivity,True
3,mr walter j cummings jr washington dc petition...,Civil Rights,Federalism,False
4,mra devitt vaneck washington dc petitioner mr ...,Economic Activity,EconomicActivity,True


In [37]:
# `match` is boolean, so summing it counts the rows where both labellings agree
true_count = result['match'].sum()

# Denominator is the row count of `data`.
# NOTE(review): `result` is built from `data`, so len(result) should be the
# same number — confirm if either frame is filtered in between.
total_rows = len(data)

# Report the share of agreeing rows
print(f"Accuracy: {true_count / total_rows}")


Accuracy: 0.5556611619341808
