In [1]:
import os 
#This module provides a portable way of using operating system dependent functionality.
import sys
#This module provides access to some variables used or maintained by the interpreter and 
#to functions that interact strongly with the interpreter
sys.path.append('..')

import lzma
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as se
se.set_style('white')
se.set_context('poster')

import pyquery
from ipywidgets import FloatProgress
from IPython.display import display

from config import settings
import utils
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.cluster import DBSCAN
import numpy as np


In [2]:
cases_format="json"
#else "xml" 
#cases_format="xml"
compressed_file = utils.get_cases_from_bulk(jurisdiction="New Mexico", data_format=cases_format)

[35mdownloading New Mexico-20200302-text.zip into ../data dir[0m


66568it [00:15, 4177.55it/s]


[35mextracting New Mexico-20200302-text.zip into ../data dir[0m
[35mDone.[0m


In [3]:
#a list to hold the cases we're sampling
cases = []

#decompress the file line by line
with lzma.open(compressed_file) as infile:
    for line in infile:
        #decode the file into a convenient format
        record = json.loads(str(line, 'utf-8'))
        #if the decision date on the case matches one we're interested in, add to our list
        cases.append(record)

print("Selected jurisdiction: New Mexico")
print("Number of Cases: ", len(cases))

Selected jurisdiction: New Mexico
Number of Cases:  18338


In [4]:
def parse_json(case):
    feat_dict = {}
    feat_dict["case_id"] = case['id']
    feat_dict["case_url"] = case['url']
    feat_dict["case_name"] = case['name']
    feat_dict["case_name_abbreviation"] = case['name_abbreviation']
    feat_dict["date"] = case['decision_date']
    feat_dict["citations"] = [citation for citation in case['citations'] if citation['type'] == 'official'][0]
    #feat_dict["volume"] = case['volume']
    feat_dict["court"] = case['court']['name']
    feat_dict["jurisdiction"] = case['jurisdiction']#['name_long']
    #feat_dict["name"] = case['jurisdiction']['name']
    feat_dict["judges"]=case["casebody"]["data"]["judges"]
    feat_dict["attorneys"]=case["casebody"]["data"]["attorneys"]
    feat_dict["author"]=case["casebody"]["data"]["opinions"][0]["author"]
    
    #Main text: opinions
    feat_dict["opinion"]=case["casebody"]["data"]["opinions"][0]["text"]
    
    return feat_dict

In [5]:
#Add progress bar
bar = FloatProgress(min=0, max=len(cases))
display(bar)

#For each file, parse with parse_xml() and add to list of parsed files
parsed_files = []
for case in cases:
    doc = parse_json(case) #json
    #doc = ET_parse_xml(case) #xml
    parsed_files.append(doc)
    bar.value += 1

FloatProgress(value=0.0, max=18338.0)

# Author

In [6]:
raw_df = pd.DataFrame(parsed_files)

In [11]:
authors_cases_dict = {}


for index, row in raw_df.iterrows():
    case_id = row['case_id']
    author = row['author']
    
    if author and author[0].isalpha() :
        # Split names of judges separated by ",".
        author = [a.strip() for a in author.split(',')][0]
        author = [a.strip() for a in author.split('.')][0]
        author = [a.strip() for a in author.split(';')][0]
        author = author.upper() 

        if author in authors_cases_dict:
            authors_cases_dict[author].append(case_id)
        else:
            authors_cases_dict[author] = [case_id]

for author, case_id in authors_cases_dict.items():
    print(f"Number of Cases: {len(case_id)}")
    print(f"Author: {author}")
    print(f"Case IDs: {case_id}")
    print()


Number of Cases: 312
Author: MINZNER
Case IDs: [17491, 142194, 17275, 17773, 18391, 18126, 18555, 17424, 18448, 834248, 834296, 17149, 322306, 827451, 827374, 834293, 257758, 1217180, 322293, 142281, 827383, 827421, 827403, 257625, 257751, 834335, 834166, 834182, 834219, 834357, 106570, 257693, 106526, 106506, 106492, 106498, 1217162, 1217163, 106574, 142211, 142269, 142249, 142214, 1217109, 1217106, 1217174, 322308, 322263, 322254, 322273, 322314, 322344, 18767, 1224598, 929183, 929161, 260756, 260735, 352473, 352429, 257742, 257619, 257748, 1580208, 1580074, 1594868, 1595016, 1594993, 707559, 707159, 1590317, 1590323, 1590284, 1599007, 708070, 1598902, 1598976, 1599005, 1566483, 1566569, 1563584, 1563474, 1561222, 1561210, 1561232, 929147, 1224632, 716899, 716895, 723229, 723218, 723205, 720234, 720269, 720254, 1590315, 1590281, 1561356, 711394, 715091, 1563512, 1563581, 1590264, 731571, 731659, 731567, 725481, 1592815, 1592812, 1597100, 1597254, 1594925, 1599010, 1598929, 706927, 70

# Let's try to normalize

In [7]:

ignore_initials = set(["J", "CJ", "JJ", "Justice", "Chief Justice", "Judge", "THOMAS", "GENE", "BENJAMIN", "ANTHONY", "LINDA", "ROBERT", "CONCUR", "PAMELA", "PAUL",
                       "CYNTHIA", "IRA", "RODERICK", "PATRICIO", "JOSEPH", "CELIA", "PETRA", "JIMENEZ", "RICHARD", "EDWARD", "GERALD", "JAMES", "CHARLES",
                       "BARBARA", "DANIELS", "MICHAEL", "EDWARD"
                         ])

name_mapping = {
    "mcmanus": "McManus",
    "mc-manus": "McManus",
    "manus": "McManus",
    "McMANUS": "McManus"
  
}

def is_judge_name(name):
    # Remove initials separated by semicolons and those to be ignored.
    normalized_name = re.sub(r"[.,]", "", name).strip()
    # Count the number of uppercase and lowercase letters in the word
    uppercase_count = sum(1 for c in normalized_name if c.isupper())
    lowercase_count = sum(1 for c in normalized_name if c.islower())
    return len(normalized_name) >= 3 and normalized_name not in ignore_initials and uppercase_count > lowercase_count

def normalize_judges_names(judges_text):
    # Split the string into individual tokens
    words = re.findall(r'\b\w+\b', judges_text) 
    judge_names = []
    seen_judges = set()  # Set to keep track of seen judges, must be added once
    for word in words:
        # Check if the word is "McManus" and treat it as a valid judge
        if word == "McMANUS":
            judge_names.append(word)
        else:
            # Check if the word is a judge name and get the standardized name from the mapping if it exists
            normalized_name = name_mapping.get(word.lower(), word)
            if is_judge_name(normalized_name) and normalized_name not in seen_judges:
                judge_names.append(normalized_name)
                seen_judges.add(normalized_name)

    output = " and ".join(judge_names)
    # Replace any consecutive sequences of "and" with one
    output = re.sub(r'\b(?:and\s*)+\b', ' and ', output)


    print(f"Input: {judges_text}, Result: {output}")
    return output

In [8]:
raw_df["author"] = raw_df["author"].astype(str)
raw_df["normalized_judge_names"] = raw_df["author"].apply(normalize_judges_names)
print(raw_df[["author", "case_id", "normalized_judge_names"]])

Input: MINZNER, Justice., Result: MINZNER
Input: WECHSLER, Judge., Result: WECHSLER
Input: PICKARD, Judge., Result: PICKARD
Input: BOSSON, Judge., Result: BOSSON
Input: PICKARD, Judge., Result: PICKARD
Input: BUSTAMANTE, Judge., Result: BUSTAMANTE
Input: WECHSLER, Judge., Result: WECHSLER
Input: BOSSON, Judge., Result: BOSSON
Input: ALARID, Judge., Result: ALARID
Input: FRANCHINI, Chief Justice., Result: FRANCHINI
Input: FRANCHINI, Chief Justice., Result: FRANCHINI
Input: MINZNER, Justice., Result: MINZNER
Input: APODACA, Judge., Result: APODACA
Input: BOSSON, Judge., Result: BOSSON
Input: PER CURIAM., Result: PER  and CURIAM
Input: DONNELLY, Judge., Result: DONNELLY
Input: PICKARD, Judge., Result: PICKARD
Input: RANSOM, Justice., Result: RANSOM
Input: ARMIJO, Judge., Result: ARMIJO
Input: WECHSLER, Judge., Result: WECHSLER
Input: BOSSON, Judge., Result: BOSSON
Input: WECHSLER, Judge., Result: WECHSLER
Input: WECHSLER, Judge., Result: WECHSLER
Input: FLORES, Judge., Result: FLORES
Inpu