In [1]:
#Download ollama
!curl -fsSL https://ollama.com/install.sh | sh
import subprocess
process = subprocess.Popen("ollama serve", shell=True) #runs on a different thread
#Download model
!ollama pull llama3
!pip install ollama
!pip install textacy
import ollama

#Then everytime you want to chat
response = ollama.chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
print(response['message']['content'])

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%                                                                       0.3%#############################################                                       61.2%#################################################                                      61.7%#########################################################                          75.2%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴

In [2]:
# Import required libraries
import asyncio
import pandas as pd
from pydantic import BaseModel
from ollama import AsyncClient
import textacy.datasets
import requests
import json

# Load Supreme Court dataset
sc = textacy.datasets.SupremeCourt()
sc.download()

# Create a DataFrame from the dataset
cases = [{"text": record.text} for record in sc.records()]
cases_df = pd.DataFrame(cases)

# Display the first few rows of the dataset
cases_df.head()

100%|██████████| 111M/111M [00:01<00:00, 92.0MB/s] 


Unnamed: 0,text
0,[ Halliburton Oil Well Cementing Co. v. Walker...
1,"Rehearing Denied Dec. 16, 1946. See . Mr.Claud..."
2,"Rehearing Denied Dec. 16, 1946\n See .\n Appea..."
3,"Mr.\nWalter J. Cummings, Jr., of Washington, D..."
4,"Mr.A. Devitt Vaneck, of Washington, D.C., for ..."


In [3]:
# Convert dataset records to a DataFrame
records = list(sc.records())
df = pd.DataFrame(records)

# Display the first few rows
print(df.head())

                                                text  \
0  [ Halliburton Oil Well Cementing Co. v. Walker...   
1  Rehearing Denied Dec. 16, 1946. See . Mr.Claud...   
2  Rehearing Denied Dec. 16, 1946\n See .\n Appea...   
3  Mr.\nWalter J. Cummings, Jr., of Washington, D...   
4  Mr.A. Devitt Vaneck, of Washington, D.C., for ...   

                                                meta  
0  {'issue': '80180', 'issue_area': 8, 'n_min_vot...  
1  {'issue': '10500', 'issue_area': 1, 'n_min_vot...  
2  {'issue': '80250', 'issue_area': 8, 'n_min_vot...  
3  {'issue': '20150', 'issue_area': 2, 'n_min_vot...  
4  {'issue': '80060', 'issue_area': 8, 'n_min_vot...  


In [8]:
def classify_case_local(case_text, model="llama3"):
    """
    Use the local Ollama API to classify a single case text, handling streaming responses.
    Returns 'None' if the response is NaN or invalid.
    """
    url = "http://localhost:11434/api/chat"  # Local Ollama API endpoint
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": '''
                You are an expert in Supreme Court cases and legal classifications. Your task is to analyze the provided text of a Supreme Court case and classify it into one of the following predefined categories:
                
                Instructions:
                1. Carefully analyze the text and identify the primary category it belongs to.
                2. Provide only the category name as the response and nothing else.
                3. If uncertain, select the most relevant category based on the case's primary legal focus.
                4. Response should not exceed more than 2 words.
                ''',
            },
            {"role": "user", "content": f'''{case_text}

            Please reply with the category only and nothing else. Ensure its category is from the given list or the most relevant one. Here are the categories:
            - None: Cases that do not fit into any specific category.
            - Criminal Procedure: Cases involving the process of investigating and prosecuting crimes.
            - Civil Rights: Cases about the rights of individuals to receive equal treatment.
            - First Amendment: Cases addressing issues related to freedom of speech, religion, or press.
            - Due Process: Cases focusing on legal safeguards ensuring fair treatment.
            - Privacy: Cases dealing with the right to privacy.
            - Attorneys: Cases about the legal profession and lawyer regulations.
            - Unions: Cases concerning labor unions and collective bargaining.
            - Economic Activity: Cases involving business, trade, or commerce.
            - Judicial Power: Cases addressing the powers of courts and the judiciary.
            - Federalism: Cases about the division of power between state and federal governments.
            - Interstate Relations: Cases about interactions between states.
            - Federal Taxation: Cases involving federal tax laws.
            - Miscellaneous: Cases that do not clearly fit into any of the above categories.
            - Private Action: Cases involving disputes between private individuals or entities.
            '''},
        ],
        # Added extra parameters for tweaking response behavior
        "temperature": 0,  # Ensure deterministic behavior
        "top_p": 1.0,      # Consider all tokens equally (reduce randomness)
        "frequency_penalty": 0.0,  # No penalty for repeating words
        "presence_penalty": 0.0,   # No penalty for introducing new topics
    }

    try:
        response = requests.post(url, json=payload, headers=headers, stream=True)
        response.raise_for_status()  # Raise an error for HTTP issues

        # Process the streaming response
        result = ""
        for line in response.iter_lines(decode_unicode=True):
            if line.strip():  # Skip empty lines
                try:
                    # Parse each line as a JSON object
                    message = json.loads(line)
                    content = message.get("message", {}).get("content", "").strip()
                    if content:
                        result += content  # Append content to the result
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON chunk: {e}. Line: {line}")

        # Check if result is NaN or invalid
        if not result or result.lower() in {"nan", ""}:
            return "None"
        
        return result.strip()  # Return the combined result
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return "None"


In [9]:
# Process and classify cases
classified_cases = []

def process_cases():
    """
    Process all cases synchronously and classify them using the local API.
    """
    for idx, row in cases_df.iterrows():
        print(f"Processing case {idx + 1}/{len(cases_df)}...")

        # Classify the case
        class_name = classify_case_local(row["text"])  # Pass the preprocessed text

        if class_name:
            classified_cases.append({"text": row["text"], "class_name": class_name})
        else:
            print(f"Failed to classify case {idx + 1}")
        # break

    
    print("Processing completed.")

# Run the case processing
process_cases()



Processing case 1/8419...
Processing case 2/8419...
Processing case 3/8419...
Processing case 4/8419...
Processing case 5/8419...
Processing case 6/8419...
Processing case 7/8419...
Processing case 8/8419...
Processing case 9/8419...
Processing case 10/8419...
Processing case 11/8419...
Processing case 12/8419...
Processing case 13/8419...
Processing case 14/8419...
Processing case 15/8419...
Processing case 16/8419...
Processing case 17/8419...
Processing case 18/8419...
Processing case 19/8419...
Processing case 20/8419...
Processing case 21/8419...
Processing case 22/8419...
Processing case 23/8419...
Processing case 24/8419...
Processing case 25/8419...
Processing case 26/8419...
Processing case 27/8419...
Processing case 28/8419...
Processing case 29/8419...
Processing case 30/8419...
Processing case 31/8419...
Processing case 32/8419...
Processing case 33/8419...
Processing case 34/8419...
Processing case 35/8419...
Processing case 36/8419...
Processing case 37/8419...
Processing

In [12]:
# Save the results to a DataFrame
classified_df = pd.DataFrame(classified_cases)


# filling none for Nan values
classified_df.fillna({'class_name': 'None'}, inplace=True)


# Display the results
classified_df.head()

Unnamed: 0,text,class_name
0,[ Halliburton Oil Well Cementing Co. v. Walker...,EconomicActivity
1,"Rehearing Denied Dec. 16, 1946. See . Mr.Claud...",JudicialPower
2,"Rehearing Denied Dec. 16, 1946\n See .\n Appea...",EconomicActivity
3,"Mr.\nWalter J. Cummings, Jr., of Washington, D...",Federalism
4,"Mr.A. Devitt Vaneck, of Washington, D.C., for ...",EconomicActivity


In [13]:
classified_df.to_csv("/kaggle/working/classified_llama3")

In [16]:
# Find rows with any NaN values
na_rows = df[df.isna().any(axis=1)]

# Display the rows with NaN values
print(na_rows)


Empty DataFrame
Columns: [text, meta]
Index: []


Unnamed: 0,text,class_name
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
8414,False,False
8415,False,False
8416,False,False
8417,False,False
