In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Step 1: Read the ILDC cases from a CSV file in the working directory.
# The file name is a placeholder; point it at the real dataset location.
ildc_df = pd.read_csv(filepath_or_buffer="ildc_dataset.csv")

# Step 2: Feature Extraction (Example NLP Processing)
# Assuming 'case_text' contains legal case details
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple regex-based feature extraction
def extract_features(text):
    """Count keyword mentions in a case text.

    Args:
        text: The case text to scan. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for a missing cell)
            is treated as containing no mentions.

    Returns:
        A three-element list ``[prior_convictions, bail_history,
        charge_severity]`` where each entry is the number of case-insensitive
        matches of the corresponding phrase pair in ``text``.
    """
    # re.findall raises TypeError on non-string input; missing case texts
    # should yield zero counts rather than abort the whole feature pass.
    if not isinstance(text, str):
        return [0, 0, 0]

    prior_convictions = len(re.findall(r"prior conviction|previous offense", text, re.IGNORECASE))
    bail_history = len(re.findall(r"bail granted|bail denied", text, re.IGNORECASE))
    # Both "serious charge" and "minor charge" add to the same counter.
    charge_severity = len(re.findall(r"serious charge|minor charge", text, re.IGNORECASE))
    return [prior_convictions, bail_history, charge_severity]

# Apply feature extraction. extract_features returns one 3-element list per
# case, so the feature frame is built in a single constructor call instead of
# allocating a pd.Series for every row.
feature_columns = ['prior_convictions', 'bail_history', 'charge_severity']
ildc_df[feature_columns] = pd.DataFrame(
    ildc_df['case_text'].map(extract_features).tolist(),
    columns=feature_columns,
    index=ildc_df.index,
)

# Encode labels ordinally: Low = 0, Medium = 1, High = 2.
# LabelEncoder numbers classes in sorted order, which for these strings gives
# High = 0, Low = 1, Medium = 2. The score thresholds applied later treat a
# larger number as higher risk, so the order is spelled out explicitly here.
risk_codes = {'Low': 0, 'Medium': 1, 'High': 2}
unexpected_labels = set(ildc_df['risk_category'].unique()) - set(risk_codes)
if unexpected_labels:
    # Fail loudly rather than letting unmapped labels turn into NaN targets.
    raise ValueError(
        f"Unexpected risk_category values: {sorted(map(str, unexpected_labels))}; "
        f"expected only {list(risk_codes)}"
    )
ildc_df['risk_category'] = ildc_df['risk_category'].map(risk_codes)

# Step 3: Fit a Random Forest on the extracted counts and use it to score cases.
# Inputs are the three regex-derived count columns; the target is the encoded
# risk category.
X = ildc_df.loc[:, ['prior_convictions', 'bail_history', 'charge_severity']]
y = ildc_df['risk_category']

# 80/20 split with a fixed seed. Only the training portion is used below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# The predicted class for every row (training rows included) is stored as the
# case's risk score.
ildc_df['risk_score'] = rf_model.predict(X)

# Step 4: Read the PredEx data from CSV (placeholder file name; replace with
# the real dataset path).
predex_df = pd.read_csv(filepath_or_buffer="predex_dataset.csv")

# Single predictor (risk score) and the outcome to regress on.
# 'case_outcome' is assumed to already be numeric.
X_predex = predex_df[['risk_score']]
y_predex = predex_df['case_outcome']

# The fitted slope of outcome on risk score serves as the weight; it is
# broadcast into a constant column on the PredEx frame.
lr_model = LinearRegression().fit(X_predex, y_predex)
predex_df['weight'] = lr_model.coef_[0]

# Step 5: Scale each ILDC risk score by the average of the weight column.
average_weight = predex_df['weight'].mean()
ildc_df['weighted_score'] = ildc_df['risk_score'] * average_weight

# Step 6: Predict Overall Risk Level
def assign_final_risk(score):
    """Translate a weighted score into a risk label.

    Returns "High" for scores strictly above 2.5, "Medium" for scores from
    1.5 up to and including 2.5, and "Low" otherwise. A NaN score fails both
    comparisons and therefore also yields "Low".
    """
    level = "Low"
    if score >= 1.5:
        level = "Medium"
    if score > 2.5:
        level = "High"
    return level

# Label every case by passing its weighted score through assign_final_risk.
ildc_df['final_risk_level'] = ildc_df['weighted_score'].map(assign_final_risk)

# Persist the full frame as CSV, omitting the index column.
ildc_df.to_csv(path_or_buf="final_risk_assessment.csv", index=False)

print("Risk assessment completed and saved!")


In [None]:
!pip install datasets pandas scikit-learn


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the "train" split of the PredEx dataset from the Hugging Face Hub.
# With split= given, load_dataset returns a single Dataset object.
dataset = load_dataset("L-NLProc/PredEx", split="train")

# Convert to a pandas DataFrame with the Dataset's own exporter.
# pd.DataFrame(dataset) has to iterate the split row by row in Python, which
# is slow and memory-hungry for a split of this size.
df = dataset.to_pandas()


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

train-instruction-tuning_pred-ex.csv:   0%|          | 0.00/232M [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/367M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Preview the first five rows of the PredEx frame.
df.head(n=5)


Unnamed: 0,Case Name,Input,Output,Label,Count,Decision_Count,text
0,"KAMLESH Vs. UNION OF INDIA THROUGH SECRETARY, ...",30.3.92 after noon regular appoint is made. Mi...,0[ds]7. It is clear from the order of appointm...,0,1253,564,### Instruction:\nForecast the likely verdict ...
1,KANWAR PAL SINGH Vs. THE STATE OF UTTAR PRADESH,raised by the appellant in the written submiss...,1[ds]5. We find the submission of the appellan...,1,4065,2268,"### Instruction:\nFirst, predict whether the a..."
2,Manke Ram Vs. State of Haryana,appellant in this case is that even if the pro...,1[ds]6. Having perused the material on record ...,1,1616,708,### Instruction:\nDetermine the likely decisio...
3,Kr. Jyoti Sarup and Another Vs. Board of Reven...,and by the substantive part of sub-section (1)...,"0[ds]7. We are in agreement with the view, exp...",0,2440,784,### Instruction:\nJudge the probable resolutio...
4,"Commissioner Of Income-Tax, Kerala Vs. Gemini ...",the true costs of trading in the particular ye...,1[ds]That case can have no application to the ...,1,3317,722,### Instruction:\nAssess the case to predict t...
