In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

## Load the Dataset

In [10]:
def load_data(dataset_path, delimiter=","):
    """Read a delimited text file into a DataFrame and report what was loaded.

    Args:
        dataset_path: Location of the file, passed straight to ``pd.read_csv``.
        delimiter: Field separator, forwarded to ``pd.read_csv`` as ``sep``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails for any reason.
        Failures are printed rather than raised, so callers must check for
        ``None`` before using the result.
    """
    try:
        # csv.QUOTE_ALL is the named constant for the literal 1 used before.
        dataset = pd.read_csv(dataset_path, sep=delimiter, quoting=csv.QUOTE_ALL)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

    print(f"Loaded dataset with shape: {dataset.shape}")
    print(f"Columns: {list(dataset.columns)}\n")
    return dataset

dataset_path = "datasets/legal.csv"
df = load_data(dataset_path)

# load_data signals failure by returning None (it only prints the error).
# Stop here with a clear message instead of letting a later cell fail on a
# None object with an unrelated-looking AttributeError.
if df is None:
    raise RuntimeError(f"Could not load dataset from {dataset_path!r}")

Loaded dataset with shape: (7821, 2)
Columns: ['judgement', 'summary']



## Remove Duplicate Rows

In [11]:
def remove_duplicates(dataset):
    """Drop repeated rows, keeping the first occurrence of each.

    Prints the shape before and after, the number of duplicate rows found,
    and how many rows were dropped.

    Args:
        dataset: DataFrame to de-duplicate; it is not modified.

    Returns:
        A DataFrame holding only the first occurrence of every row. The
        surviving rows keep their original index labels.
    """
    rows_before = dataset.shape[0]
    duplicate_count = dataset.duplicated().sum()

    print(f"Original dataset shape: {dataset.shape}")
    print(f"Duplicate rows: {duplicate_count}")

    deduplicated = dataset.drop_duplicates(keep='first')
    rows_after = deduplicated.shape[0]

    print(f"Dataset after removing duplicates: {deduplicated.shape}")
    print(f"Rows removed: {rows_before - rows_after}\n")

    return deduplicated

# First cleaning step: df_clean starts out as the de-duplicated version of
# the loaded frame; df itself is left as loaded.
df_clean = remove_duplicates(df)

Original dataset shape: (7821, 2)
Duplicate rows: 77
Dataset after removing duplicates: (7744, 2)
Rows removed: 77



## Remove NaN Values

In [12]:
def remove_nan_values(dataset):
    """Drop every row that has a missing value in at least one column.

    Prints the shape before and after, the number of affected rows, and how
    many rows were dropped.

    Args:
        dataset: DataFrame to filter; it is not modified.

    Returns:
        A DataFrame containing only the rows without any NaN.
    """
    print(f"Original dataset shape: {dataset.shape}")

    # One boolean per row: True when any cell in that row is missing.
    row_has_nan = dataset.isna().any(axis=1)
    print(f"Rows with NaN: {row_has_nan.sum()}")

    complete_rows = dataset.dropna()
    dropped = len(dataset) - len(complete_rows)

    print(f"Dataset after removing NaN: {complete_rows.shape}")
    print(f"Rows removed: {dropped}\n")

    return complete_rows

# Second cleaning step: drop rows containing NaN from the de-duplicated
# frame; the result replaces df_clean.
df_clean = remove_nan_values(df_clean)

Original dataset shape: (7744, 2)
Rows with NaN: 0
Dataset after removing NaN: (7744, 2)
Rows removed: 0



## Identify and Remove Outliers

In [13]:
def remove_outliers_iqr(dataset, text_columns=['judgement', 'summary']):
    """Drop rows whose text length is an IQR outlier in any of the given columns.

    For each column in turn, the character length of every value (after
    ``astype(str)``) is measured and rows whose length lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed. Columns are processed
    sequentially, so the quartiles of a later column are computed on the rows
    that survived the earlier columns.

    The input frame is never modified. Lengths are held in a temporary Series
    rather than written into ``<col>_length`` helper columns, so nothing is
    added to the caller's data and any column that already ends in
    ``_length`` is kept in the result.

    Args:
        dataset: DataFrame to filter.
        text_columns: Names of the text columns to check. Names that are not
            columns of ``dataset`` are skipped. The default list is only
            read, never mutated.

    Returns:
        A new DataFrame with the outlier rows removed and the same columns as
        ``dataset``. Surviving rows keep their original index labels.
    """
    print(f"Original dataset shape: {dataset.shape}\n")

    # Work on a copy from the start so the caller's frame stays untouched.
    df_filtered = dataset.copy()

    for col in text_columns:
        if col not in df_filtered.columns:
            continue

        # Lengths of the rows still present; aligned with df_filtered.
        lengths = df_filtered[col].astype(str).apply(len)

        Q1 = lengths.quantile(0.25)
        Q3 = lengths.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        rows_before = df_filtered.shape[0]
        within_bounds = (lengths >= lower_bound) & (lengths <= upper_bound)
        # Positional boolean mask: independent of index labels (which may be
        # non-contiguous or repeated after earlier filtering steps).
        df_filtered = df_filtered[within_bounds.to_numpy()]

        print(f"{col}:")
        print(f"  Q1: {Q1:.0f}, Q3: {Q3:.0f}, IQR: {IQR:.0f}")
        print(f"  Bounds: [{lower_bound:.0f}, {upper_bound:.0f}]")
        print(f"  Outliers removed: {rows_before - df_filtered.shape[0]}\n")

    print(f"Dataset after removing outliers: {df_filtered.shape}")
    print(f"Total rows removed: {dataset.shape[0] - df_filtered.shape[0]}\n")

    return df_filtered

# Third cleaning step: drop rows whose judgement/summary length is an IQR
# outlier. NOTE: the quartiles are derived from whatever frame is passed in
# and the result overwrites df_clean, so running this cell a second time
# filters the already-filtered data again and removes further rows.
df_clean = remove_outliers_iqr(df_clean)

Original dataset shape: (7744, 2)

judgement:
  Q1: 12028, Q3: 32527, IQR: 20498
  Bounds: [-18719, 63275]
  Outliers removed: 675

summary:
  Q1: 2425, Q3: 5444, IQR: 3019
  Bounds: [-2104, 9972]
  Outliers removed: 275

Dataset after removing outliers: (6794, 2)
Total rows removed: 950



## Save Cleaned Dataset

In [14]:
def save_cleaned_dataset(dataset, output_path, delimiter=","):
    """Write a DataFrame to a delimited text file with every field quoted.

    The index is not written. After saving, a short confirmation with the
    row and column counts is printed.

    Args:
        dataset: DataFrame to persist.
        output_path: Destination passed to ``DataFrame.to_csv``.
        delimiter: Field separator, forwarded to ``to_csv`` as ``sep``.
    """
    csv_options = {
        "sep": delimiter,
        "index": False,
        "quoting": csv.QUOTE_ALL,
    }
    dataset.to_csv(output_path, **csv_options)

    row_count, column_count = len(dataset), len(dataset.columns)
    report_lines = [
        f"✓ Cleaned dataset saved to: {output_path}",
        f"  Total rows: {row_count}",
        f"  Total columns: {column_count}",
    ]
    print("\n".join(report_lines))

# Persist the cleaned frame in the same datasets/ directory the input was
# read from. The path is relative, so it resolves against the kernel's
# current working directory.
output_path = "datasets/legal_cleaned.csv"
save_cleaned_dataset(df_clean, output_path)

✓ Cleaned dataset saved to: datasets/legal_cleaned.csv
  Total rows: 6794
  Total columns: 2


In [15]:
# Final sanity report on the cleaned frame.
print("=" * 60)
print("CLEANED DATASET SUMMARY")
print("=" * 60)
print(f"Final shape: {df_clean.shape}")
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df_clean.info()
print("\nFirst few rows:")
print(df_clean.head())

CLEANED DATASET SUMMARY
Final shape: (6794, 2)

Dataset Info:
<class 'pandas.DataFrame'>
Index: 6794 entries, 4 to 7819
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   judgement  6794 non-null   str  
 1   summary    6794 non-null   str  
dtypes: str(2)
memory usage: 157.1 MB
None

First few rows:
                                            judgement  \
4   This appeal raises the issue as to whether a t...   
6   This appeal concerns the proper ambit of the o...   
7   In August 2007, the vessel B Atlantic, owned b...   
8   Part II of the Landlord and Tenant Act 1954 co...   
12  This appeal raises a question relating to the ...   

                                              summary  
4   This appeal concerns the extent to which a non...  
6   Ahava was a shop in Covent Garden, London, whi...  
7   In August 2007 B Atlantic (the Vessel), owned ...  
8   This appeal concerns qualified security of ten...  
12  The Appel