In [87]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [88]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /home/sharry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sharry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sharry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [89]:
nltk.data.find('tokenizers/punkt')

FileSystemPathPointer('/home/sharry/nltk_data/tokenizers/punkt')

In [101]:
from nltk.data import find
print(find('tokenizers/punkt/english.pickle'))


/home/sharry/nltk_data/tokenizers/punkt/english.pickle


In [90]:
df = pd.read_csv('cases_data.csv')


## Exploring the Dataset

In [91]:
df.head()

Unnamed: 0,Case Number,Parties,Date Delivered,Court,Case Action,Judge(s),Citation,County,Full Text
0,Cause 28 of 2020,Chelagat v Kabarak University,04 Jul 2024,Employment and Labour Relations Court at Nakuru,Ruling,David Njagi Nderitu,Chelagat v Kabarak University (Cause 28 of 202...,Nakuru,Chelagat v Kabarak University (Cause 28 of 202...
1,Tribunal Case 308 & 309 of 2020 (Consolidated),Alwy & another v Engroup (K) International Lim...,27 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Alwy & another v Engroup (K) International Lim...,Nairobi,Alwy & another v Engroup (K) International Lim...
2,Civil Case E001 of 2024,Ajiba v Manya,05 Sep 2024,High Court at Busia,Ruling,William Musya Musyoka,Ajiba v Manya (Civil Case E001 of 2024) [2024]...,Busia,Ajiba v Manya (Civil Case E001 of 2024) [2024]...
3,Tribunal Case E592 of 2024,Sharif v Munyoki t/a Kauma Hardware,29 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Sharif v Munyoki t/a Kauma Hardware (Tribunal ...,Nairobi,Sharif v Munyoki t/a Kauma Hardware (Tribunal ...
4,Tribunal Case E742 of 2021,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...,27 Sep 2023,Business Premises Rent Tribunal,Ruling,A Muma,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...,Nairobi,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Case Number     116 non-null    object
 1   Parties         116 non-null    object
 2   Date Delivered  116 non-null    object
 3   Court           110 non-null    object
 4   Case Action     116 non-null    object
 5   Judge(s)        116 non-null    object
 6   Citation        115 non-null    object
 7   County          116 non-null    object
 8   Full Text       116 non-null    object
dtypes: object(9)
memory usage: 8.3+ KB


In [93]:
df.isnull().sum()

Case Number       0
Parties           0
Date Delivered    0
Court             6
Case Action       0
Judge(s)          0
Citation          1
County            0
Full Text         0
dtype: int64

## Filling missing values
Filling missing values in Court and Citation columns with "unknown" value

In [94]:
# Fill missing categorical/text values with 'Unknown'
df['Court'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Court'].fillna('Unknown', inplace=True)


In [95]:
# Fill missing categorical/text values with 'Unknown'
df['Citation'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Citation'].fillna('Unknown', inplace=True)


In [96]:
#Checking to see if there are empty cells again 
df.isnull().sum()

Case Number       0
Parties           0
Date Delivered    0
Court             0
Case Action       0
Judge(s)          0
Citation          0
County            0
Full Text         0
dtype: int64

### Text data cleaning removing 'stop words'

In [97]:
lemmatizer = WordNetLemmatizer()




In [98]:
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_words)



In [103]:
#df['Full Text'] = df['Full Text'].apply(lemmatize_text)

In [104]:
# Convert 'county' column to categorical
df['County'] = df['County'].astype('category')

In [105]:
print(df)

                                        Case Number  \
0                                  Cause 28 of 2020   
1    Tribunal Case 308 & 309 of 2020 (Consolidated)   
2                           Civil Case E001 of 2024   
3                        Tribunal Case E592 of 2024   
4                        Tribunal Case E742 of 2021   
..                                              ...   
111                      Criminal Appeal 18 of 2018   
112                     Criminal Appeal 196 of 2017   
113                  Civil Application E220 of 2022   
114                    Succession Cause 104 of 2017   
115                  Succession Cause E1085 of 2021   

                                               Parties Date Delivered  \
0                        Chelagat v Kabarak University    04 Jul 2024   
1    Alwy & another v Engroup (K) International Lim...    27 Aug 2024   
2                                        Ajiba v Manya    05 Sep 2024   
3                  Sharif v Munyoki t/a Kauma H

In [107]:
print(df.dtypes)

Case Number         object
Parties             object
Date Delivered      object
Court               object
Case Action         object
Judge(s)            object
Citation            object
County            category
Full Text           object
dtype: object


In [108]:
df.head(10)

Unnamed: 0,Case Number,Parties,Date Delivered,Court,Case Action,Judge(s),Citation,County,Full Text
0,Cause 28 of 2020,Chelagat v Kabarak University,04 Jul 2024,Employment and Labour Relations Court at Nakuru,Ruling,David Njagi Nderitu,Chelagat v Kabarak University (Cause 28 of 202...,Nakuru,Chelagat v Kabarak University (Cause 28 of 202...
1,Tribunal Case 308 & 309 of 2020 (Consolidated),Alwy & another v Engroup (K) International Lim...,27 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Alwy & another v Engroup (K) International Lim...,Nairobi,Alwy & another v Engroup (K) International Lim...
2,Civil Case E001 of 2024,Ajiba v Manya,05 Sep 2024,High Court at Busia,Ruling,William Musya Musyoka,Ajiba v Manya (Civil Case E001 of 2024) [2024]...,Busia,Ajiba v Manya (Civil Case E001 of 2024) [2024]...
3,Tribunal Case E592 of 2024,Sharif v Munyoki t/a Kauma Hardware,29 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Sharif v Munyoki t/a Kauma Hardware (Tribunal ...,Nairobi,Sharif v Munyoki t/a Kauma Hardware (Tribunal ...
4,Tribunal Case E742 of 2021,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...,27 Sep 2023,Business Premises Rent Tribunal,Ruling,A Muma,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...,Nairobi,Kamau t/a Victory Choma Roast v Kinyanjui; Mwa...
5,Tribunal Case E123 of 2024,Matheka v Monga,29 Aug 2024,Business Premises Rent Tribunal,Ruling,Joyce Osodo,Matheka v Monga (Tribunal Case E123 of 2024) [...,Nairobi,Matheka v Monga (Tribunal Case E123 of 2024) [...
6,Tribunal Case E551 of 2023,Chepkosi v Njambi,30 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Chepkosi v Njambi (Tribunal Case E551 of 2023)...,Nairobi,Chepkosi v Njambi (Tribunal Case E551 of 2023)...
7,Employment and Labour Relations Cause 331 of 2014,Mwangi v Family Bank Limited,04 Jul 2024,Employment and Labour Relations Court at Nakuru,Ruling,David Njagi Nderitu,Mwangi v Family Bank Limited (Employment and L...,Nakuru,Mwangi v Family Bank Limited (Employment and L...
8,Miscellaneous Application E011 of 2022,A S Kuloba & Wangila Advocates v Walingo,04 Jul 2024,Employment and Labour Relations Court at Nakuru,Ruling,David Njagi Nderitu,A S Kuloba & Wangila Advocates v Walingo (Misc...,Nakuru,A S Kuloba & Wangila Advocates v Walingo (Misc...
9,Tribunal Case E079 of 2024,Colin Stuart t/a Little Bay Investment Ltd v Yego,29 Aug 2024,Business Premises Rent Tribunal,Ruling,A Muma,Colin Stuart t/a Little Bay Investment Ltd v Y...,Mombasa,Colin Stuart t/a Little Bay Investment Ltd v Y...


## Encoding data

In [109]:
# List of counties
counties = [
    'Mombasa', 'Kwale', 'Kilifi', 'Tana River', 'Lamu', 'Taita/Taveta', 'Garissa', 
    'Wajir', 'Mandera', 'Marsabit', 'Isiolo', 'Meru', 'Tharaka-Nithi', 'Embu', 
    'Kitui', 'Machakos', 'Makueni', 'Nyandarua', 'Nyeri', 'Kirinyaga', "Murang'a", 
    'Kiambu', 'Turkana', 'West Pokot', 'Samburu', 'Trans Nzoia', 'Uasin Gishu', 
    'Elgeyo/Marakwet', 'Nandi', 'Baringo', 'Laikipia', 'Nakuru', 'Narok', 'Kajiado', 
    'Kericho', 'Bomet', 'Kakamega', 'Vihiga', 'Bungoma', 'Busia', 'Siaya', 'Kisumu', 
    'Homa Bay', 'Migori', 'Kisii', 'Nyamira', 'Nairobi City'
]


In [111]:
df = pd.DataFrame(counties, columns=['County'])

# Create a column with sequential numbers starting from 1 and formatting them to 3 digits
df['County_Code'] = [f'{i:03}' for i in range(1, len(df) + 1)]

# Display the DataFrame
print(df)

             County County_Code
0           Mombasa         001
1             Kwale         002
2            Kilifi         003
3        Tana River         004
4              Lamu         005
5      Taita/Taveta         006
6           Garissa         007
7             Wajir         008
8           Mandera         009
9          Marsabit         010
10           Isiolo         011
11             Meru         012
12    Tharaka-Nithi         013
13             Embu         014
14            Kitui         015
15         Machakos         016
16          Makueni         017
17        Nyandarua         018
18            Nyeri         019
19        Kirinyaga         020
20         Murang'a         021
21           Kiambu         022
22          Turkana         023
23       West Pokot         024
24          Samburu         025
25      Trans Nzoia         026
26      Uasin Gishu         027
27  Elgeyo/Marakwet         028
28            Nandi         029
29          Baringo         030
30      