# AILA: Artificial Intelligence for Legal Assisatance 
## Similar Case Matching
### We are given a dataset consisting of 2914 prior cases and a test dataset of 50 queries. We need to retrieve the most similar prior case for each of the queries.

In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

#Imports
import glob
import functools
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import re
import numpy as np 
import pandas as pd

# Input data files are available in the read-only "../input/" directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input/legalai/Object_casedocs'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))
# the above code will list all files under the input directory

## Data Handling
### We first wrap up all the text files into a single csv(comma separated file)

In [93]:
import glob
import csv

read_files = glob.glob('/kaggle/input/legalai/Object_casedocs/*')

with open("object_casedocs.csv", "w") as outfile:
    w=csv.writer(outfile)
    for f in read_files:
        with open(f, "r") as infile:
            w.writerow([" ".join([line.strip() for line in infile])])

lst_arr = os.listdir('/kaggle/input/legalai/Object_casedocs/')
df_filename = pd.DataFrame(lst_arr, columns = ['Name'])
df_filename

Unnamed: 0,Name
0,C757.txt
1,C1092.txt
2,C1985.txt
3,C39.txt
4,C2055.txt
...,...
2909,C792.txt
2910,C2775.txt
2911,C924.txt
2912,C432.txt


In [107]:
evaluate = pd.read_csv('/kaggle/input/legalai/relevance_judgments_priorcases.txt', delimiter = " ", header = None)
evaluate.columns = ["Query_Number", "Q0", "Document" ,"Relevance"]
evaluate=evaluate.drop(columns=["Q0"])
evaluate

Unnamed: 0,Query_Number,Document,Relevance
0,AILA_Q1,C168,0
1,AILA_Q1,C382,0
2,AILA_Q1,C428,0
3,AILA_Q1,C949,0
4,AILA_Q1,C2303,0
...,...,...,...
145695,AILA_Q50,C1367,0
145696,AILA_Q50,C2079,0
145697,AILA_Q50,C2066,0
145698,AILA_Q50,C1951,0


### A Glimpse about how the data inside the csv file looks!

In [54]:
df = pd.read_csv('object_casedocs.csv',header=None)
df.columns = ["Text"]
df

Unnamed: 0,Text
0,L. Laxmikanta v State by Superintendent of Pol...
1,Homi Rajvansh v State of Maharashtra and other...
2,Direct Recruit Class Ii Engineering OfficersAs...
3,Rajinder Kumar Kindra v Delhi Administration T...
4,Kalyan and Others v State of Uttar Pradesh Sup...
...,...
2909,Haryana State Cooperative Labour and others v ...
2910,State of Karnataka v Chikkahottappa Alias Vara...
2911,Kilari Malakondiaah @ Malayadri and Others v S...
2912,Kanthimathy Plantations Pvt- Limited v State O...


In [55]:
df = pd.concat([df_filename,df], axis = 1)
df

Unnamed: 0,Name,Text
0,C757.txt,L. Laxmikanta v State by Superintendent of Pol...
1,C1092.txt,Homi Rajvansh v State of Maharashtra and other...
2,C1985.txt,Direct Recruit Class Ii Engineering OfficersAs...
3,C39.txt,Rajinder Kumar Kindra v Delhi Administration T...
4,C2055.txt,Kalyan and Others v State of Uttar Pradesh Sup...
...,...,...
2909,C792.txt,Haryana State Cooperative Labour and others v ...
2910,C2775.txt,State of Karnataka v Chikkahottappa Alias Vara...
2911,C924.txt,Kilari Malakondiaah @ Malayadri and Others v S...
2912,C432.txt,Kanthimathy Plantations Pvt- Limited v State O...


### Let us get some basic information about the data

In [56]:
len(df)

2914

In [57]:
df.shape

(2914, 2)

In [58]:
df.info

<bound method DataFrame.info of            Name                                               Text
0      C757.txt  L. Laxmikanta v State by Superintendent of Pol...
1     C1092.txt  Homi Rajvansh v State of Maharashtra and other...
2     C1985.txt  Direct Recruit Class Ii Engineering OfficersAs...
3       C39.txt  Rajinder Kumar Kindra v Delhi Administration T...
4     C2055.txt  Kalyan and Others v State of Uttar Pradesh Sup...
...         ...                                                ...
2909   C792.txt  Haryana State Cooperative Labour and others v ...
2910  C2775.txt  State of Karnataka v Chikkahottappa Alias Vara...
2911   C924.txt  Kilari Malakondiaah @ Malayadri and Others v S...
2912   C432.txt  Kanthimathy Plantations Pvt- Limited v State O...
2913   C244.txt  Union of India and Others v K. P. Prabhakaran ...

[2914 rows x 2 columns]>

## Text preprocessing techniques: Cleansing the data
### 1. Convert to lowercase, remove punctuation and special characters, using RegeX and strip
### 2. Remove stopwords
### 3. Stemming 
### 4. Lemmatization

In [59]:
import re
#Convert lowercase remove punctuation and Character and then strip 
text = df.iloc[0]
print(text)
text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
txt = text.split()
print(txt)

Name                                             C757.txt
Text    L. Laxmikanta v State by Superintendent of Pol...
Name: 0, dtype: object
['name', 'c757txt', 'text', 'l', 'laxmikanta', 'v', 'state', 'by', 'superintendent', 'of', 'pol', 'name', '0', 'dtype', 'object']


In [60]:
#remove stopwords
import nltk
lst_stopwords = nltk.corpus.stopwords.words("english")
txt = [word for word in txt if word not in lst_stopwords]
print(txt)

['name', 'c757txt', 'text', 'l', 'laxmikanta', 'v', 'state', 'superintendent', 'pol', 'name', '0', 'dtype', 'object']


In [61]:
#stemming
ps = nltk.stem.porter.PorterStemmer()
print([ps.stem(word) for word in txt])

['name', 'c757txt', 'text', 'l', 'laxmikanta', 'v', 'state', 'superintend', 'pol', 'name', '0', 'dtype', 'object']


In [62]:
#Lemmetization
nltk.download('wordnet')
lem = nltk.stem.wordnet.WordNetLemmatizer()
print([lem.lemmatize(word) for word in txt])

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['name', 'c757txt', 'text', 'l', 'laxmikanta', 'v', 'state', 'superintendent', 'pol', 'name', '0', 'dtype', 'object']


## Preprocessing the data: Apply these techniques on all records of the dataset

In [63]:
#to apply all the technique to all the records on dataset
def utils_preprocess_text(text, flg_stemm=True, flg_lemm =True, lst_stopwords=None ):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    
    #tokenization(convert from string to List)
    lst_text = text.split()
    
    #remove stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                   lst_stopwords]
        
     #stemming
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
        
    #Lemmentization
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
        
    # back to string from list
    text = " ".join(lst_text)
    return text

df['clean_text'] = df['Text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))

## A Glimpse into the cleansed data!

In [64]:
#df
df

Unnamed: 0,Name,Text,clean_text
0,C757.txt,L. Laxmikanta v State by Superintendent of Pol...,l laxmikanta v state by superintendent of poli...
1,C1092.txt,Homi Rajvansh v State of Maharashtra and other...,homi rajvansh v state of maharashtra and other...
2,C1985.txt,Direct Recruit Class Ii Engineering OfficersAs...,direct recruit class ii engineering officersas...
3,C39.txt,Rajinder Kumar Kindra v Delhi Administration T...,rajinder kumar kindra v delhi administration t...
4,C2055.txt,Kalyan and Others v State of Uttar Pradesh Sup...,kalyan and others v state of uttar pradesh sup...
...,...,...,...
2909,C792.txt,Haryana State Cooperative Labour and others v ...,haryana state cooperative labour and others v ...
2910,C2775.txt,State of Karnataka v Chikkahottappa Alias Vara...,state of karnataka v chikkahottappa alias vara...
2911,C924.txt,Kilari Malakondiaah @ Malayadri and Others v S...,kilari malakondiaah malayadri and others v sta...
2912,C432.txt,Kanthimathy Plantations Pvt- Limited v State O...,kanthimathy plantation pvt limited v state of ...


In [65]:
train = df["clean_text"]
train

0       l laxmikanta v state by superintendent of poli...
1       homi rajvansh v state of maharashtra and other...
2       direct recruit class ii engineering officersas...
3       rajinder kumar kindra v delhi administration t...
4       kalyan and others v state of uttar pradesh sup...
                              ...                        
2909    haryana state cooperative labour and others v ...
2910    state of karnataka v chikkahottappa alias vara...
2911    kilari malakondiaah malayadri and others v sta...
2912    kanthimathy plantation pvt limited v state of ...
2913    union of india and others v k p prabhakaran su...
Name: clean_text, Length: 2914, dtype: object

In [67]:
import texthero as hero

### Creating a test dataframe using the Query file

In [68]:
test = pd.read_csv("/kaggle/input/legalai/Query_doc.txt",delimiter = "|",header=None)
test.columns = ["AILA","NAN", "Query"]
test=test.drop(columns=["AILA","NAN"])

In [69]:
test

Unnamed: 0,Query
0,"The appellant on February 9, 1961 was appointe..."
1,The appellant before us was examined as prime ...
2,This appeal arises from the judgment of the le...
3,The Petitioner was married to the Respondent N...
4,This appeal is preferred against the judgment ...
5,"On 19.3.1999, SI P1 along Ct. P2 went to Villa..."
6,This criminal appeal is directed against the j...
7,"This appeal, by special leave, has been prefer..."
8,The complainant P1 filed a Special Leave Petit...
9,"The four appellants, along with P1 son of P2, ..."


## Cleanse the test data
### We use the same methods as above to cleanse the test data

In [70]:
test['Query_processed'] = test['Query'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))

In [71]:
test

Unnamed: 0,Query,Query_processed
0,"The appellant on February 9, 1961 was appointe...",the appellant on february 9 1961 wa appointed ...
1,The appellant before us was examined as prime ...,the appellant before u wa examined a prime wit...
2,This appeal arises from the judgment of the le...,this appeal arises from the judgment of the le...
3,The Petitioner was married to the Respondent N...,the petitioner wa married to the respondent no...
4,This appeal is preferred against the judgment ...,this appeal is preferred against the judgment ...
5,"On 19.3.1999, SI P1 along Ct. P2 went to Villa...",on 1931999 si p1 along ct p2 went to village v...
6,This criminal appeal is directed against the j...,this criminal appeal is directed against the j...
7,"This appeal, by special leave, has been prefer...",this appeal by special leave ha been preferred...
8,The complainant P1 filed a Special Leave Petit...,the complainant p1 filed a special leave petit...
9,"The four appellants, along with P1 son of P2, ...",the four appellant along with p1 son of p2 wer...


# BM-25 ranking

In [72]:
!pip install rank_bm25



In [73]:
from rank_bm25 import BM25Okapi

query_array_processed = [0]*50

corpus_array_processed = [0]*2914

train_array=df.iloc[:,1:].values

for i in range(2914):
    corpus_array_processed[i] = train_array[i][0]

query_array=test.iloc[:,1:].values

#test["Query_processed"]
#test.values(columns=[test["Query_processed"]])
#query_array[49][0]

for i in range(50):
    query_array_processed[i] = query_array[i][0]

In [87]:
train_array=df.iloc[:,1:].values
tokenized_corpus = [doc.split(" ") for doc in corpus_array_processed]

In [75]:
bm25 = BM25Okapi(tokenized_corpus)
bm25

<rank_bm25.BM25Okapi at 0x7fa74f185610>

In [125]:
name = df["Name"]
name = name.str.rstrip('.txt')
name

0        C757
1       C1092
2       C1985
3         C39
4       C2055
        ...  
2909     C792
2910    C2775
2911     C924
2912     C432
2913     C244
Name: Name, Length: 2914, dtype: object

In [126]:
bm25.get_top_n(corpus_array_processed[4].split(" "), name, n=10)

['C2055',
 'C241',
 'C6',
 'C4',
 'C822',
 'C1511',
 'C1096',
 'C63',
 'C1855',
 'C1357']

In [129]:
evaluate = evaluate.loc[evaluate['Relevance'] == 1]
evaluate

Unnamed: 0,Query_Number,Document,Relevance
1192,AILA_Q1,C14,1
2274,AILA_Q1,C9,1
3076,AILA_Q2,C27,1
3676,AILA_Q2,C22,1
6033,AILA_Q3,C1,1
...,...,...,...
140861,AILA_Q49,C38,1
142203,AILA_Q49,C76,1
142450,AILA_Q49,C92,1
143069,AILA_Q50,C27,1


In [None]:
retrieved = bm25.get_top_n(query_array_processed[i].split(" "), name, n=10)
relevant = evaluate.loc[evaluate['Query_Number'] == "AILA_Q"+str(i+1)]["Document"]

In [128]:
count = 0
for i in range(50):
    for j in retrieved:
        for k in relevant:
            if (j==k):
                count=count+1

print(count)

20


In [132]:
Precision = count/500
Recall = count/195

print(Precision)
print(Recall)

0.04
0.10256410256410256
