# AILA: Artificial Intelligence for Legal Assisatance 
## Similar Case Matching
### We are given a dataset consisting of 2914 prior cases and a test dataset of 50 queries. We need to retrieve the most similar prior case for each of the queries.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

#Imports
import glob
import functools
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import re
import numpy as np 
import pandas as pd

# Input data files are available in the read-only "../input/" directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input/legalai/Object_casedocs'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))
# the above code will list all files under the input directory

## Data Handling
### We first wrap up all the text files into a single csv(comma separated file)

In [2]:
import glob
import csv

read_files = glob.glob('/kaggle/input/legalai/Object_casedocs/*')

with open("object_casedocs.csv", "w") as outfile:
    w=csv.writer(outfile)
    for f in read_files:
        with open(f, "r") as infile:
            w.writerow([" ".join([line.strip() for line in infile])])

### A Glimpse about how the data inside the csv file looks!

In [3]:
df = pd.read_csv('object_casedocs.csv',header=None)
df.columns = ["Text"]
df

Unnamed: 0,Text
0,L. Laxmikanta v State by Superintendent of Pol...
1,Homi Rajvansh v State of Maharashtra and other...
2,Direct Recruit Class Ii Engineering OfficersAs...
3,Rajinder Kumar Kindra v Delhi Administration T...
4,Kalyan and Others v State of Uttar Pradesh Sup...
...,...
2909,Haryana State Cooperative Labour and others v ...
2910,State of Karnataka v Chikkahottappa Alias Vara...
2911,Kilari Malakondiaah @ Malayadri and Others v S...
2912,Kanthimathy Plantations Pvt- Limited v State O...


### Let us get some basic information about the data

In [4]:
len(df)

2914

In [5]:
df.shape

(2914, 1)

In [6]:
df.info

<bound method DataFrame.info of                                                    Text
0     L. Laxmikanta v State by Superintendent of Pol...
1     Homi Rajvansh v State of Maharashtra and other...
2     Direct Recruit Class Ii Engineering OfficersAs...
3     Rajinder Kumar Kindra v Delhi Administration T...
4     Kalyan and Others v State of Uttar Pradesh Sup...
...                                                 ...
2909  Haryana State Cooperative Labour and others v ...
2910  State of Karnataka v Chikkahottappa Alias Vara...
2911  Kilari Malakondiaah @ Malayadri and Others v S...
2912  Kanthimathy Plantations Pvt- Limited v State O...
2913  Union of India and Others v K. P. Prabhakaran ...

[2914 rows x 1 columns]>

## Text preprocessing techniques: Cleansing the data
### 1. Convert to lowercase, remove punctuation and special characters, using RegeX and strip
### 2. Remove stopwords
### 3. Stemming 
### 4. Lemmatization

In [7]:
import re
#Convert lowercase remove punctuation and Character and then strip 
text = df.iloc[0]
print(text)
text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
txt = text.split()
print(txt)

Text    L. Laxmikanta v State by Superintendent of Pol...
Name: 0, dtype: object
['text', 'l', 'laxmikanta', 'v', 'state', 'by', 'superintendent', 'of', 'pol', 'name', '0', 'dtype', 'object']


In [8]:
#remove stopwords
import nltk
lst_stopwords = nltk.corpus.stopwords.words("english")
txt = [word for word in txt if word not in lst_stopwords]
print(txt)

['text', 'l', 'laxmikanta', 'v', 'state', 'superintendent', 'pol', 'name', '0', 'dtype', 'object']


In [9]:
#stemming
ps = nltk.stem.porter.PorterStemmer()
print([ps.stem(word) for word in txt])

['text', 'l', 'laxmikanta', 'v', 'state', 'superintend', 'pol', 'name', '0', 'dtype', 'object']


In [10]:
#Lemmetization
nltk.download('wordnet')
lem = nltk.stem.wordnet.WordNetLemmatizer()
print([lem.lemmatize(word) for word in txt])

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['text', 'l', 'laxmikanta', 'v', 'state', 'superintendent', 'pol', 'name', '0', 'dtype', 'object']


## Preprocessing the data: Apply these techniques on all records of the dataset

In [11]:
#to apply all the technique to all the records on dataset
def utils_preprocess_text(text, flg_stemm=True, flg_lemm =True, lst_stopwords=None ):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    
    #tokenization(convert from string to List)
    lst_text = text.split()
    
    #remove stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                   lst_stopwords]
        
     #stemming
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
        
    #Lemmentization
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
        
    # back to string from list
    text = " ".join(lst_text)
    return text

df['clean_text'] = df['Text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))

## A Glimpse into the cleansed data!

In [13]:
df

Unnamed: 0,Text,clean_text
0,L. Laxmikanta v State by Superintendent of Pol...,l laxmikanta v state by superintendent of poli...
1,Homi Rajvansh v State of Maharashtra and other...,homi rajvansh v state of maharashtra and other...
2,Direct Recruit Class Ii Engineering OfficersAs...,direct recruit class ii engineering officersas...
3,Rajinder Kumar Kindra v Delhi Administration T...,rajinder kumar kindra v delhi administration t...
4,Kalyan and Others v State of Uttar Pradesh Sup...,kalyan and others v state of uttar pradesh sup...
...,...,...
2909,Haryana State Cooperative Labour and others v ...,haryana state cooperative labour and others v ...
2910,State of Karnataka v Chikkahottappa Alias Vara...,state of karnataka v chikkahottappa alias vara...
2911,Kilari Malakondiaah @ Malayadri and Others v S...,kilari malakondiaah malayadri and others v sta...
2912,Kanthimathy Plantations Pvt- Limited v State O...,kanthimathy plantation pvt limited v state of ...


In [14]:
!pip install texthero==1.0.5 #texthero is a powerful nlp tool



In [15]:
import texthero as hero

## TF-IDF: Term frequency–inverse document frequency 

### A numerical statistic that is intended to reflect how important a word is to a document in a corpus.

In [16]:
df['tfidf'] = hero.do_tfidf(df['clean_text'])

In [17]:
df['tfidf']

0       [0.036527890324617486, 0.022072598945900933, 0...
1       [0.0680975915495247, 0.014402194732130617, 0.0...
2       [0.0, 0.002176426570238964, 0.0172982357520640...
3       [0.0, 0.014077931517055916, 0.0051378696136571...
4       [0.10465800676875278, 0.004918774036765286, 0....
                              ...                        
2909    [0.0, 0.06597640534295754, 0.01203934568864838...
2910    [0.07186571273694932, 0.016887919656505383, 0....
2911    [0.0942349275447202, 0.07402597486681295, 0.01...
2912    [0.0, 0.30147098614966705, 0.0, 0.007074728345...
2913    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0223263946041...
Name: tfidf, Length: 2914, dtype: object

### Creating a test dataframe using the Query file

In [18]:
test = pd.read_csv("/kaggle/input/legalai/Query_doc.txt",delimiter = "|",header=None)
test.columns = ["AILA","NAN", "Query"]
test=test.drop(columns=["AILA","NAN"])

In [19]:
test

Unnamed: 0,Query
0,"The appellant on February 9, 1961 was appointe..."
1,The appellant before us was examined as prime ...
2,This appeal arises from the judgment of the le...
3,The Petitioner was married to the Respondent N...
4,This appeal is preferred against the judgment ...
5,"On 19.3.1999, SI P1 along Ct. P2 went to Villa..."
6,This criminal appeal is directed against the j...
7,"This appeal, by special leave, has been prefer..."
8,The complainant P1 filed a Special Leave Petit...
9,"The four appellants, along with P1 son of P2, ..."


## Cleanse the test data
### We use the same methods as above to cleanse the test data

In [20]:
test['clean_text'] = test['Query'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))

In [21]:
test['tfidf'] = hero.do_tfidf(test['clean_text'])

## A glimpse of how the test dataframe looks like
### It consists of 3 columns- The original query, the cleaned text and the vectorized data

In [22]:
test

Unnamed: 0,Query,clean_text,tfidf
0,"The appellant on February 9, 1961 was appointe...",the appellant on february 9 1961 wa appointed ...,"[0.0, 0.0, 0.0, 0.023265269274371628, 0.0, 0.0..."
1,The appellant before us was examined as prime ...,the appellant before u wa examined a prime wit...,"[0.03603730006560546, 0.0, 0.01801865003280273..."
2,This appeal arises from the judgment of the le...,this appeal arises from the judgment of the le...,"[0.0, 0.17116461074366593, 0.0, 0.108547582725..."
3,The Petitioner was married to the Respondent N...,the petitioner wa married to the respondent no...,"[0.0, 0.09919502078555022, 0.0, 0.104844294602..."
4,This appeal is preferred against the judgment ...,this appeal is preferred against the judgment ...,"[0.034799206112638245, 0.12179722139423385, 0...."
5,"On 19.3.1999, SI P1 along Ct. P2 went to Villa...",on 1931999 si p1 along ct p2 went to village v...,"[0.07263643606978144, 0.12106072678296907, 0.0..."
6,This criminal appeal is directed against the j...,this criminal appeal is directed against the j...,"[0.0, 0.0, 0.0, 0.030927126783117872, 0.041908..."
7,"This appeal, by special leave, has been prefer...",this appeal by special leave ha been preferred...,"[0.05344425372445778, 0.12470325869040148, 0.0..."
8,The complainant P1 filed a Special Leave Petit...,the complainant p1 filed a special leave petit...,"[0.026273195373738057, 0.19704896530303542, 0...."
9,"The four appellants, along with P1 son of P2, ...",the four appellant along with p1 son of p2 wer...,"[0.05659528663050006, 0.08489292994575008, 0.0..."


# Vectorizing the data using Tfidf Vectorizer

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the data
vec = TfidfVectorizer()

X = vec.fit_transform([df['clean_text'][0]]) # `X` will now be a TF-IDF representation of the data, the first row of `X` corresponds to the first sentence in `data`
Y = vec.transform([test['clean_text'][0]]) # 'Y' will now be a TF-IDF representation of the data

## Finding Cosine Similarity for each of the Queries to each of the training samples
### For each of the 50 queries, we iterate over all the prior cases and find the one which has the highest cosine similarity with the that query. We then append both these values into separate arrays.

In [24]:
max_index_array = []
max_similarity_score_array = []

for i in range(len(test)):
    Y = vec.fit_transform([test['clean_text'][i]])
    
    max_similarity = 0
    max_index = -1
    for j in range(len(df)):
        
        X = vec.transform([df['clean_text'][j]])
        S = cosine_similarity(X,Y)
        #print(S[i][0])
        if (S[0][0]>max_similarity):
            max_similarity = S[0][0]
            max_index = j
        
    max_index_array.append(max_index)
    max_similarity_score_array.append(max_similarity)
    

In [28]:
print(max_index_array) # the index of the prior document with the highest cosine similarity for each of the 50 queries

[232, 822, 339, 2694, 663, 1089, 1428, 1559, 668, 906, 2770, 2379, 2525, 1238, 1138, 2490, 2604, 2187, 2774, 2394, 1940, 402, 2110, 1596, 1244, 822, 295, 28, 1141, 1598, 918, 1534, 2285, 1688, 1858, 746, 2507, 1179, 2180, 2315, 2575, 2694, 2390, 2073, 2649, 1381, 2575, 1873, 79, 822]


In [29]:
print(max_similarity_score_array) # the value of the cosine similarity for each of the 50 queries

[0.9529367001327095, 0.9409197374568615, 0.9370746787515178, 0.9434257480309546, 0.9284010961694416, 0.9178189599766118, 0.9495154131757083, 0.940872962834488, 0.941410860601876, 0.8997674205562887, 0.9552130351589725, 0.8013936362070003, 0.9263123745716466, 0.9002578877916514, 0.978782413885863, 0.8825793489183309, 0.9498736471895454, 0.9468904718093757, 0.96733592138956, 0.949479776054762, 0.9472911853507456, 0.9709765904549609, 0.9347725228457751, 0.9506069528132177, 0.9326516683507472, 0.9613735520602388, 0.893275688670986, 0.9114074610301138, 0.9482734197666621, 0.9563297685390909, 0.9536303583580877, 0.9131507471776771, 0.9488887516914465, 0.9399663087901251, 0.9493459348775186, 0.9017758603772148, 0.9448850183294216, 0.9361226750802891, 0.9567124763482466, 0.9129556972813528, 0.9592978985338997, 0.9136064028313299, 0.8767267584477333, 0.9379712924011954, 0.945967620620473, 0.922772599949533, 0.964234556401232, 0.9355647739786285, 0.880158553263502, 0.9387155582204889]
