In [None]:
#FINE-TUNED MODEL
import ast
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_supaport, accuracy_score
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments


dataset_path = r"C:\Users\ASUS\OneDrive\Documents\Semester 9\Mini Project\NER.csv"
df = pd.read_csv(dataset_path)

df['tokens'] = df['tokens'].apply(ast.literal_eval)
df['BIO_tags'] = df['BIO_tags'].apply(ast.literal_eval)

# Create a mapping from labels to IDs and vice versa
unique_labels = set(label for labels in df['BIO_tags'] for label in labels)
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

df['labels'] = df['BIO_tags'].apply(lambda x: [label_to_id[label] for label in x])

from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df[['tokens', 'labels']])
val_dataset = Dataset.from_pandas(val_df[['tokens', 'labels']])

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        truncation=True, 
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100) 
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

train_dataset.set_format("torch")
val_dataset.set_format("torch")

num_labels = len(label_to_id)
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    logging_steps=50,
    fp16=True
)


data_collator = DataCollatorForTokenClassification(tokenizer)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_predictions = [item for sublist in true_predictions for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average='weighted')
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

model.save_pretrained('./ner_model')
tokenizer.save_pretrained('./ner_model')

Map:   0%|          | 0/8491 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [1]:
# Import necessary libraries
import os
import re
import docx
import torch
import queue
import PyPDF2
import logging
import requests
import threading
import tkinter as tk
from tkinter import ttk
from sklearn.svm import SVC
from bs4 import BeautifulSoup
from tkinter import filedialog, messagebox, Text
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, BertTokenizerFast, BertForTokenClassification




In [2]:
# Loading the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained('./ner_model')
tokenizer = BertTokenizerFast.from_pretrained('./ner_model')
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [3]:
result_queue = queue.Queue()

In [4]:
def upload_file():
    file_path = filedialog.askopenfilename(
        title="Select Case File",
        filetypes=[("PDF Files", "*.pdf"), ("Word Documents", "*.docx"), ("Text Files", "*.txt")]
    )
    if file_path:
        threading.Thread(target=process_file, args=(file_path,), daemon=True).start()
        start_progress_bar()
    else:
        messagebox.showinfo("Info", "No file selected.")

In [5]:
#Processing the file and printing necessary outputs
def process_file(file_path):
    try:
        logging.info(f"File Selected: {file_path}")
        file_label.config(text=f"Selected File Path: {file_path}")

        if file_path.endswith(".pdf"):
            content = extract_pdf(file_path)
        elif file_path.endswith(".docx"):
            content = extract_docx(file_path)
        elif file_path.endswith(".txt"):
            content = extract_txt(file_path)
        else:
            messagebox.showerror("Error", "Unsupported file format.")
            return

        entities = extract_entities(content)
        unique_entities = list({(ent['word'], ent['entity_group']) for ent in entities})
        print("Extracted Entities:", unique_entities)

        search_query = " ".join([ent[0] for ent in unique_entities])
        case_links = search_indian_kanoon(search_query)
        print("Found Case Links:", case_links)

        for i in range(min(len(unique_entities), len(case_links))):
            ent = unique_entities[i]
            link = case_links[i]
            print(f"{ent[0]} : {link}")

        result_queue.put(content)
        root.after(100, update_gui)
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        messagebox.showerror("Error", f"An error occurred: {e}")
    finally:
        stop_progress_bar()

In [6]:
def update_gui():
    try:
        while not result_queue.empty():
            content = result_queue.get()
            display_content(content)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred while updating GUI: {e}")
    finally:
        root.after(100, update_gui)

In [7]:
def display_content(content):
    content_window = tk.Toplevel()
    content_window.title("Extracted Content")
    text_area = Text(content_window, wrap='word')
    text_area.insert('1.0', "Extracted Content:\n\n" + content + "\n\n")
    text_area.config(state='disabled')
    text_area.pack(expand=True, fill='both')
    content_window.geometry("600x500")

In [8]:
def extract_pdf(file_path):
    print(f"Processing PDF: {file_path}")
    content = ""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    content += text + "\n"
    except Exception as e:
        print(f"Error processing PDF: {e}")
    return content.strip()

In [9]:
def extract_docx(file_path):
    print(f"Processing DOCX: {file_path}")
    content = ""
    try:
        doc = docx.Document(file_path)
        for paragraph in doc.paragraphs:
            content += paragraph.text + "\n"
    except Exception as e:
        print(f"Error processing DOCX: {e}")
    return content.strip()

In [10]:
def extract_txt(file_path):
    print(f"Processing TXT: {file_path}")
    content = ""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error processing TXT: {e}")
    return content.strip()

In [11]:
def extract_entities(text):
    try:
        entities = ner_pipeline(text)
        return entities
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return []

In [12]:
def search_indian_kanoon(query):
    base_url = "https://indiankanoon.org/search/?formInput="
    search_url = base_url + query.replace(" ", "+")
    try:
        response = requests.get(search_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        case_links = []
        for link in soup.find_all("a", href=True):
            href = link['href']
            if "/doc/" in href:
                case_links.append("https://indiankanoon.org" + href)
        return case_links
    except Exception as e:
        print(f"Error occurred while searching: {e}")
        return []


In [13]:
def start_progress_bar():
    progress_bar.pack(pady=10)
    progress_bar.start(10)

In [14]:
def stop_progress_bar():
    progress_bar.stop()
    progress_bar.pack_forget()

In [15]:
def search_entities():
    query = search_entry.get()
    if query:
        threading.Thread(target=process_search, args=(query,), daemon=True).start()
        start_progress_bar()
    else:
        messagebox.showinfo("Info", "Please enter a search term.")

In [16]:
def display_search_results(entities, case_links):
    results_window = tk.Toplevel()
    results_window.title("Search Results")
    results_window.geometry("600x500")
    text_area = Text(results_window, wrap='word')

    content = "Extracted Entities and Case Links:\n\n"
    for ent, link in zip(entities, case_links):
        content += f"{ent['word']} : {link}\n"
    
    # Display clickable links in text area
    text_area.insert('1.0', content)
    text_area.config(state='disabled')
    text_area.pack(expand=True, fill='both')

    for link in case_links:
        text_area.insert(tk.END, f"\n{link}\n", 'hyperlink')


In [17]:
def process_search(query):
    try:
        entities = extract_entities(query)
        print("Extracted Entities from Search Query:", entities)

        search_query = " ".join([ent['word'] for ent in entities])
        case_links = search_indian_kanoon(search_query)
        print("Found Case Links:", case_links)
        content = "Extracted Entities and Case Links:\n\n"
        for ent, link in zip(entities, case_links):
            content += f"{ent['word']} : {link}\n"
        print(content)
        
        display_search_results(entities, case_links)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred during the search: {e}")
    finally:
        stop_progress_bar()

In [18]:
# Initialize the Tkinter GUI
root = tk.Tk()
root.title("File Upload and Content Extractor")
root.geometry("600x400")
root.config(bg="#f0f0f0")

In [19]:
instruction_label = tk.Label(root, text="Upload the case file as .pdf, .docx or .txt",
                               bg="#f0f0f0", font=("Arial", 12))
instruction_label.pack(pady=10)

In [20]:
# Entry for search term
search_entry = tk.Entry(root, width=50, font=("Arial", 12))
search_entry.pack(pady=10)

In [21]:
# Search button
search_btn = tk.Button(root, text="Search", command=search_entities, width=20, bg="#4CAF50", fg="white", font=("Arial", 12))
search_btn.pack(pady=10)

In [22]:
# Frame for the upload button
frame = tk.Frame(root, bg="#ffffff", bd=2, relief=tk.GROOVE)
frame.pack(pady=20, padx=10, fill='both', expand=True)

In [23]:
upload_btn = tk.Button(frame, text="Upload File", command=upload_file, width=20, bg="#4CAF50", fg="white", font=("Arial", 12))
upload_btn.pack(pady=10)

In [24]:
# Label to display selected file path
file_label = tk.Label(frame, text="Selected File Path: ", bg="#f0f0f0", font=("Arial", 10))
file_label.pack(pady=5)

In [25]:
# Progress bar (initially hidden)
progress_bar = ttk.Progressbar(root, orient="horizontal", mode="indeterminate")

In [26]:
# Start the Tkinter event loop
root.mainloop()

In [27]:
import ast
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
import asyncio
import queue
import logging
import requests
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, Text
import PyPDF2
import docx
from bs4 import BeautifulSoup

# Load the cases CSV file
cases_df = pd.read_csv(r"C:\Users\ASUS\OneDrive\Documents\Semester 9\Mini Project\Cases.csv")

# Check the CSV structure (make sure 'CaseName' and 'CaseDetails' columns exist)
print(cases_df.head())

dataset_path = r"C:\Users\ASUS\OneDrive\Documents\Semester 9\Mini Project\NER.csv"
df = pd.read_csv(dataset_path)

df['tokens'] = df['tokens'].apply(ast.literal_eval)
df['BIO_tags'] = df['BIO_tags'].apply(ast.literal_eval)

# Create a mapping from labels to IDs and vice versa
unique_labels = set(label for labels in df['BIO_tags'] for label in labels)
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

df['labels'] = df['BIO_tags'].apply(lambda x: [label_to_id[label] for label in x])

from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df[['tokens', 'labels']])
val_dataset = Dataset.from_pandas(val_df[['tokens', 'labels']])

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        truncation=True, 
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100) 
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

train_dataset.set_format("torch")
val_dataset.set_format("torch")

num_labels = len(label_to_id)
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    logging_steps=50,
    fp16=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

tokenizer.save_pretrained('./ner_model')

# Loading the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained('./ner_model')
tokenizer = BertTokenizerFast.from_pretrained('./ner_model')
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

result_queue = queue.Queue()

# Function to upload case files
def upload_file():
    file_path = filedialog.askopenfilename(
        title="Select Case File",
        filetypes=[("PDF Files", "*.pdf"), ("Word Documents", "*.docx"), ("Text Files", "*.txt")]
    )
    if file_path:
        threading.Thread(target=process_file, args=(file_path,), daemon=True).start()
        start_progress_bar()
    else:
        messagebox.showinfo("Info", "No file selected.")

# Processing the file and printing necessary outputs
def process_file(file_path):
    try:
        logging.info(f"File Selected: {file_path}")
        file_label.config(text=f"Selected File Path: {file_path}")

        if file_path.endswith(".pdf"):
            content = extract_pdf(file_path)
        elif file_path.endswith(".docx"):
            content = extract_docx(file_path)
        elif file_path.endswith(".txt"):
            content = extract_txt(file_path)
        else:
            messagebox.showerror("Error", "Unsupported file format.")
            return

        # Extract entities from content
        entities = extract_entities(content)
        unique_entities = list({(ent['word'], ent['entity_group']) for ent in entities})
        print("Extracted Entities:", unique_entities)

        # Search for these entities in the Cases CSV file
        case_matches = []
        for ent in unique_entities:
            word = ent[0]  # The entity word
            # Check for the entity word in the 'CaseName' or 'CaseDetails' column
            matched_cases = cases_df[cases_df['CaseName'].str.contains(word, case=False, na=False)]
            case_matches.extend(matched_cases[['CaseName', 'CaseDetails']].values.tolist())

        if case_matches:
            print("Matched Cases:")
            for case in case_matches:
                print(f"Case Name: {case[0]}, Case Details: {case[1]}")

            # Display matched cases in the GUI
            result_queue.put(case_matches)
            root.after(100, update_gui)
        else:
            print("No matching cases found.")
            
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        messagebox.showerror("Error", f"An error occurred: {e}")
    finally:
        stop_progress_bar()

def update_gui():
    try:
        while not result_queue.empty():
            content = result_queue.get()
            display_content(content)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred while updating GUI: {e}")
    finally:
        root.after(100, update_gui)

def display_content(content):
    content_window = tk.Toplevel()
    content_window.title("Extracted Entities and Case Matches")
    text_area = Text(content_window, wrap='word')

    content_to_display = "Matched Cases:\n\n"
    for case in content:
        content_to_display += f"Case Name: {case[0]}\nCase Details: {case[1]}\n\n"

    text_area.insert('1.0', content_to_display)
    text_area.config(state='disabled')
    text_area.pack(expand=True, fill='both')
    content_window.geometry("600x500")

def extract_pdf(file_path):
    print(f"Processing PDF: {file_path}")
    content = ""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    content += text + "\n"
    except Exception as e:
        print(f"Error processing PDF: {e}")
    return content.strip()

def extract_docx(file_path):
    print(f"Processing DOCX: {file_path}")
    content = ""
    try:
        doc = docx.Document(file_path)
        for paragraph in doc.paragraphs:
            content += paragraph.text + "\n"
    except Exception as e:
        print(f"Error processing DOCX: {e}")
    return content.strip()

def extract_txt(file_path):
    print(f"Processing TXT: {file_path}")
    content = ""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error processing TXT: {e}")
    return content.strip()

def extract_entities(text):
    try:
        entities = ner_pipeline(text)
        return entities
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return []

def start_progress_bar():
    progress_bar.pack(pady=10)
    progress_bar.start(10)

def stop_progress_bar():
    progress_bar.stop()
    progress_bar.pack_forget()

# Initialize the Tkinter GUI
root = tk.Tk()
root.title("File Upload and Content Extractor")
root.geometry("600x400")
root.config(bg="#f0f0f0")
instruction_label = tk.Label(root, text="Upload the case file as .pdf, .docx or .txt",
                               bg="#f0f0f0", font=("Arial", 12))
instruction_label.pack(pady=10)

# File label widget definition
file_label = tk.Label(root, text="Selected File Path: ", bg="#f0f0f0", font=("Arial", 10))
file_label.pack(pady=10)

upload_btn = tk.Button(root, text="Upload File", command=upload_file, bg="#4CAF50", fg="white", font=("Arial", 12), height=2)
upload_btn.pack(pady=10)

search_btn = tk.Button(root, text="Search", command=upload_file, bg="#2196F3", fg="white", font=("Arial", 12), height=2)
search_btn.pack(pady=10)

progress_bar = ttk.Progressbar(root, mode='indeterminate')

root.mainloop()


                                          Case_Title             Citation  \
0    State Of Karnataka Vs State Of Karnataka (1986)  2013 BLA 1241 (KAR)   
1  Karnataka Wakf Board vs. State of Karnataka an...   2003 BLA 1243 (SC)   
2  Super Sales Corporation Vs Sr. Superintendent ...  1999 BLA 1246 (OHC)   
3  Lakshmi Narayana Puli vs Indian Oil Corporatio...   2012 BLA 1248 (AP)   
4      Union of India vs Anwar Ahemad Qureshi (1998)  1998 BLA 1249 (BOM)   

                       Court Judgement_Date  \
0       Karnataka High Court       3/1/2013   
1     Supreme Court of India     30-01-2003   
2          Orissa High Court     16-01-1999   
3  Andhra Pradesh High Court       7/1/2012   
4          Bombay High Court     28-01-1998   

                                              Judges  \
0          K.N. Keshavanarayana J, Ram Mohan Reddy J   
1  S. Rajendra Babu J, K.G. Balakrishnan J, G.P. ...   
2                          R.K. Patra J, R.K. Dash J   
3                  Madan B. Lo

Map:   0%|          | 0/8491 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
import ast
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
import queue
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, Text
import logging

# Load the dataset
dataset_path = r"C:\Users\ASUS\OneDrive\Documents\Semester 9\Mini Project\Cases.csv"
cases_df = pd.read_csv(dataset_path)

# Define the tokenizer and model for NER (pre-trained or fine-tuned model)
tokenizer = BertTokenizerFast.from_pretrained('./ner_model')
model = BertForTokenClassification.from_pretrained('./ner_model')
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

result_queue = queue.Queue()

# Function to search for relevant cases from the dataset
def search_cases_from_dataset(query):
    filtered_cases = cases_df[cases_df.apply(
        lambda row: row.astype(str).str.contains(query, case=False, na=False).any(), axis=1
    )]
    
    case_links = []
    for _, row in filtered_cases.iterrows():
        case_links.append({
            'Case_Title': row['Case_Title'],
            'Citation': row['Citation'],
            'Court': row['Court'],
            'Judgement_Date': row['Judgement_Date'],
            'Judges': row['Judges'],
            'Petitioner': row['Petitioner'],
            'Respondent': row['Respondent'],
            'Bench': row['Bench'],
            'Legal_Laws': row['Legal_Laws'],
            'Preceding': row['Preceding'],
            'Result': row['Result']
        })
    
    return case_links


# Function to process the uploaded case files (PDF, DOCX, TXT)
def upload_file():
    file_path = filedialog.askopenfilename(
        title="Select Case File",
        filetypes=[("PDF Files", "*.pdf"), ("Word Documents", "*.docx"), ("Text Files", "*.txt")]
    )
    if file_path:
        threading.Thread(target=process_file, args=(file_path,), daemon=True).start()
        start_progress_bar()
    else:
        messagebox.showinfo("Info", "No file selected.")

# Function to process the file and print necessary outputs
def process_file(file_path):
    try:
        logging.info(f"File Selected: {file_path}")
        file_label.config(text=f"Selected File Path: {file_path}")

        if file_path.endswith(".pdf"):
            content = extract_pdf(file_path)
        elif file_path.endswith(".docx"):
            content = extract_docx(file_path)
        elif file_path.endswith(".txt"):
            content = extract_txt(file_path)
        else:
            messagebox.showerror("Error", "Unsupported file format.")
            return

        entities = extract_entities(content)
        unique_entities = list({(ent['word'], ent['entity_group']) for ent in entities})
        print("Extracted Entities:", unique_entities)

        search_query = " ".join([ent[0] for ent in unique_entities])
        case_links = search_cases_from_dataset(search_query)  # Search in dataset instead of Indian Kanoon
        print("Found Case Links:", case_links)

        # Display extracted entities and case links
        for i in range(min(len(unique_entities), len(case_links))):
            ent = unique_entities[i]
            link = case_links[i]
            print(f"{ent[0]} : {link}")

        result_queue.put(content)
        root.after(100, update_gui)
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        messagebox.showerror("Error", f"An error occurred: {e}")
    finally:
        stop_progress_bar()

# Update the GUI with the extracted content
def update_gui():
    try:
        while not result_queue.empty():
            content = result_queue.get()
            display_content(content)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred while updating GUI: {e}")
    finally:
        root.after(100, update_gui)

# Display the extracted content in a new window
def display_content(content):
    content_window = tk.Toplevel()
    content_window.title("Extracted Content")
    text_area = Text(content_window, wrap='word')
    text_area.insert('1.0', "Extracted Content:\n\n" + content + "\n\n")
    text_area.config(state='disabled')
    text_area.pack(expand=True, fill='both')
    content_window.geometry("600x500")

# Extract content from PDF files
def extract_pdf(file_path):
    print(f"Processing PDF: {file_path}")
    content = ""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    content += text + "\n"
    except Exception as e:
        print(f"Error processing PDF: {e}")
    return content.strip()

# Extract content from DOCX files
def extract_docx(file_path):
    print(f"Processing DOCX: {file_path}")
    content = ""
    try:
        doc = docx.Document(file_path)
        for paragraph in doc.paragraphs:
            content += paragraph.text + "\n"
    except Exception as e:
        print(f"Error processing DOCX: {e}")
    return content.strip()

# Extract content from TXT files
def extract_txt(file_path):
    print(f"Processing TXT: {file_path}")
    content = ""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error processing TXT: {e}")
    return content.strip()

# Extract entities using the fine-tuned NER model
def extract_entities(text):
    try:
        entities = ner_pipeline(text)
        return entities
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return []

# Start the progress bar
def start_progress_bar():
    progress_bar.pack(pady=10)
    progress_bar.start(10)

# Stop the progress bar
def stop_progress_bar():
    progress_bar.stop()
    progress_bar.pack_forget()

# Initialize the Tkinter GUI
root = tk.Tk()
root.title("File Upload and Content Extractor")
root.geometry("600x400")
root.config(bg="#f0f0f0")
instruction_label = tk.Label(root, text="Upload the case file as .pdf, .docx or .txt",
                               bg="#f0f0f0", font=("Arial", 12))
instruction_label.pack(pady=10)

# Entry for search term
search_entry = tk.Entry(root, width=50, font=("Arial", 12))
search_entry.pack(pady=10)

# Search button
search_btn = tk.Button(root, text="Search", command=upload_file, width=20, bg="#4CAF50", fg="white", font=("Arial", 12))
search_btn.pack(pady=10)

# Frame for the upload button
frame = tk.Frame(root, bg="#ffffff", bd=2, relief=tk.GROOVE)
frame.pack(pady=20, padx=10, fill='both', expand=True)
upload_btn = tk.Button(frame, text="Upload File", command=upload_file, width=20, bg="#4CAF50", fg="white", font=("Arial", 12))
upload_btn.pack(pady=10)

# Label to display selected file path
file_label = tk.Label(frame, text="Selected File Path: ", bg="#f0f0f0", font=("Arial", 10))
file_label.pack(pady=5)

# Progress bar (initially hidden)
progress_bar = ttk.Progressbar(root, orient="horizontal", mode="indeterminate")

# Start the Tkinter event loop
root.mainloop()


Processing PDF: C:/Users/ASUS/OneDrive/Documents/Semester 9/Mini Project/Case File.pdf
Extracted Entities: [(', 560001 - Model of Phone Purchased : iPhone 14 Pro - Price Paid : â‚¹1, 20, 000 - Receipt Number : 789456123 Upon purchasing the phone,', 'LABEL_26'), ('15, 2024', 'LABEL_15'), ('##nagar,', 'LABEL_26'), ('. Case Number : 2024 - CR - 1567 Date Filed :', 'LABEL_26'), ('##hakar. rao @ example. com Defendant Information : - Name :', 'LABEL_26'), ('Karnataka', 'LABEL_3'), ('California', 'LABEL_3'), ('21, 202', 'LABEL_15'), (', 560022 - Contact Number : + 91 98765 43210 - Email : p', 'LABEL_26'), ('Inc', 'LABEL_18'), ('Cup', 'LABEL_3'), ('##ino', 'LABEL_16'), ('Indira', 'LABEL_3'), ('##4 Court : District Consumer Disputes Redressal Forum,', 'LABEL_26'), ('USA', 'LABEL_3'), ('- Purchase Location : iRetail Store, 12, MG Road,', 'LABEL_26'), ('P', 'LABEL_8'), ('##rab', 'LABEL_6'), ('Complainant Information : - Name :', 'LABEL_26'), ('##ert', 'LABEL_26'), ('India', 'LABEL_3'), ('Apple',