# In [None]:
import re
import spacy
import json
import os

# Load the spaCy pipeline "en_core_web_sm" once at module level; extract_features
# calls it to split document text into sentences.
nlp = spacy.load("en_core_web_sm")

# Function to extract features from a single text document
def extract_features(text, filename):
    """Extract a flat dictionary of metadata features from one judgment text.

    Args:
        text: Full text of the judgment document.
        filename: Name of the source file; its stem becomes part of ``case_id``.

    Returns:
        A dict of features in a fixed insertion order. The keys
        ``'Date of Judgment'``, ``'Court Name'``, ``'Case Type'`` and
        ``'Case Outcome'`` are present only when their pattern matches;
        ``'case_title'`` and ``'author'`` are always present and are ``None``
        when not found. Every other key is always present.
    """
    features = {}

    # Drop the "Take notes ... Sign up today ..." banner before any matching.
    text = re.sub(
        r"Take notes.*?-- Sign up today.*?\.", "", text, flags=re.DOTALL
    ).strip()

    # 1. Date of judgment, e.g. "5 May, 1950".
    date_match = re.search(r'\b\d{1,2} [A-Z][a-z]+, \d{4}\b', text)
    if date_match:
        features['Date of Judgment'] = date_match.group()

    # 2. Case title (the text between the court header line and " on") and author.
    case_title_match = re.search(r"^.*?Supreme Court of India\n(.*?) on", text)
    features['case_title'] = (
        case_title_match.group(1).strip() if case_title_match else None
    )

    author_match = re.search(r"Author:\s*(.*?)\n", text)
    features['author'] = author_match.group(1).strip() if author_match else None

    # 3. Court name.
    court_match = re.search(
        r'\b(Supreme Court of India|High Court of \w+|District Court)\b', text
    )
    if court_match:
        features['Court Name'] = court_match.group()

    # 4. Jurisdiction level, decided by the highest court mentioned.
    if "Supreme Court" in text:
        features['Jurisdiction Level'] = "Appellate"
    elif "High Court" in text:
        features['Jurisdiction Level'] = "Appellate or Original"
    else:
        features['Jurisdiction Level'] = "Trial"

    # 5. Geographical region: the single word preceding "High Court", else "India".
    state_match = re.search(r'\b(\w+) High Court\b', text)
    features['Geographical Region'] = state_match.group(1) if state_match else "India"

    # 6. Case type.
    if "CIVIL" in text or "Civil Appeal" in text:
        features['Case Type'] = "Civil Appeal"
    elif "CRIMINAL" in text:
        features['Case Type'] = "Criminal"

    # 7. Legal area (first matching category) and 8. core legal issues (all matches).
    features['Legal Area'] = _classify_legal_area(text)
    issues = _find_core_legal_issues(text)
    features['Core Legal Issues'] = issues

    # 9. Cited precedents of the form "Name v. Name".
    features['Cited Precedents'] = re.findall(
        r'\b[A-Z][a-z]+\s+v\.\s+[A-Z][a-z]+\b', text
    )

    # 10-11. Size measures.
    features['Paragraph Count'] = len(text.split('\n\n'))
    features['Word Count'] = len(text.split())

    # 12. Language complexity: mean sentence length as measured by spaCy.
    features['Average Sentence Length'] = _average_sentence_length(text)

    # 13. Sentiment/tone is not computed; a fixed label is stored.
    features['Sentiment/Tone'] = "Neutral"

    # 14-15. Judges and bench size.
    features['Judges'] = _extract_judges(text)
    features['Bench Composition'] = len(features['Judges'])

    # 16. Concurrence/dissent status.
    features['Concurrence/Dissent Status'] = (
        "Concurrence" if "concurrence" in text.lower() else "Unanimous"
    )

    # 17. Case outcome.
    outcome_match = re.search(
        r'\bappeals? (allowed|dismissed|partially allowed|granted)\b',
        text,
        re.IGNORECASE,
    )
    if outcome_match:
        features['Case Outcome'] = outcome_match.group().capitalize()

    # 18. Relief granted: sum of every "Rs." amount mentioned in the text.
    features['Relief Granted'] = _total_relief(text)

    # 19. Appeal status.
    features['Appeal Status'] = (
        "Appellate Decision" if "appeal" in text.lower() else "Original Jurisdiction"
    )

    # 20. Number of citations.
    features['Number of Citations'] = len(features['Cited Precedents'])

    # 21. Statutes/articles cited, de-duplicated in order of first appearance so
    # the output does not vary between runs (set iteration order is not stable
    # for strings across interpreter invocations).
    statutes = list(dict.fromkeys(re.findall(r'\b(?:Section|Article)\s+\d+\b', text)))
    features['Primary Statutes'] = statutes
    features['Number of Primary Statutes'] = len(statutes)

    features['case_id'] = (
        f"{_court_abbreviation(text)}_{os.path.splitext(filename)[0]}"
    )
    features['Number of Core Legal Issues'] = len(issues)

    return features


# Legal area -> regex of acts/keywords; the first matching entry (in this order) wins.
_LEGAL_AREA_PATTERNS = {
    "Motor Accident Claim": r"Motor Vehicles Act|Motor Accident",
    "Property Dispute": r"Transfer of Property Act|Land Acquisition Act",
    "Family Law": r"Hindu Marriage Act|Divorce|Custody|Alimony|Maintenance",
    "Contract Law": r"Indian Contract Act|Breach of Contract",
    "Labour Law": r"Industrial Disputes Act|Labour Welfare|Employment Dispute",
    "Intellectual Property": r"Copyright Act|Trademark Act|Patent Act|IPR",
    "Tax Law": r"Income Tax Act|GST Act|Wealth Tax|Direct Tax",
    "Criminal Law": r"Indian Penal Code|Criminal Procedure Code|IPC|CrPC",
    "Constitutional Law": r"Constitution of India|Fundamental Rights|Writ Petition",
    "Environmental Law": r"Environment Protection Act|Wildlife Protection Act|Pollution Control",
    # Add more legal areas and their corresponding keywords or acts as needed
}

# Core legal issue -> regex of keywords; every matching entry is reported.
_CORE_ISSUE_PATTERNS = {
    "Compensation": r"compensation|damages|reparation",
    "Dependency": r"dependency|dependents|support",
    "Negligence": r"negligence|carelessness|breach of duty",
    "Liability": r"liability|responsibility|accountability",
    "Family Pension": r"family pension|survivor benefits",
    "Breach of Contract": r"breach of contract|contract violation",
    "Property Rights": r"property rights|ownership|title dispute",
    "Environmental Protection": r"environmental protection|pollution|conservation",
    "Intellectual Property": r"intellectual property|patent|copyright|trademark",
    "Employment Dispute": r"employment dispute|labor rights|unfair dismissal",
    "Criminal Offense": r"criminal offense|crime|offense",
    "Fraud": r"fraud|deception|misrepresentation",
    "Defamation": r"defamation|slander|libel",
    "Divorce": r"divorce|marital dissolution|separation",
    "Child Custody": r"child custody|custodial rights|parental rights",
    "Inheritance": r"inheritance|succession|estate",
    # Add more issues and keywords as needed
}

# Court name as it appears in the text -> abbreviation used in case_id.
_COURT_ABBREVIATIONS = {
    "Supreme Court of India": "SCI",
    "Allahabad High Court": "AHC",
    "Madhya Pradesh High Court": "MPHC",
}


def _classify_legal_area(text):
    """Return the first legal area whose pattern occurs in *text*, else "General Law"."""
    for area, pattern in _LEGAL_AREA_PATTERNS.items():
        if re.search(pattern, text, re.IGNORECASE):
            return area
    return "General Law"


def _find_core_legal_issues(text):
    """Return all core issues whose pattern occurs in *text*, else ["General Issue"]."""
    matched = [
        issue
        for issue, pattern in _CORE_ISSUE_PATTERNS.items()
        if re.search(pattern, text, re.IGNORECASE)
    ]
    return matched if matched else ["General Issue"]


def _average_sentence_length(text, chunk_size=100000):
    """Return the mean spaCy sentence length (in tokens) of *text*, rounded to 2 places.

    The text is fed to the module-level ``nlp`` pipeline in slices of
    ``chunk_size`` characters (presumably to bound the size of each spaCy call
    on long judgments -- confirm). Slices are cut at fixed offsets, so a
    sentence that straddles a boundary is counted as two. Returns 0 when no
    sentence is found.
    """
    token_total = 0
    sentence_count = 0
    for start in range(0, len(text), chunk_size):
        doc = nlp(text[start:start + chunk_size])
        for sentence in doc.sents:
            token_total += len(sentence)
            sentence_count += 1
    if not sentence_count:
        return 0
    return round(token_total / sentence_count, 2)


def _extract_judges(text):
    """Return the unique judge names found in *text*, in order of first appearance.

    Names come from "JUDGMENT <name>, J." headings and from the comma-separated
    "Bench:" line. Names are whitespace-stripped and empty entries (for example
    from an empty "Bench:" line) are dropped so they do not inflate the bench size.
    """
    # The name group must start with a letter, so the leading \s* cannot hand
    # whitespace back to it. This matches what the earlier possessive form
    # (\s*+) did, but also compiles on Python versions before 3.11, where
    # possessive quantifiers raise re.error.
    names = re.findall(r'(?i)JUDGMENT\s*([A-Za-z][A-Za-z\s]*),\s*J\.', text)

    bench_match = re.search(r"Bench:\s*(.*?)\n", text)
    if bench_match:
        names.extend(bench_match.group(1).split(','))

    stripped = (name.strip() for name in names)
    # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
    return list(dict.fromkeys(name for name in stripped if name))


def _total_relief(text):
    """Return the sum of all "Rs." amounts in *text* as a float, or 0 if there are none.

    Accepts ungrouped digits ("Rs. 50000") and any comma grouping, including the
    Indian lakh/crore style ("Rs. 1,00,000"), with an optional decimal part.
    """
    amounts = re.findall(r'Rs\.\s?(\d+(?:,\d+)*(?:\.\d+)?)', text)
    if not amounts:
        return 0
    return sum(float(amount.replace(',', '')) for amount in amounts)


def _court_abbreviation(text):
    """Return the abbreviation of the first known court named in *text*, else "UNK"."""
    for court, abbreviation in _COURT_ABBREVIATIONS.items():
        if court in text:
            return abbreviation
    return "UNK"
# Function to process all text files in a folder and combine results into a single JSON file
def process_folder(folder_path):
    """Extract features from every ``.txt`` file in *folder_path* into one JSON file.

    Each file is read as UTF-8 and passed to ``extract_features``; the resulting
    dicts are written as a JSON list to ``combined_features.json`` inside
    *folder_path*. Progress is reported with ``print``.

    Args:
        folder_path: Directory containing the ``.txt`` documents.
    """
    all_features = []  # One feature dict per processed document

    # os.listdir returns names in arbitrary order; sort them so the order of
    # records in the output file is reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        all_features.append(extract_features(text, filename))
        print(f"Processed: {filename}")

    # Save all extracted features to a single JSON file
    combined_json_path = os.path.join(folder_path, "combined_features.json")
    with open(combined_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_features, json_file, indent=4)

    print(f"All features saved to: {combined_json_path}")

# Directory whose .txt documents should be processed
folder_path = "Yearwise_data/1950"

# Run the extraction over every .txt file found in that directory
process_folder(folder_path=folder_path)
