<a href="https://colab.research.google.com/github/Bhakthi45/Generative-AI/blob/main/HACKATHON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Sample dataset
data = {
    'text': [
        "This contract is made between employer and employee for work terms...",
        "This agreement ensures that confidential data is not shared...",
        "This lease agreement allows the tenant to use the premises...",
        "The service provider agrees to deliver services as outlined..."
    ],
    'label': ['Employment', 'NDA', 'Lease', 'Service']
}

df = pd.DataFrame(data)

# Text preprocessing
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = np.array(df['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

# Test with new legal text
new_text = ["This document protects confidential information."]
new_vector = vectorizer.transform(new_text)
print("Predicted Contract Type:", model.predict(new_vector)[0])


Accuracy: 0.00%
Predicted Contract Type: Lease


In [5]:
import spacy

# Load pre-trained legal NLP model
nlp = spacy.load("en_core_web_sm")

# Sample legal document
legal_text = """This Non-Disclosure Agreement (NDA) is entered into on April 5, 2023,
between ABC Corp and XYZ Ltd, ensuring confidential information remains protected."""

# Process text
doc = nlp(legal_text)

# Extract Named Entities
print("Entities in the contract:")
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")


Entities in the contract:
NDA - ORG
April 5, 2023 - DATE
ABC Corp - ORG
XYZ Ltd - ORG


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample legal clauses
clauses = [
    "The employee must provide 30 days' notice before resignation.",
    "The contractor must provide a written notice before terminating services.",
    "Confidential information must not be disclosed to third parties.",
    "The employer reserves the right to terminate employment for misconduct."
]

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clauses)

# Compute cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display similarity
import pandas as pd
df_sim = pd.DataFrame(similarity_matrix, columns=["Clause1", "Clause2", "Clause3", "Clause4"])
print(df_sim)


    Clause1   Clause2   Clause3   Clause4
0  1.000000  0.401162  0.055632  0.103654
1  0.401162  1.000000  0.055632  0.103654
2  0.055632  0.055632  1.000000  0.072123
3  0.103654  0.103654  0.072123  1.000000


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Sample dataset
data = {
    'text': [
        "The employee must not disclose confidential company information.",
        "The contract may be terminated with 30 days' notice.",
        "The employer shall provide health benefits as per the policy.",
        "This agreement protects intellectual property from misuse."
    ],
    'label': ['Confidentiality', 'Termination', 'Employment', 'Confidentiality']
}

df = pd.DataFrame(data)

# Text vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")


Accuracy: 0.00%


In [10]:
!pip install sumy
!python -c "import nltk; nltk.download('punkt')"
!python -c "import nltk; nltk.download('punkt_tab')"

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Sample long legal document
legal_text = """
This Service Agreement is made on April 5, 2023, between ABC Corp and XYZ Ltd.
The Service Provider agrees to deliver IT consulting services for a period of one year.
The Client shall pay a monthly retainer of $5,000. If the Client fails to make timely payments,
the Service Provider reserves the right to terminate services after a 30-day notice period.
All disputes shall be resolved through arbitration.
"""

# Summarization
parser = PlaintextParser.from_string(legal_text, Tokenizer("english"))
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, 2)  # Summarize to 2 sentences

print("Legal Document Summary:")
for sentence in summary:
    print(sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Legal Document Summary:
The Service Provider agrees to deliver IT consulting services for a period of one year.
If the Client fails to make timely payments, the Service Provider reserves the right to terminate services after a 30-day notice period.


In [11]:
import time
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Sample dataset
data = {
    'text': [
        "The employee must provide 30 days' notice before resignation.",
        "The contractor must provide a written notice before terminating services.",
        "Confidential information must not be disclosed to third parties.",
        "The employer reserves the right to terminate employment for misconduct."
    ],
    'label': ['Employment', 'Termination', 'Confidentiality', 'Termination']
}
df = pd.DataFrame(data)

# Text vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = np.array(df['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Naïve Bayes Classifier ---
start_time = time.time()
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb) * 100
nb_time = time.time() - start_time

# --- Random Forest Classifier ---
start_time = time.time()
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf) * 100
rf_time = time.time() - start_time

# --- Named Entity Recognition (NER) ---
nlp = spacy.load("en_core_web_sm")
start_time = time.time()
legal_text = "This contract was signed by XYZ Ltd on April 5, 2023."
doc = nlp(legal_text)
ner_time = time.time() - start_time
ner_entities = [(ent.text, ent.label_) for ent in doc.ents]

# --- Clause Similarity (TF-IDF + Cosine Similarity) ---
start_time = time.time()
tfidf_matrix = vectorizer.transform(df['text'])
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
similarity_time = time.time() - start_time

# --- Legal Document Summarization (TextRank) ---
start_time = time.time()
parser = PlaintextParser.from_string(legal_text, Tokenizer("english"))
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, 2)
summarization_time = time.time() - start_time
summary_text = " ".join([str(sentence) for sentence in summary])

# --- Results ---
comparison_results = pd.DataFrame({
    "Model": ["Naïve Bayes", "Random Forest", "NER", "Clause Similarity", "Summarization"],
    "Accuracy (%)": [nb_accuracy, rf_accuracy, "-", "-", "-"],
    "Execution Time (s)": [nb_time, rf_time, ner_time, similarity_time, summarization_time],
    "Usability (1-5)": [4, 3, 5, 4, 5],
    "Scalability (1-5)": [3, 4, 5, 3, 3]
})

print(comparison_results)
print("\nNER Entities Extracted:", ner_entities)
print("\nSummarized Text:", summary_text)


               Model Accuracy (%)  Execution Time (s)  Usability (1-5)  \
0        Naïve Bayes          0.0            0.012282                4   
1      Random Forest          0.0            0.381444                3   
2                NER            -            0.032886                5   
3  Clause Similarity            -            0.003185                4   
4      Summarization            -            0.070702                5   

   Scalability (1-5)  
0                  3  
1                  4  
2                  5  
3                  3  
4                  3  

NER Entities Extracted: [('XYZ Ltd', 'ORG'), ('April 5, 2023', 'DATE')]

Summarized Text: This contract was signed by XYZ Ltd on April 5, 2023.


In [12]:
import time
import spacy
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Load SpaCy NER model
nlp = spacy.load("en_core_web_sm")

# Sample legal contracts dataset
data = {
    'text': [
        "The employee must provide 30 days' notice before resignation.",
        "The contractor must provide a written notice before terminating services.",
        "Confidential information must not be disclosed to third parties.",
        "The employer reserves the right to terminate employment for misconduct."
    ],
    'label': ['Employment', 'Termination', 'Confidentiality', 'Termination']
}
df = pd.DataFrame(data)

# Convert text data into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = np.array(df['label'])

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Contract Type Classification (Naïve Bayes) ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb) * 100

# --- Clause Classification (Random Forest) ---
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf) * 100

# --- Named Entity Recognition (NER) ---
legal_text = "This contract was signed by XYZ Ltd on April 5, 2023."
doc = nlp(legal_text)
ner_entities = [(ent.text, ent.label_) for ent in doc.ents]

# --- Clause Similarity (TF-IDF + Cosine Similarity) ---
tfidf_matrix = vectorizer.transform(df['text'])
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# --- Legal Document Summarization (TextRank) ---
parser = PlaintextParser.from_string(legal_text, Tokenizer("english"))
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, 2)
summary_text = " ".join([str(sentence) for sentence in summary])

# --- Final Results ---
hybrid_results = pd.DataFrame({
    "Model": ["Naïve Bayes (Contract Type)", "Random Forest (Clause Classification)", "NER (Entity Extraction)", "Clause Similarity", "Summarization"],
    "Accuracy (%)": [nb_accuracy, rf_accuracy, "-", "-", "-"],
})

print(hybrid_results)
print("\nNER Entities Extracted:", ner_entities)
print("\nSummarized Text:", summary_text)


                                   Model Accuracy (%)
0            Naïve Bayes (Contract Type)          0.0
1  Random Forest (Clause Classification)          0.0
2                NER (Entity Extraction)            -
3                      Clause Similarity            -
4                          Summarization            -

NER Entities Extracted: [('XYZ Ltd', 'ORG'), ('April 5, 2023', 'DATE')]

Summarized Text: This contract was signed by XYZ Ltd on April 5, 2023.
