In [13]:
# Importing Liabraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
data = pd.read_csv('/content/dataset_clause_config.csv')

In [3]:

# Preprocessing tasks such as tokenization and character removal
def preprocess_text(text):
    # Tokenization (assuming words are separated by spaces)
    tokens = text.split()
    # Character removal (can be extended as needed)
    tokens = [word.lower().strip(".,!?") for word in tokens]
    return " ".join(tokens)

data['Paragraph'] = data['Paragraph'].apply(preprocess_text)

In [4]:
# Classifying paragraphs into specific clauses using cosine similarity
def classify_paragraph_cosine_similarity(new_paragraph, data):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['Paragraph'])
    new_tfidf = tfidf_vectorizer.transform([new_paragraph])
    similarities = cosine_similarity(new_tfidf, tfidf_matrix)
    max_index = np.argmax(similarities)
    return data.iloc[max_index]['Clause']

In [5]:
# Create a function to train a multi-stage classification model
def train_multi_stage_model(data):
    X_train, X_test, y_train, y_test = train_test_split(data['Paragraph'], data['Clause'], test_size=0.2, random_state=42)

    # Stage 1: Vectorization
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)

    # Stage 2: Classifier (Random Forest)
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train_vec, y_train)

    # Stage 3: Classifier (SVM)
    svm_classifier = make_pipeline(StandardScaler(with_mean=False), SVC())
    svm_classifier.fit(X_train_vec, y_train)

    return vectorizer, rf_classifier, svm_classifier

In [6]:
# Function to classify a paragraph using multi-stage model
def classify_paragraph_multi_stage(new_paragraph, vectorizer, rf_classifier, svm_classifier):
    new_tfidf = vectorizer.transform([new_paragraph])
    rf_prediction = rf_classifier.predict(new_tfidf)[0]
    svm_prediction = svm_classifier.predict(new_tfidf)[0]
    # Final prediction can be based on voting or any other logic
    final_prediction = rf_prediction  # For simplicity, taking RF prediction as final
    return final_prediction

In [10]:
# Evaluate the model
def evaluate_model(X_test, y_test, vectorizer, rf_classifier, svm_classifier):
    X_test_vec = vectorizer.transform(X_test)
    rf_predictions = rf_classifier.predict(X_test_vec)
    svm_predictions = svm_classifier.predict(X_test_vec)
    final_predictions = [rf_predictions[i] for i in range(len(rf_predictions))]
    accuracy = accuracy_score(y_test, final_predictions)
    return accuracy

# Take input from the user
new_paragraph = input("Enter the paragraph: ")

# Train the multi-stage model
vectorizer, rf_classifier, svm_classifier = train_multi_stage_model(data)

# Classify the paragraph using the multi-stage model
result = classify_paragraph_multi_stage(new_paragraph, vectorizer, rf_classifier, svm_classifier)
print("Predicted Clause:", result)

Enter the paragraph: Vessel not to be employed on consecutive short haul trades less than 15 days total.
Predicted Clause: 2


In [11]:
# Predict using cosine similarity
cosine_similarity_result = classify_paragraph_cosine_similarity(new_paragraph, data)
print("Result using cosine similarity method:", cosine_similarity_result)

Result using cosine similarity method: 2


In [14]:
# Evaluate the multi-stage model
X = data['Paragraph']
y = data['Clause']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
accuracy = evaluate_model(X_test, y_test, vectorizer, rf_classifier, svm_classifier)
print("Accuracy of the multi-stage model:", accuracy)

Accuracy of the multi-stage model: 0.6744186046511628


In [15]:
# Evaluate cosine similarity method
y_test_cosine = [classify_paragraph_cosine_similarity(paragraph, data) for paragraph in X_test]
accuracy_cosine = accuracy_score(y_test, y_test_cosine)
print("Accuracy of the cosine similarity method:", accuracy_cosine)

Accuracy of the cosine similarity method: 0.9883720930232558
