In [10]:
# Step 1: Import all required libraries
import json
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
import pickle

print("✅ All libraries imported successfully!")
print("\n" + "="*60)

✅ All libraries imported successfully!



In [11]:
# Step 2: Load and inspect the JSON data
def load_json_data(file_path):
    """Load contract data from JSON file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Successfully loaded {len(data)} documents from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ Error: File '{file_path}' not found!")
        return None
    except json.JSONDecodeError:
        print(f"❌ Error: Invalid JSON format in file '{file_path}'!")
        return None
    except Exception as e:
        print(f"❌ Error loading file: {e}")
        return None

# Load your data (update the path as needed)
json_file_path = '/content/COMBINED_CONTRACTS.json'  # Change this to your actual path
data = load_json_data(json_file_path)

if data is not None:
    print(f"\n📊 First document sample:")
    print(f"Filename: {data[0]['filename']}")
    print(f"Content preview: {data[0]['content'][:100]}...")
    print(f"\nTotal documents: {len(data)}")

    # Show filename distribution
    filenames = [item['filename'] for item in data]
    print(f"\n📁 Filename examples: {filenames[:5]}...")
else:
    print("❌ Cannot proceed without data")

print("\n" + "="*60)

✅ Successfully loaded 5 documents from /content/COMBINED_CONTRACTS.json

📊 First document sample:
Filename: ACS_PARTNERSHIP_AGREEMENT.pdf
Content preview: synthetic partnership agreements letter 1  partnership agreement partnership agreement example this ...

Total documents: 5

📁 Filename examples: ['ACS_PARTNERSHIP_AGREEMENT.pdf', 'ACS_VENDOR_AGREEMENT.pdf', 'ACS_EMPLOYMENT_CONTRACT.pdf', 'ACS_NDA.pdf', 'ACS_SLA.pdf']...



In [12]:
# Step 3: Extract contract types from filenames and analyze
def extract_contract_type(filename):
    """Extract contract type from filename with more patterns"""
    filename_upper = filename.upper()

    # More comprehensive pattern matching
    if any(keyword in filename_upper for keyword in ['PARTNERSHIP', 'PARTNER']):
        return 'Partnership'
    elif any(keyword in filename_upper for keyword in ['VENDOR', 'SUPPLIER', 'PROCUREMENT']):
        return 'Vendor'
    elif any(keyword in filename_upper for keyword in ['EMPLOYMENT', 'EMPLOYEE', 'JOB', 'HIRE']):
        return 'Employment'
    elif any(keyword in filename_upper for keyword in ['NDA', 'NON_DISCLOSURE', 'CONFIDENTIALITY']):
        return 'NDA'
    elif any(keyword in filename_upper for keyword in ['SLA', 'SERVICE', 'SUPPORT', 'MAINTENANCE']):
        return 'Service'
    elif any(keyword in filename_upper for keyword in ['LEASE', 'RENTAL', 'RENT']):
        return 'Lease'
    elif any(keyword in filename_upper for keyword in ['PURCHASE', 'SALE', 'BUY', 'SELL']):
        return 'Purchase'
    else:
        return 'Unknown'

# Extract contract types
contract_types = []
for item in data:
    contract_type = extract_contract_type(item['filename'])
    contract_types.append(contract_type)

print("📋 Contract Type Distribution:")
type_counter = Counter(contract_types)
for contract_type, count in type_counter.items():
    print(f"   {contract_type}: {count} documents")

print(f"\n📈 Total labeled documents: {sum(type_counter.values())}")
print(f"❓ Unknown documents: {type_counter.get('Unknown', 0)}")

# Show examples of each type
print("\n🔍 Examples of each contract type:")
for contract_type in set(contract_types):
    if contract_type != 'Unknown':
        example_idx = next(i for i, ct in enumerate(contract_types) if ct == contract_type)
        print(f"   {contract_type}: {data[example_idx]['filename']}")

print("\n" + "="*60)

📋 Contract Type Distribution:
   Partnership: 1 documents
   Vendor: 1 documents
   Employment: 1 documents
   NDA: 1 documents
   Service: 1 documents

📈 Total labeled documents: 5
❓ Unknown documents: 0

🔍 Examples of each contract type:
   Partnership: ACS_PARTNERSHIP_AGREEMENT.pdf
   Service: ACS_SLA.pdf
   NDA: ACS_NDA.pdf
   Employment: ACS_EMPLOYMENT_CONTRACT.pdf
   Vendor: ACS_VENDOR_AGREEMENT.pdf



In [13]:
# Step 4: Text preprocessing and cleaning
def advanced_preprocess_text(text):
    """Enhanced text preprocessing"""
    if not text:
        return ""

    # Convert to lowercase
    text = text.lower()

    # Replace common legal abbreviations
    legal_replacements = {
        'inc.': 'incorporated',
        'llc': 'limited liability company',
        'ltd.': 'limited',
        'corp.': 'corporation',
        '&': 'and',
        'w/': 'with',
        'w/o': 'without'
    }

    for abbrev, full_form in legal_replacements.items():
        text = text.replace(abbrev, full_form)

    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters but keep important legal terms
    text = re.sub(r'[^\w\s-]', ' ', text)

    # Remove extra spaces
    text = text.strip()

    return text

# Test preprocessing on a sample
sample_text = data[0]['content'][:200] if data else "Sample contract text with Inc. & LLC"
cleaned_sample = advanced_preprocess_text(sample_text)

print("🧹 Text Preprocessing Test:")
print(f"Original: {sample_text}")
print(f"Cleaned: {cleaned_sample}")

# Preprocess all documents
documents = []
labels = []
unknown_count = 0

for i, item in enumerate(data):
    contract_type = extract_contract_type(item['filename'])
    if contract_type != 'Unknown':
        cleaned_content = advanced_preprocess_text(item['content'])
        documents.append(cleaned_content)
        labels.append(contract_type)
    else:
        unknown_count += 1

print(f"\n📊 Processed {len(documents)} documents for training")
print(f"📊 Skipped {unknown_count} documents with unknown types")
print(f"📊 Label distribution: {Counter(labels)}")

print("\n" + "="*60)

🧹 Text Preprocessing Test:
Original: synthetic partnership agreements letter 1  partnership agreement partnership agreement example this partnership agreement is made this  day of , 20, by and between the following individuals:  address:
Cleaned: synthetic partnership agreements letter 1 partnership agreement partnership agreement example this partnership agreement is made this day of   20  by and between the following individuals  address

📊 Processed 5 documents for training
📊 Skipped 0 documents with unknown types
📊 Label distribution: Counter({'Partnership': 1, 'Vendor': 1, 'Employment': 1, 'NDA': 1, 'Service': 1})



In [14]:
# Step 5: Create TF-IDF vectors and analyze features
def create_enhanced_tfidf():
    """Create TF-IDF vectorizer with legal document optimizations"""
    # Legal stop words to consider
    legal_stopwords = [
        'shall', 'may', 'must', 'will', 'agreement', 'party', 'parties',
        'thereof', 'hereof', 'hereby', 'herein', 'whereas', 'therefore',
        'said', 'such', 'any', 'all', 'each', 'every', 'other', 'same'
    ]

    return TfidfVectorizer(
        max_features=2000,
        ngram_range=(1, 3),  # Include trigrams for legal phrases
        stop_words='english',
        min_df=2,  # Only include terms that appear in at least 2 documents
        max_df=0.95,  # Exclude terms that appear in more than 95% of documents
        sublinear_tf=True,  # Use sublinear TF scaling
        lowercase=True,
        token_pattern=r'\b[a-zA-Z]{2,}\b'  # Only words with 2+ letters
    )

# Create and fit TF-IDF
tfidf = create_enhanced_tfidf()
print("🔧 Creating TF-IDF vectors...")
X = tfidf.fit_transform(documents)

print(f"✅ TF-IDF transformation complete!")
print(f"📐 Feature matrix shape: {X.shape}")
print(f"🔠 Number of features (words/ngrams): {X.shape[1]}")
print(f"📄 Number of documents: {X.shape[0]}")

# Show some feature names
feature_names = tfidf.get_feature_names_out()
print(f"\n🔤 Sample feature names: {feature_names[:20]}...")

print("\n" + "="*60)

🔧 Creating TF-IDF vectors...
✅ TF-IDF transformation complete!
📐 Feature matrix shape: (5, 576)
🔠 Number of features (words/ngrams): 576
📄 Number of documents: 5

🔤 Sample feature names: ['able' 'access' 'accordance' 'accordance provisions' 'according'
 'account' 'acknowledges' 'act' 'actually' 'addition' 'additional'
 'additional services' 'address' 'advance' 'agency' 'agree' 'agreed'
 'agreement agreement' 'agreement agreement effective' 'agreement cause']...



In [15]:
# Step 6: Train the classifier and evaluate
print("🤖 Training Logistic Regression classifier...")

classifier = LogisticRegression(
    random_state=42,
    max_iter=2000,
    class_weight='balanced',  # Handle class imbalance
    C=1.0  # Regularization parameter
)

# Train the model
classifier.fit(X, labels)

# Make predictions
predictions = classifier.predict(X)
accuracy = accuracy_score(labels, predictions)

print(f"✅ Training complete!")
print(f"📊 Training Accuracy: {accuracy:.4f} ({accuracy:.2%})")

# Detailed classification report
print(f"\n📋 Classification Report:")
print(classification_report(labels, predictions))

# Show some example predictions with confidence
print(f"\n🎯 Sample predictions with confidence:")
proba_predictions = classifier.predict_proba(X)
for i in range(min(5, len(documents))):
    actual = labels[i]
    predicted = predictions[i]
    confidence = np.max(proba_predictions[i])
    print(f"  Document {i+1}: Actual={actual}, Predicted={predicted}, Confidence={confidence:.3f}")

print("\n" + "="*60)

🤖 Training Logistic Regression classifier...
✅ Training complete!
📊 Training Accuracy: 1.0000 (100.00%)

📋 Classification Report:
              precision    recall  f1-score   support

  Employment       1.00      1.00      1.00         1
         NDA       1.00      1.00      1.00         1
 Partnership       1.00      1.00      1.00         1
     Service       1.00      1.00      1.00         1
      Vendor       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


🎯 Sample predictions with confidence:
  Document 1: Actual=Partnership, Predicted=Partnership, Confidence=0.330
  Document 2: Actual=Vendor, Predicted=Vendor, Confidence=0.315
  Document 3: Actual=Employment, Predicted=Employment, Confidence=0.307
  Document 4: Actual=NDA, Predicted=NDA, Confidence=0.300
  Document 5: Actual=Service, Predicted=Service, Confidence=0.288



In [None]:
# Step 8: Save the trained model
def save_model(tfidf, classifier, filename='models/TF_IDF_Logitic_Regression.pkl'):
    """Save the trained model"""
    model_data = {
        'tfidf': tfidf,
        'classifier': classifier,
        'classes': classifier.classes_
    }

    with open(filename, 'wb') as f:
        pickle.dump(model_data, f)
    print(f"💾 Model saved as {filename}")

save_model(tfidf, classifier)

print("\n" + "="*60)

💾 Model saved as contract_classifier.pkl



In [20]:
# Step 9: Test the model on new examples
def batch_predict(texts, tfidf, classifier):
    """Predict multiple documents at once"""
    predictions = []
    probabilities = []

    for text in texts:
        cleaned_text = advanced_preprocess_text(text)
        text_vector = tfidf.transform([cleaned_text])

        pred = classifier.predict(text_vector)[0]
        prob = classifier.predict_proba(text_vector)[0]

        predictions.append(pred)
        probabilities.append(dict(zip(classifier.classes_, prob)))

    return predictions, probabilities

# Test with sample contracts
test_contracts = [
    "This employment agreement is between ABC Corp and John Doe for the position of software engineer with a salary of $100,000 per year.",
    "Confidential information disclosed under this Non-Disclosure Agreement must be kept secret by all parties for a period of 5 years.",
    "The partnership between Smith and Jones will share all profits equally and both parties agree to contribute capital to the business.",
    "Vendor shall deliver goods pursuant to the following terms and conditions outlined in this vendor agreement.",
    "Service Level Agreement: The provider guarantees 99.9% uptime and will provide 24/7 support for all critical issues."
]

print("🧪 Testing model on new examples:")
batch_preds, batch_probs = batch_predict(test_contracts, tfidf, classifier)

for i, (text, pred, prob) in enumerate(zip(test_contracts, batch_preds, batch_probs)):
    print(f"\n📄 Example {i+1}:")
    print(f"   Text: {text[:60]}...")
    print(f"   Prediction: {pred}")
    print(f"   Confidence: {prob[pred]:.3f}")
    # Show top 3 predictions
    sorted_probs = sorted(prob.items(), key=lambda x: x[1], reverse=True)[:3]
    print(f"   Top predictions: {[(p, f'{c:.3f}') for p, c in sorted_probs]}")

print("\n" + "="*60)

🧪 Testing model on new examples:

📄 Example 1:
   Text: This employment agreement is between ABC Corp and John Doe f...
   Prediction: Employment
   Confidence: 0.225
   Top predictions: [(np.str_('Employment'), '0.225'), (np.str_('Partnership'), '0.212'), (np.str_('NDA'), '0.192')]

📄 Example 2:
   Text: Confidential information disclosed under this Non-Disclosure...
   Prediction: NDA
   Confidence: 0.230
   Top predictions: [(np.str_('NDA'), '0.230'), (np.str_('Partnership'), '0.202'), (np.str_('Employment'), '0.200')]

📄 Example 3:
   Text: The partnership between Smith and Jones will share all profi...
   Prediction: Partnership
   Confidence: 0.247
   Top predictions: [(np.str_('Partnership'), '0.247'), (np.str_('NDA'), '0.194'), (np.str_('Vendor'), '0.192')]

📄 Example 4:
   Text: Vendor shall deliver goods pursuant to the following terms a...
   Prediction: Vendor
   Confidence: 0.209
   Top predictions: [(np.str_('Vendor'), '0.209'), (np.str_('Employment'), '0.207'), (np.str_(