In [21]:
import pandas as pd
import numpy as np
import pickle
import re

***Loading All Models and Processors***

In [22]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file, so only artifacts produced by a trusted training run should be
    # placed at these locations.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


print("=" * 50)
print("LOADING MODELS")
print("=" * 50)

# Loading models
classification_model = _load_pickle(r'D:\AutoJudge_v2\models\classification_model.pkl')
print("✓ Classification model loaded")

regression_model = _load_pickle(r'D:\AutoJudge_v2\models\regression_model.pkl')
print("✓ Regression model loaded")

# Loading preprocessing tools
tfidf_vectorizer = _load_pickle(r'D:\AutoJudge_v2\models\tfidf_vectorizer.pkl')
print("✓ TF-IDF vectorizer loaded")

feature_scaler = _load_pickle(r'D:\AutoJudge_v2\models\feature_scaler.pkl')
print("✓ Feature scaler loaded")

label_encoder = _load_pickle(r'D:\AutoJudge_v2\models\label_encoder.pkl')
print("✓ Label encoder loaded")

feature_info = _load_pickle(r'D:\AutoJudge_v2\models\feature_names.pkl')
print("✓ Feature names loaded")

print("\nAll models and processors loaded successfully!")

LOADING MODELS
✓ Classification model loaded
✓ Regression model loaded
✓ TF-IDF vectorizer loaded
✓ Feature scaler loaded
✓ Label encoder loaded
✓ Feature names loaded

All models and processors loaded successfully!


***Feature Extraction Functions***

In [23]:
def clean_text(text):
    """Normalise whitespace in a single text field.

    Parameters
    ----------
    text : object
        Usually a ``str``. Missing scalars (``None``, ``NaN``, ``pd.NA``)
        are treated as empty text. Any other non-string value (for example a
        numeric title parsed from a CSV) is converted with ``str()`` instead
        of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The text with every run of whitespace collapsed to one space and
        leading/trailing whitespace removed; ``''`` for missing input.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        # pd.isna returns an array for list-likes, which cannot be used in a
        # boolean context, so only ask it about scalars.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_manual_features(text):
    """Build the 15-element vector of hand-crafted features for ``text``.

    Element order (must stay fixed, the result is fed to a fitted scaler):
    char count, word count, sentence-terminator runs, mean word length,
    uppercase chars, digit chars, math symbols, ``$...$`` spans, brackets,
    ``$`` characters, then substring hit counts for the graph, DP, sorting,
    data-structure and complexity keyword groups.

    Returns ``np.zeros(15)`` when ``text`` is missing or the empty string.
    """
    if pd.isna(text) or text == '':
        return np.zeros(15)  # one zero per manual feature

    tokens = text.split()
    lowered = text.lower()

    def pattern_hits(pattern):
        # Number of non-overlapping regex matches in the original-case text.
        return len(re.findall(pattern, text))

    def keyword_hits(vocabulary):
        # Plain substring counts against the lower-cased text (not whole
        # words: e.g. 'set' is also counted inside 'reset').
        return sum(lowered.count(term) for term in vocabulary)

    # NOTE(review): 'O(n)' and 'O(log n)' contain an uppercase 'O' but are
    # searched in lower-cased text, so they can never match. Left untouched on
    # purpose: the scaler/models were presumably fitted on features computed
    # the same way -- confirm against the training notebook before changing.
    keyword_groups = (
        ['graph', 'node', 'edge', 'tree', 'dfs', 'bfs', 'dijkstra', 'spanning'],
        ['dynamic', 'dp', 'memoization', 'optimal', 'subproblem', 'recursion'],
        ['sort', 'sorted', 'order', 'arrange', 'ascending', 'descending'],
        ['array', 'list', 'stack', 'queue', 'heap', 'hash', 'map', 'set'],
        ['O(n)', 'O(log n)', 'complexity', 'efficient', 'optimize', 'time limit'],
    )

    basic_part = [
        len(text),
        len(tokens),
        pattern_hits(r'[.!?]+'),
        np.mean([len(token) for token in tokens]) if tokens else 0,
        sum(ch.isupper() for ch in text),
        sum(ch.isdigit() for ch in text),
    ]
    math_part = [
        pattern_hits(r'[+\-*/=<>≤≥≠]'),
        pattern_hits(r'\$.*?\$'),
        pattern_hits(r'[\(\)\[\]\{\}]'),
        text.count('$'),
    ]
    keyword_part = [keyword_hits(group) for group in keyword_groups]

    return np.array(basic_part + math_part + keyword_part)

def preprocess_problem(title, description, input_desc, output_desc):
    """Turn the four text fields of one problem into a model-ready row.

    Each field is passed through ``clean_text`` and the results are joined
    with single spaces. The joined text feeds both the manual feature
    extractor and the loaded TF-IDF vectorizer; the manual features are
    scaled with the loaded ``feature_scaler``.

    Returns a 2-D array with a single row: scaled manual features first,
    TF-IDF features after.
    """
    cleaned_fields = [
        clean_text(field)
        for field in (title, description, input_desc, output_desc)
    ]
    combined_text = " ".join(cleaned_fields)

    # Hand-crafted features, kept as a single row so the scaler accepts them.
    manual_row = extract_manual_features(combined_text).reshape(1, -1)

    # Dense TF-IDF vector for the same text.
    tfidf_row = tfidf_vectorizer.transform([combined_text]).toarray()[0]

    scaled_manual = feature_scaler.transform(manual_row)[0]

    return np.concatenate([scaled_manual, tfidf_row]).reshape(1, -1)

def predict_difficulty(title, description, input_desc, output_desc):
    """Predict the difficulty class and numeric score for one problem.

    Parameters
    ----------
    title, description, input_desc, output_desc :
        The four text fields of the problem; forwarded unchanged to
        ``preprocess_problem``.

    Returns
    -------
    dict
        ``'predicted_class'``: label decoded with ``label_encoder``.
        ``'predicted_score'``: regression output rounded to 2 decimals.
        ``'class_probabilities'``: ``{label: probability}`` when the
        classifier exposes ``predict_proba``, otherwise ``None``.
    """
    features = preprocess_problem(title, description, input_desc, output_desc)

    # Class prediction, decoded back to its original label.
    class_encoded = classification_model.predict(features)[0]
    class_label = label_encoder.inverse_transform([class_encoded])[0]

    # Class probabilities (only if the classifier supports them).
    if hasattr(classification_model, 'predict_proba'):
        class_probabilities = classification_model.predict_proba(features)[0]
        # predict_proba columns are ordered like the classifier's classes_,
        # which is not guaranteed to be 0..n-1 (e.g. a class missing from the
        # training split). Decode the real encoded ids rather than positions;
        # fall back to positional ids if classes_ is not exposed.
        encoded_classes = getattr(
            classification_model, 'classes_',
            list(range(len(class_probabilities)))
        )
        decoded_labels = label_encoder.inverse_transform(encoded_classes)
        class_probs_dict = dict(zip(decoded_labels, class_probabilities))
    else:
        class_probs_dict = None

    # Numeric difficulty score from the regression model.
    predicted_score = regression_model.predict(features)[0]

    return {
        'predicted_class': class_label,
        'predicted_score': round(predicted_score, 2),
        'class_probabilities': class_probs_dict
    }

***Testing on Sample Problems***

In [24]:
# Sample problem 1: Easy problem
# The keys mirror predict_difficulty's parameter names so the dict can be
# unpacked straight into the call below.
sample1 = dict(
    title='Sum of Two Numbers',
    description='Given two integers a and b, compute their sum.',
    input_desc='Two integers a and b separated by space.',
    output_desc='Print a single integer, the sum of a and b.',
)

print("\nSample 1: Easy Problem")
print(f"Title: {sample1['title']}")

result1 = predict_difficulty(**sample1)

print(f"Predicted Class: {result1['predicted_class']}")
print(f"Predicted Score: {result1['predicted_score']}")

# class_probabilities is None when the classifier has no predict_proba.
if result1['class_probabilities']:
    print("Class Probabilities:")
    for cls, prob in result1['class_probabilities'].items():
        print(f"  {cls}: {prob:.3f}")


Sample 1: Easy Problem
Title: Sum of Two Numbers
Predicted Class: medium
Predicted Score: 4.52
Class Probabilities:
  easy: 0.160
  hard: 0.296
  medium: 0.544




In [25]:
# Sample problem 2: Medium problem
# The keys mirror predict_difficulty's parameter names so the dict can be
# unpacked straight into the call below.
sample2 = dict(
    title='Binary Search',
    description='Given a sorted array of n integers, find if a target value x exists. Use binary search algorithm.',
    input_desc='First line contains n and x. Second line contains n sorted integers.',
    output_desc='Print YES if x exists, otherwise NO.',
)

print("\n" + "-" * 50)
print("\nSample 2: Medium Problem")
print(f"Title: {sample2['title']}")

result2 = predict_difficulty(**sample2)

print(f"Predicted Class: {result2['predicted_class']}")
print(f"Predicted Score: {result2['predicted_score']}")

# class_probabilities is None when the classifier has no predict_proba.
if result2['class_probabilities']:
    print("Class Probabilities:")
    for cls, prob in result2['class_probabilities'].items():
        print(f"  {cls}: {prob:.3f}")


--------------------------------------------------

Sample 2: Medium Problem
Title: Binary Search
Predicted Class: hard
Predicted Score: 5.3
Class Probabilities:
  easy: 0.110
  hard: 0.466
  medium: 0.424




In [26]:
# Sample problem 3: Hard problem
# The keys mirror predict_difficulty's parameter names so the dict can be
# unpacked straight into the call below.
sample3 = dict(
    title='Maximum Flow in Graph',
    description='Given a directed graph with edge capacities, find the maximum flow from source to sink using Ford-Fulkerson algorithm. The graph has n nodes and m edges. You need to implement an efficient solution with time complexity O(VE^2).',
    input_desc='First line: n, m, source, sink. Next m lines: u, v, capacity for each edge.',
    output_desc='Print a single integer - the maximum flow value.',
)

print("\n" + "-" * 50)
print("\nSample 3: Hard Problem")
print(f"Title: {sample3['title']}")

result3 = predict_difficulty(**sample3)

print(f"Predicted Class: {result3['predicted_class']}")
print(f"Predicted Score: {result3['predicted_score']}")

# class_probabilities is None when the classifier has no predict_proba.
if result3['class_probabilities']:
    print("Class Probabilities:")
    for cls, prob in result3['class_probabilities'].items():
        print(f"  {cls}: {prob:.3f}")


--------------------------------------------------

Sample 3: Hard Problem
Title: Maximum Flow in Graph
Predicted Class: hard
Predicted Score: 6.67
Class Probabilities:
  easy: 0.037
  hard: 0.844
  medium: 0.120




***Batch Prediction Function***

In [27]:
def batch_predict(problems_df):
    """Run ``predict_difficulty`` on every row of ``problems_df``.

    Parameters
    ----------
    problems_df : pandas.DataFrame
        Must contain the columns ``title``, ``description``,
        ``input_description`` and ``output_description``.

    Returns
    -------
    pandas.DataFrame
        The input rows (index reset to 0..n-1) with the columns
        ``predicted_class``, ``predicted_score`` and ``class_probabilities``
        appended. These three columns are present even when ``problems_df``
        has no rows, so callers always get the same schema.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    prediction_columns = ['predicted_class', 'predicted_score', 'class_probabilities']

    # Iterate the four columns in lockstep; avoids building a Series per row.
    results = [
        predict_difficulty(title, description, input_desc, output_desc)
        for title, description, input_desc, output_desc in zip(
            problems_df['title'],
            problems_df['description'],
            problems_df['input_description'],
            problems_df['output_description'],
        )
    ]

    # Passing the column names explicitly keeps them on an empty result too.
    results_df = pd.DataFrame(results, columns=prediction_columns)

    # Reset the input index so rows line up positionally with results_df.
    return pd.concat([problems_df.reset_index(drop=True), results_df], axis=1)

***Interactive Prediction Function***

In [28]:
def interactive_prediction():
    """Prompt for one problem on stdin, predict it, and print the result.

    The title is read as a single line. The description, input description
    and output description are each read line by line until an empty line is
    entered; the collected lines are joined with single spaces. The predicted
    class, score and (when available) a text bar chart of the class
    probabilities are printed. Returns ``None``.
    """
    def read_block(heading):
        # Print the prompt, then consume lines until the first empty one.
        print(heading)
        return " ".join(iter(input, ""))

    rule = "=" * 50

    print("\n" + rule)
    print("INTERACTIVE PREDICTION MODE")
    print(rule)

    print("\nEnter problem details:")
    title = input("Title: ")
    description = read_block("\nDescription (press Enter twice when done):")
    input_desc = read_block("\nInput Description (press Enter twice when done):")
    output_desc = read_block("\nOutput Description (press Enter twice when done):")

    # Predict
    result = predict_difficulty(title, description, input_desc, output_desc)

    # Display results
    print("\n" + rule)
    print("PREDICTION RESULTS")
    print(rule)
    print(f"\nPredicted Class: {result['predicted_class'].upper()}")
    print(f"Predicted Score: {result['predicted_score']}/10.0")

    probabilities = result['class_probabilities']
    if probabilities:
        print("\nClass Probabilities:")
        for cls, prob in probabilities.items():
            # One block character per 2% of probability (50 blocks == 100%).
            bar = "█" * int(prob * 50)
            print(f"  {cls:8s}: {bar} {prob:.2%}")

***Model Performance Summary***

In [29]:
# results
# Load the stored evaluation results for both models.
with open(r'D:\AutoJudge_v2\models\classification_results.pkl', 'rb') as f:
    class_results = pickle.load(f)

with open(r'D:\AutoJudge_v2\models\regression_results.pkl', 'rb') as f:
    reg_results = pickle.load(f)

print("\nClassification Model:")
print(f"  Model: {class_results['model_name']}")
print(f"  Test Accuracy: {class_results['test_accuracy']:.4f}")

print("\nRegression Model:")
print(f"  Model: {reg_results['model_name']}")
# Each regression metric is printed with four decimals, in this fixed order.
for metric_label, metric_key in (
    ("Test MAE", 'test_mae'),
    ("Test RMSE", 'test_rmse'),
    ("Test R² Score", 'test_r2'),
):
    print(f"  {metric_label}: {reg_results[metric_key]:.4f}")


Classification Model:
  Model: Gradient Boosting
  Test Accuracy: 0.6630

Regression Model:
  Model: Random Forest
  Test MAE: 1.5578
  Test RMSE: 1.9447
  Test R² Score: 0.4840


***Saving Prediction Pipeline***

In [30]:
# Bundle the loaded models and preprocessors with the prediction entry point.
# NOTE(review): pickle stores a function by its qualified name, not its code.
# Loading this file therefore requires a `predict_difficulty` to be defined
# under the same module name in the loading process, together with the
# helpers and module-level objects it uses -- the file is not self-contained.
prediction_pipeline = dict(
    classification_model=classification_model,
    regression_model=regression_model,
    tfidf_vectorizer=tfidf_vectorizer,
    feature_scaler=feature_scaler,
    label_encoder=label_encoder,
    predict_function=predict_difficulty,
)

with open(r'D:\AutoJudge_v2\models\prediction_pipeline.pkl', 'wb') as f:
    pickle.dump(prediction_pipeline, f)

print("\n✓ Complete prediction pipeline saved to 'prediction_pipeline.pkl'")

print("\n" + "=" * 50)
print("READY FOR PREDICTIONS!")
print("=" * 50)


✓ Complete prediction pipeline saved to 'prediction_pipeline.pkl'

READY FOR PREDICTIONS!
