In [5]:
import joblib
import numpy as np
import pandas as pd

print("🔍 TESTING MODEL WITH CORRECT FEATURE ORDER")
print("=" * 45)

# One sample, listed in the column order the model was trained with.
# The position of each pair is significant: it becomes the DataFrame's
# column order and therefore the order the scaler and model see.
_feature_values = [
    ('Semantic Similarity', 0.775255),
    ('Length Ratio', 0.147485),
    ('question_student_similarity', 0.761586),
    ('POS_noun_ratio_diff', -0.057548),
    ('POS_similarity', 0.893383),          # 5th in training
    ('flesch_kincaid_ratio', 0.605556),    # 6th in training
    ('POS_pos_diversity_diff', 0.267306),  # 7th in training
    ('POS_verb_ratio_diff', -0.004448),    # 8th in training
    ('POS_adv_ratio_diff', 0.107451),      # 9th in training
    ('POS_adj_ratio_diff', -0.09633),      # 10th in training
]
data = {name: [value] for name, value in _feature_values}

# A single-row frame whose columns follow the insertion order of `data`.
test_df = pd.DataFrame(data)
print("✅ Test data created with correct feature order")
print(f"Shape: {test_df.shape}")

try:
    # Both artifacts are read from the current working directory.
    scaler = joblib.load('scaler.pkl')
    model = joblib.load('optimized_logistic_at_model.pkl')
    print("✅ Model and scaler loaded successfully")

    # Apply the fitted scaler to the sample before predicting.
    scaled_data = scaler.transform(test_df)
    print("✅ Data scaled successfully")

    # Predict on the scaled sample and report the first (only) result.
    prediction = model.predict(scaled_data)
    print("\n🎯 PREDICTION RESULT:")
    print(f"Predicted Score: {prediction[0]}")

    print("\n✅ SUCCESS! Model working correctly!")

except Exception as e:
    # Any failure while loading, scaling or predicting is reported, not raised.
    print(f"❌ Error: {e}")


🔍 TESTING MODEL WITH CORRECT FEATURE ORDER
✅ Test data created with correct feature order
Shape: (1, 10)
✅ Model and scaler loaded successfully
✅ Data scaled successfully

🎯 PREDICTION RESULT:
Predicted Score: 2

✅ SUCCESS! Model working correctly!


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
import glob
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from xml.sax.saxutils import escape

import pandas as pd
import PyPDF2
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

class PDFTemplateProcessor:
    """Extract text from every PDF in a directory and re-emit it as templates.

    For each ``*.pdf`` directly inside ``input_directory`` the processor
    extracts the text, splits it into titled sections with a simple header
    heuristic, and writes a formatted PDF and a plain-text file. One Excel
    workbook summarising all documents is written at the end.
    """

    # Lines rendered per section in the formatted PDF before truncating.
    MAX_PDF_LINES_PER_SECTION = 20
    # Sections per document, and lines per section, exported to the Excel summary.
    MAX_EXCEL_SECTIONS = 10
    MAX_EXCEL_LINES_PER_SECTION = 5
    # Header keywords, matched only when not embedded in a longer word, so
    # that "department" or "particular" do not count as a "part" header.
    # Digits and punctuation may follow directly ("Question1", "Part-A").
    _HEADER_KEYWORD_RE = re.compile(
        r'(?<![a-z])(?:chapter|(?:sub)?section|part|question)s?(?![a-z])',
        re.IGNORECASE,
    )

    def __init__(self, input_directory: str, output_directory: str = "formatted_outputs"):
        """Remember the input/output directories and create the output one.

        Args:
            input_directory: Directory that is searched for ``*.pdf`` files.
            output_directory: Directory that receives all generated files.
                It is created (including missing parents) if it does not exist.
        """
        self.input_directory = Path(input_directory)
        self.output_directory = Path(output_directory)
        # parents=True so a nested output path does not fail on a fresh checkout.
        self.output_directory.mkdir(parents=True, exist_ok=True)

    def extract_text_from_pdf(self, pdf_path: Path) -> str:
        """Extract all text from a PDF file.

        Each page is preceded by a ``--- PAGE n ---`` marker line (1-based).

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            The concatenated page texts. If anything goes wrong, a string of
            the form ``"Error extracting text: <message>"`` is returned
            instead of raising, so callers treat it as the document text.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                chunks = []

                for page_num, page in enumerate(pdf_reader.pages, start=1):
                    # Guard against a page yielding no text object at all.
                    page_text = page.extract_text() or ""
                    chunks.append(f"\n--- PAGE {page_num} ---\n")
                    chunks.append(page_text + "\n")

                return "".join(chunks)
        except Exception as e:
            return f"Error extracting text: {str(e)}"

    def _is_header_line(self, line: str) -> bool:
        """Return True if a stripped, non-empty line looks like a section title."""
        if line.isupper() and len(line) < 100:
            return True
        if line.startswith('---') and line.endswith('---'):
            return True
        return self._HEADER_KEYWORD_RE.search(line) is not None

    def parse_extracted_text(self, text: str, filename: str) -> Dict[str, Any]:
        """Parse extracted text into a structured, sectioned format.

        A line starts a new section when it is all upper-case and shorter
        than 100 characters, is wrapped in ``---``, or contains one of the
        header keywords (chapter, section, part, question) as a whole word.
        Text before the first header goes into a section titled
        ``'Main Content'``. Sections that end up without content lines are
        dropped.

        Args:
            text: Raw text, as returned by ``extract_text_from_pdf``.
            filename: Name stored under ``'document_name'``.

        Returns:
            Dict with keys ``document_name``, ``extraction_date``
            (``YYYY-MM-DD HH:MM:SS`` of the call), ``total_lines`` (count of
            non-blank lines), ``sections`` (list of ``{'title', 'content'}``
            dicts, content being a list of stripped lines) and ``full_text``.
        """
        lines = text.split('\n')

        parsed_data = {
            'document_name': filename,
            'extraction_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'total_lines': len([line for line in lines if line.strip()]),
            'sections': [],
            'full_text': text
        }

        current_section = {'title': 'Main Content', 'content': []}

        for line in lines:
            line = line.strip()
            if not line:
                continue

            if self._is_header_line(line):
                # Close the running section, but only keep it if it has content.
                if current_section['content']:
                    parsed_data['sections'].append(current_section)
                current_section = {'title': line, 'content': []}
            else:
                current_section['content'].append(line)

        # Flush the trailing section.
        if current_section['content']:
            parsed_data['sections'].append(current_section)

        return parsed_data

    def create_formatted_template(self, parsed_data: Dict[str, Any], output_path: Path):
        """Create a formatted A4 PDF from parsed data.

        The document holds a title, a metadata table, and one block per
        section showing at most ``MAX_PDF_LINES_PER_SECTION`` lines followed
        by a "... (n more lines)" note when the section is longer.

        Args:
            parsed_data: Structure produced by ``parse_extracted_text``.
            output_path: Destination file of the generated PDF.
        """
        doc = SimpleDocTemplate(str(output_path), pagesize=A4)
        styles = getSampleStyleSheet()
        story = []
        limit = self.MAX_PDF_LINES_PER_SECTION

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            alignment=1,  # Center alignment
            textColor=colors.darkblue
        )

        header_style = ParagraphStyle(
            'CustomHeader',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=12,
            textColor=colors.darkgreen
        )

        content_style = ParagraphStyle(
            'CustomContent',
            parent=styles['Normal'],
            fontSize=10,
            spaceAfter=6,
            leftIndent=20
        )

        # Document header
        story.append(Paragraph("EXTRACTED PDF CONTENT", title_style))
        story.append(Spacer(1, 20))

        # Document information table
        doc_info = [
            ['Document Name:', parsed_data['document_name']],
            ['Extraction Date:', parsed_data['extraction_date']],
            ['Total Lines:', str(parsed_data['total_lines'])],
            ['Total Sections:', str(len(parsed_data['sections']))]
        ]

        info_table = Table(doc_info, colWidths=[2*inch, 4*inch])
        info_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
            ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 12),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(info_table)
        story.append(Spacer(1, 30))

        # Content sections. Paragraph interprets its text as XML-like markup,
        # so extracted text is escaped to keep '<', '>' and '&' literal.
        for i, section in enumerate(parsed_data['sections']):
            story.append(Paragraph(f"Section {i+1}: {escape(section['title'])}", header_style))
            story.append(Spacer(1, 10))

            for line in section['content'][:limit]:
                if line.strip():
                    story.append(Paragraph(escape(line), content_style))

            if len(section['content']) > limit:
                story.append(Paragraph(f"... ({len(section['content']) - limit} more lines)", content_style))

            story.append(Spacer(1, 20))

        # Build PDF
        doc.build(story)

    def create_text_template(self, parsed_data: Dict[str, Any], output_path: Path):
        """Create a formatted UTF-8 text file from parsed data.

        Unlike the PDF template, every content line of every section is
        written.

        Args:
            parsed_data: Structure produced by ``parse_extracted_text``.
            output_path: Destination file; overwritten if it exists.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("EXTRACTED PDF CONTENT TEMPLATE\n")
            f.write("=" * 80 + "\n\n")

            f.write(f"Document Name: {parsed_data['document_name']}\n")
            f.write(f"Extraction Date: {parsed_data['extraction_date']}\n")
            f.write(f"Total Lines: {parsed_data['total_lines']}\n")
            f.write(f"Total Sections: {len(parsed_data['sections'])}\n")
            f.write("\n" + "-" * 80 + "\n\n")

            for i, section in enumerate(parsed_data['sections']):
                f.write(f"SECTION {i+1}: {section['title']}\n")
                f.write("-" * 50 + "\n")

                for line in section['content']:
                    if line.strip():
                        f.write(f"  {line}\n")

                f.write("\n" + "=" * 30 + "\n\n")

    def create_excel_template(self, all_parsed_data: List[Dict[str, Any]], output_path: Path):
        """Create an Excel workbook with one row per processed document.

        Each row carries the document metadata plus, for the first
        ``MAX_EXCEL_SECTIONS`` sections, a ``Section_<n>_Title`` column and a
        ``Section_<n>_Content`` column holding the first
        ``MAX_EXCEL_LINES_PER_SECTION`` lines joined by newlines.

        Args:
            all_parsed_data: Structures produced by ``parse_extracted_text``.
            output_path: Destination ``.xlsx`` file.
        """
        excel_data = []

        for parsed_data in all_parsed_data:
            base_row = {
                'Document_Name': parsed_data['document_name'],
                'Extraction_Date': parsed_data['extraction_date'],
                'Total_Lines': parsed_data['total_lines'],
                'Total_Sections': len(parsed_data['sections'])
            }

            # Sections become extra columns on the document's row.
            for i, section in enumerate(parsed_data['sections'][:self.MAX_EXCEL_SECTIONS]):
                base_row[f'Section_{i+1}_Title'] = section['title']
                base_row[f'Section_{i+1}_Content'] = '\n'.join(
                    section['content'][:self.MAX_EXCEL_LINES_PER_SECTION]
                )

            excel_data.append(base_row)

        # Create DataFrame and save
        df = pd.DataFrame(excel_data)
        df.to_excel(output_path, index=False)

    def process_all_pdfs(self):
        """Process all PDFs in the input directory and create templates.

        Writes ``<stem>_formatted.pdf`` and ``<stem>_template.txt`` per input
        file and a single ``all_pdfs_combined_template.xlsx`` into the output
        directory. Prints progress; returns ``None``. If the input directory
        contains no ``*.pdf`` file, a message is printed and nothing is
        written.
        """
        # Sorted so the processing order (and the Excel row order) is stable.
        pdf_files = sorted(self.input_directory.glob("*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {self.input_directory}")
            return

        print(f"Found {len(pdf_files)} PDF files to process...")

        all_parsed_data = []

        for i, pdf_file in enumerate(pdf_files, 1):
            print(f"Processing {i}/{len(pdf_files)}: {pdf_file.name}")

            # Extract text
            extracted_text = self.extract_text_from_pdf(pdf_file)

            # Parse text
            parsed_data = self.parse_extracted_text(extracted_text, pdf_file.name)
            all_parsed_data.append(parsed_data)

            # Create individual templates
            base_name = pdf_file.stem

            # PDF template
            pdf_output = self.output_directory / f"{base_name}_formatted.pdf"
            self.create_formatted_template(parsed_data, pdf_output)

            # Text template
            txt_output = self.output_directory / f"{base_name}_template.txt"
            self.create_text_template(parsed_data, txt_output)

        # Create combined Excel template
        excel_output = self.output_directory / "all_pdfs_combined_template.xlsx"
        self.create_excel_template(all_parsed_data, excel_output)

        print(f"\nProcessing complete! Check {self.output_directory} for results.")
        print(f"Created {len(pdf_files)} PDF templates, {len(pdf_files)} text templates, and 1 Excel summary.")

# Usage example
def main(input_dir: str = r"E:\ai_powered_answer_sheet_evalution_system\backend\samples\AI\New folder",
         output_dir: str = "formatted_outputs"):
    """Run the PDF template pipeline over one directory.

    Args:
        input_dir: Directory searched for ``*.pdf`` files. The default is the
            original author's local sample folder; pass your own path rather
            than editing the function.
        output_dir: Directory that receives the generated files.
    """
    # Initialize processor
    processor = PDFTemplateProcessor(input_dir, output_dir)

    # Process all PDFs
    processor.process_all_pdfs()

# Run the usage example only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Found 2 PDF files to process...
Processing 1/2: Aarya_AI.pdf
Processing 2/2: Aditya_AI.pdf

Processing complete! Check formatted_outputs for results.
Created 2 PDF templates, 2 text templates, and 1 Excel summary.
