In [None]:
# Optional environment check
from psutil import *
print("Number of CPU: ", cpu_count())
!cat /proc/cpuinfo

In [None]:
# Wrap output for reading more conveniently
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that makes <pre> elements wrap long lines.

  Args:
    info: Execution details that IPython may pass to ``pre_run_cell``
      callbacks. It is accepted (and ignored) so the hook works whether or
      not the running IPython version supplies the argument.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Run the hook before each cell executes.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Install dependencies not available in Colab environment and download SpaCy large dataset
!pip install python-dotenv langchain_mistralai

import spacy.cli
spacy.cli.download("en_core_web_lg")

In [None]:
import os
import warnings
from google.colab import drive
from dotenv import load_dotenv

# Mount Google Drive so the .env file stored there becomes readable.
drive.mount('/content/drive')

# load_dotenv reports whether any variable was loaded; warn right away instead
# of letting a missing or empty file go unnoticed until the API call fails.
if not load_dotenv('/content/drive/MyDrive/Colab Notebooks/.env'):
    warnings.warn(
        "No variables were loaded from "
        "'/content/drive/MyDrive/Colab Notebooks/.env'; check that the file exists and is not empty."
    )

mistral_key = os.getenv('MISTRAL_KEY')
if not mistral_key:
    warnings.warn("MISTRAL_KEY is not set; the Mistral client will not receive an API key from this cell.")

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModel
from collections import Counter
from string import punctuation
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
from scipy.stats import percentileofscore
from sentence_transformers import SentenceTransformer

class Grader:
    """Scores a free-text answer against a model answer.

    Two signals are combined into a weighted final score:

    * similarity: cosine similarity between mean-pooled transformer
      embeddings of the student answer and the model answer;
    * coherence: mean cosine similarity between consecutive sentences of the
      student answer.
    """

    # Weights used when no rubric is given or the rubric's weights are unusable.
    DEFAULT_WEIGHTS = {
        'similarity': 0.7,
        'coherence': 0.3,
    }

    def __init__(self, model_name = "sentence-transformers/all-mpnet-base-v2"):
        """Load the tokenizer, the encoder and the spaCy pipeline.

        Args:
            model_name: Identifier passed to ``AutoTokenizer.from_pretrained``
                and ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.nlp = spacy.load('en_core_web_lg')

    def _get_embedding(self, text: str) -> np.ndarray:
        """Embed ``text`` as the attention-masked mean of the last hidden state.

        The tokenizer truncates input at ``max_length=512``.

        Args:
            text: Text to embed.

        Returns:
            A 2-D array holding one pooled vector per tokenized sequence.
        """
        inputs = self.tokenizer(text, padding=True, truncation=True,
                                return_tensors="pt", max_length=512)

        with torch.no_grad():
            outputs = self.model(**inputs)

        token_embeddings = outputs.last_hidden_state
        # Expand the mask over the hidden dimension so masked-out positions add nothing to the sum.
        mask = inputs['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp the token count so an all-zero mask cannot cause a division by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)

        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the model lives on another device.
        return (summed / counts).cpu().numpy()

    def _string_similarity(self, str1: str, str2: str) -> float:
        """Jaccard overlap of the sets of lower-cased characters of two strings.

        Returns:
            ``|A ∩ B| / |A ∪ B|`` over the character sets, or 0.0 when either
            string is empty.
        """
        set1 = set(str1.lower())
        set2 = set(str2.lower())

        if not set1 or not set2:
            return 0.0

        return len(set1 & set2) / len(set1 | set2)

    def _calculate_coherence(self, answer: str) -> float:
        """Mean cosine similarity between consecutive sentences of ``answer``.

        Sentences come from the spaCy pipeline. With fewer than two sentences
        there is nothing to compare and 1.0 is returned.
        """
        doc = self.nlp(answer)
        sentences = list(doc.sents)

        if len(sentences) < 2:
            return 1.0

        # Embed every sentence exactly once; interior sentences take part in
        # two comparisons and would otherwise be encoded twice.
        embeddings = [self._get_embedding(sentence.text) for sentence in sentences]
        coherence_scores = [
            cosine_similarity(current, following)[0][0]
            for current, following in zip(embeddings, embeddings[1:])
        ]

        return float(np.mean(coherence_scores))

    def _resolve_weights(self, rubric) -> Dict[str, float]:
        """Return the similarity/coherence weights, normalised to sum to 1.

        Only the keys in ``DEFAULT_WEIGHTS`` are read from ``rubric['weights']``;
        a missing key counts as 0 and unknown keys are ignored so they cannot
        dilute the result. If the usable weights do not add up to a positive
        number, the defaults are used instead.
        """
        supplied = (rubric or {}).get('weights') or self.DEFAULT_WEIGHTS
        weights = {key: float(supplied.get(key) or 0.0) for key in self.DEFAULT_WEIGHTS}

        total = sum(weights.values())
        if total <= 0:
            weights = dict(self.DEFAULT_WEIGHTS)
            total = sum(weights.values())

        return {key: value / total for key, value in weights.items()}

    def assess_answer(self,
                 student_answer: str,
                 model_answer: str,
                 rubric: Optional[Dict] = None):
        """Score ``student_answer`` against ``model_answer``.

        Args:
            student_answer: The answer to grade.
            model_answer: The reference answer.
            rubric: Optional dict; its ``'weights'`` entry may hold
                ``'similarity'`` and ``'coherence'`` weights. Weights are
                normalised before use.

        Returns:
            Dict with ``similarity_score``, ``coherence_score`` and
            ``final_score`` as plain Python floats. A blank student answer
            scores 0.0 on all three, since there is no content to compare and
            the single-sentence coherence shortcut would otherwise award 1.0.
        """
        if not student_answer or not student_answer.strip():
            return {
                "similarity_score": 0.0,
                "coherence_score": 0.0,
                "final_score": 0.0
            }

        student_embedding = self._get_embedding(student_answer)
        model_embedding = self._get_embedding(model_answer)

        # Convert to built-in floats so the result does not carry numpy scalar types.
        similarity_score = float(cosine_similarity(student_embedding, model_embedding)[0][0])
        coherence_score = float(self._calculate_coherence(student_answer))

        weights = self._resolve_weights(rubric)

        final_score = (weights['similarity'] * similarity_score +
                       weights['coherence'] * coherence_score)

        return {
            "similarity_score": similarity_score,
            "coherence_score": coherence_score,
            "final_score": final_score
        }

In [None]:
import json
from langchain_mistralai import ChatMistralAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.schema.output_parser import StrOutputParser
from tenacity import retry, stop_after_attempt, wait_exponential

class LLMEnhancedAssessor:
    """Combines a base assessor's numeric scores with tutor-style feedback from a Mistral chat model."""

    # Rubric handed to the base assessor when generate_feedback is called without one.
    DEFAULT_RUBRIC = {
        'weights': {
            'similarity': 0.7,
            'coherence': 0.3,
        }
    }

    def __init__(self, base_assessor, api_key, model = "mistral-large-latest"):
        """Store the base assessor and create the chat model client.

        Args:
            base_assessor: Object exposing ``assess_answer(student_answer,
                benchmark_answer, rubric)`` that returns a dict with
                ``similarity_score`` and ``coherence_score`` entries.
            api_key: Key passed to ``ChatMistralAI`` as ``mistral_api_key``.
            model: Model name passed to ``ChatMistralAI``.
        """
        self.base_assessor = base_assessor
        self.model = model
        self.llm = ChatMistralAI(
            model_name=model,
            mistral_api_key=api_key,
            temperature=0.2
        )

    @staticmethod
    def _build_analysis_prompt():
        """Build the system/user chat prompt used to request the tutor analysis."""
        return ChatPromptTemplate.from_messages([
            ("system", "You are an AI tutor who interprets assessment scores. Use a conversational second-person tone without any formatting (no bold, italic, or other text formatting). Analyze the student's answer compared to the benchmark, focusing on conceptual understanding. Identify specific strengths, point out areas needing improvement, and offer practical suggestions without referencing specific study materials. Use no section headers, greetings, or introductory paragraphs. You can use bullet points when necessary. Keep your responses concise and conversational. Do not mention the similarity or coherence scores. Do not compare the student answer to benchmark directly. Do not mention the benchmark answer."),
            ("user", """
            I've received scores on my assessment and would like your help understanding my performance and how to improve.

            The benchmark answer was: {benchmark_answer}
            My answer was: {student_answer}

            Answer scores:
            - Similarity to benchmark answer: {similarity_score}
            - Coherence: {coherence_score}

            Can you analyze my work, highlight my strengths and areas for improvement, and suggest what I should focus on to improve?
            """)
        ])

    # Retry policy: stop after 3 attempts, exponential wait configured with min=4 and max=10.
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _generate_llm_analysis(self, student_answer, benchmark_answer, base_assessment):
        """Ask the chat model for feedback on the student's answer.

        Args:
            student_answer: The answer being graded.
            benchmark_answer: The reference answer.
            base_assessment: Dict providing ``similarity_score`` and
                ``coherence_score`` for the prompt.

        Returns:
            The message object produced by the prompt-to-model chain.
        """
        # The chain is assembled per call so it always uses the current self.llm.
        chain = self._build_analysis_prompt() | self.llm

        return await chain.ainvoke({
            "benchmark_answer": benchmark_answer,
            "student_answer": student_answer,
            "similarity_score": base_assessment['similarity_score'],
            "coherence_score": base_assessment['coherence_score']
        })

    async def generate_feedback(self, student_answer, benchmark_answer, rubric = None):
        """Score the answer with the base assessor and attach LLM-written feedback.

        Args:
            student_answer: The answer being graded.
            benchmark_answer: The reference answer.
            rubric: Optional rubric forwarded to the base assessor. When
                omitted, ``DEFAULT_RUBRIC`` (similarity 0.7, coherence 0.3)
                is used.

        Returns:
            Dict with ``score`` (the base assessor's result) and
            ``tutor_feedback`` (the ``content`` of the model's reply).
        """
        if rubric is None:
            rubric = self.DEFAULT_RUBRIC

        base_result = self.base_assessor.assess_answer(
            student_answer, benchmark_answer, rubric
        )

        llm_analysis = await self._generate_llm_analysis(
            student_answer,
            benchmark_answer,
            base_result,
        )

        return {
            "score": base_result,
            "tutor_feedback": llm_analysis.content,
        }

In [None]:
base_assessor = Grader()

rubric = {
        'weights': {
          'similarity': 0.7,
          'coherence': 0.3,
      }
    }

enhanced_assessor = LLMEnhancedAssessor(
        base_assessor=base_assessor,
        api_key=mistral_key
    )

benchmark_answer =
"""
"""

student_answer =
"""
"""

result = await enhanced_assessor.generate_feedback(
        student_answer=student_answer,
        benchmark_answer=benchmark_answer,
    )

print(f"{result['score']['similarity_score']:.2f} {result['score']['coherence_score']:.2f} {result['score']['final_score']:.2f}\n")
print(result['tutor_feedback'])