In [None]:
!pip install collections

In [None]:
import random
from collections import defaultdict

# Simulated Question Bank (60 questions)
questions = [
    {
        "id": i,
        "topic": f"Topic {topic}",
        "difficulty": difficulty,
        "text": f"{difficulty} question {i//12 + 1} for Topic {topic}"
    }
    for topic in range(1, 6)
    for i, difficulty in enumerate(
        ["Easy"]*4 + ["Medium"]*4 + ["Hard"]*4,
        start=(topic-1)*12
    )
]

class CATSystem:
    def __init__(self):
        self.user_performance = defaultdict(list)
        self.current_test = 1

    def run_test(self):
        if self.current_test == 1:
            test_questions = self._select_initial_questions()
        else:
            test_questions = self._select_adaptive_questions()

        print(f"\n=== Starting Test {self.current_test} ===")
        self._display_test_structure(test_questions)
        results = self._simulate_test_taking(test_questions)
        self._analyze_performance(results)

        self.current_test += 1
        return results

    def _display_test_structure(self, test_questions):
        """Show difficulty distribution per topic"""
        topic_stats = defaultdict(lambda: defaultdict(int))
        for q in test_questions:
            topic_stats[q['topic']][q['difficulty']] += 1

        print("\nTest Structure:")
        for topic in [f"Topic {i}" for i in range(1, 6)]:
            stats = topic_stats[topic]
            print(f"{topic}:")
            print(f"  Easy: {stats.get('Easy', 0)} questions")
            print(f"  Medium: {stats.get('Medium', 0)} questions")
            print(f"  Hard: {stats.get('Hard', 0)} questions")
            print()

    def _select_initial_questions(self):
        """Select 4 easy questions from each topic (20 total)"""
        selected = []
        for topic in range(1, 6):
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == "Easy"][:4]
            selected.extend(topic_questions)
        return selected

    def _select_adaptive_questions(self):
        """Select questions based on previous performance"""
        selected = []
        for topic in range(1, 6):
            topic_perf = self.user_performance[f"Topic {topic}"]

            # Get all questions from previous test in this topic
            prev_questions = [q for q in self.user_performance["Test1"]
                            if q["question"]["topic"] == f"Topic {topic}"]

            # Check if all answers were correct
            if all(res["correct"] for res in prev_questions):
                difficulty = "Medium"
            else:
                difficulty = "Easy"

            # Select 4 questions of appropriate difficulty
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == difficulty][:4]
            selected.extend(topic_questions)
        return selected

    def _simulate_test_taking(self, test_questions):
        """Simulate user answering questions with random performance"""
        results = []
        for question in test_questions:
            # Simulate user response (80% chance correct for demonstration)
            is_correct = random.random() < 0.8  # Adjust this for different outcomes
            time_spent = round(random.uniform(10, 60), 2)  # 10-60 seconds

            result = {
                "question": question,
                "correct": is_correct,
                "time_spent": time_spent,
                "attempts": 1  # Simulating single attempt
            }
            results.append(result)

            # Track performance per topic
            self.user_performance[question["topic"]].append(result)

        return results

    def _analyze_performance(self, results):
        """Analyze and display test results"""
        correct = sum(1 for r in results if r["correct"])
        print(f"\nTest {self.current_test} Results:")
        print(f"Total Correct: {correct}/{len(results)}")

        # Display topic-wise performance
        topic_perf = defaultdict(list)
        for res in results:
            topic_perf[res["question"]["topic"]].append(res["correct"])

        print("\nTopic Performance:")
        for topic, scores in topic_perf.items():
            correct = sum(scores)
            print(f"{topic}: {correct}/{len(scores)} correct")

    def get_upgraded_topics(self):
        """Get list of topics that will get harder questions"""
        upgraded = []
        for topic in range(1, 6):
            prev_results = self.user_performance.get(f"Topic {topic}", [])
            if prev_results and all(res["correct"] for res in prev_results):
                upgraded.append(f"Topic {topic}")
        return upgraded

# Simulation Usage
if __name__ == "__main__":
    cat = CATSystem()

    # Take Test 1
    test1_results = cat.run_test()

    # Show which topics will be upgraded
    upgraded_topics = cat.get_upgraded_topics()
    print("\nUpgraded Topics for Test 2:", upgraded_topics or "None")

    # Take Test 2
    test2_results = cat.run_test()


=== Starting Test 1 ===

Test Structure:
Topic 1:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 2:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 3:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 4:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 5:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions


Test 1 Results:
Total Correct: 17/20

Topic Performance:
Topic 1: 4/4 correct
Topic 2: 3/4 correct
Topic 3: 3/4 correct
Topic 4: 4/4 correct
Topic 5: 3/4 correct

Upgraded Topics for Test 2: ['Topic 1', 'Topic 4']

=== Starting Test 2 ===

Test Structure:
Topic 1:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 2:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 3:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 4:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 5:
  Easy: 0 questions
  Medium: 4 questions
  

In [None]:
import random
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Simulated Question Bank (60 questions)
questions = [
    {
        "id": i,
        "topic": f"Topic {topic}",
        "difficulty": difficulty,
        "text": f"{difficulty} question {i//12 + 1} for Topic {topic}"
    }
    for topic in range(1, 6)
    for i, difficulty in enumerate(
        ["Easy"]*4 + ["Medium"]*4 + ["Hard"]*4,
        start=(topic-1)*12
    )
]

class CATSystem:
    def __init__(self):
        self.user_performance = defaultdict(list)
        self.current_test = 1
        self.model = DecisionTreeClassifier()  # Simple ML model

    def run_test(self):
        if self.current_test == 1:
            test_questions = self._select_initial_questions()
        else:
            test_questions = self._select_adaptive_questions()

        print(f"\n=== Starting Test {self.current_test} ===")
        self._display_test_structure(test_questions)
        results = self._simulate_test_taking(test_questions)
        self._analyze_performance(results)

        # Train ML model after Test 1
        if self.current_test == 1:
            self._train_model()

        self.current_test += 1
        return results

    def _display_test_structure(self, test_questions):
        """Show difficulty distribution per topic"""
        topic_stats = defaultdict(lambda: defaultdict(int))
        for q in test_questions:
            topic_stats[q['topic']][q['difficulty']] += 1

        print("\nTest Structure:")
        for topic in [f"Topic {i}" for i in range(1, 6)]:
            stats = topic_stats[topic]
            print(f"{topic}:")
            print(f"  Easy: {stats.get('Easy', 0)} questions")
            print(f"  Medium: {stats.get('Medium', 0)} questions")
            print(f"  Hard: {stats.get('Hard', 0)} questions")
            print()

    def _select_initial_questions(self):
        """Select 4 easy questions from each topic (20 total)"""
        selected = []
        for topic in range(1, 6):
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == "Easy"][:4]
            selected.extend(topic_questions)
        return selected

    def _select_adaptive_questions(self):
        """Select questions based on ML model predictions"""
        selected = []
        for topic in range(1, 6):
            # Get topic performance features
            avg_time, accuracy = self._calculate_topic_features(f"Topic {topic}")
            predicted_difficulty = self.model.predict([[avg_time, accuracy]])[0]

            # Select 4 questions of predicted difficulty
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == predicted_difficulty][:4]
            selected.extend(topic_questions)
        return selected

    def _simulate_test_taking(self, test_questions):
        """Simulate user answering questions with random performance"""
        results = []
        for question in test_questions:
            # Simulate user response (80% chance correct for demonstration)
            is_correct = random.random() < 0.8  # Adjust this for different outcomes
            time_spent = round(random.uniform(10, 60), 2)  # 10-60 seconds

            result = {
                "question": question,
                "correct": is_correct,
                "time_spent": time_spent,
                "attempts": 1  # Simulating single attempt
            }
            results.append(result)

            # Track performance per topic
            self.user_performance[question["topic"]].append(result)

        return results

    def _analyze_performance(self, results):
        """Analyze and display test results"""
        correct = sum(1 for r in results if r["correct"])
        print(f"\nTest {self.current_test} Results:")
        print(f"Total Correct: {correct}/{len(results)}")

        # Display topic-wise performance
        topic_perf = defaultdict(list)
        for res in results:
            topic_perf[res["question"]["topic"]].append(res["correct"])

        print("\nTopic Performance:")
        for topic, scores in topic_perf.items():
            correct = sum(scores)
            print(f"{topic}: {correct}/{len(scores)} correct")

    def _train_model(self):
        """Train ML model on Test 1 data"""
        X, y = [], []
        for topic in range(1, 6):
            avg_time, accuracy = self._calculate_topic_features(f"Topic {topic}")
            X.append([avg_time, accuracy])
            # Assign difficulty for the next test (e.g., Easy if low accuracy)
            y.append("Easy" if accuracy < 0.6 else "Medium")

        self.model.fit(X, y)

    def _calculate_topic_features(self, topic):
        """Calculate average time spent and accuracy for a topic"""
        results = self.user_performance[topic]
        if not results:
            return 0, 0
        avg_time = sum(r["time_spent"] for r in results) / len(results)
        accuracy = sum(r["correct"] for r in results) / len(results)
        return avg_time, accuracy

# Simulation Usage
if __name__ == "__main__":
    cat = CATSystem()

    # Take Test 1
    test1_results = cat.run_test()

    # Take Test 2
    test2_results = cat.run_test()



=== Starting Test 1 ===

Test Structure:
Topic 1:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 2:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 3:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 4:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 5:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions


Test 1 Results:
Total Correct: 15/20

Topic Performance:
Topic 1: 4/4 correct
Topic 2: 3/4 correct
Topic 3: 4/4 correct
Topic 4: 1/4 correct
Topic 5: 3/4 correct

=== Starting Test 2 ===

Test Structure:
Topic 1:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 2:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 3:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 4:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 5:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions


Test 2 Results:
Total Correct: 1

In [None]:
import random
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Simulated Question Bank (60 questions)
questions = [
    {
        "id": i,
        "topic": f"Topic {topic}",
        "difficulty": difficulty,
        "text": f"{difficulty} question {i//12 + 1} for Topic {topic}"
    }
    for topic in range(1, 6)
    for i, difficulty in enumerate(
        ["Easy"]*4 + ["Medium"]*4 + ["Hard"]*4,
        start=(topic-1)*12
    )
]

class CATSystem:
    def __init__(self):
        self.user_performance = defaultdict(list)
        self.current_test = 1
        self.model = DecisionTreeClassifier()  # Simple ML model

    def run_test(self):
        if self.current_test == 1:
            test_questions = self._select_initial_questions()
        else:
            test_questions = self._select_adaptive_questions()

        print(f"\n=== Starting Test {self.current_test} ===")
        self._display_test_structure(test_questions)
        results = self._simulate_test_taking(test_questions)
        self._analyze_performance(results)

        # Train ML model after Test 1
        if self.current_test == 1:
            self._train_model()

        self.current_test += 1
        return results

    def _display_test_structure(self, test_questions):
        """Show difficulty distribution per topic"""
        topic_stats = defaultdict(lambda: defaultdict(int))
        for q in test_questions:
            topic_stats[q['topic']][q['difficulty']] += 1

        print("\nTest Structure:")
        for topic in [f"Topic {i}" for i in range(1, 6)]:
            stats = topic_stats[topic]
            print(f"{topic}:")
            print(f"  Easy: {stats.get('Easy', 0)} questions")
            print(f"  Medium: {stats.get('Medium', 0)} questions")
            print(f"  Hard: {stats.get('Hard', 0)} questions")
            print()

    def _select_initial_questions(self):
        """Select 4 easy questions from each topic (20 total)"""
        selected = []
        for topic in range(1, 6):
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == "Easy"][:4]
            selected.extend(topic_questions)
        return selected

    def _select_adaptive_questions(self):
        """Select questions based on ML model predictions"""
        selected = []
        for topic in range(1, 6):
            # Get topic performance features
            avg_time, accuracy, avg_attempts = self._calculate_topic_features(f"Topic {topic}")
            predicted_difficulty = self.model.predict([[avg_time, accuracy, avg_attempts]])[0]

            # Select 4 questions of predicted difficulty
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == predicted_difficulty][:4]
            selected.extend(topic_questions)
        return selected

    def _simulate_test_taking(self, test_questions):
        """Simulate user answering questions with random performance"""
        results = []
        for question in test_questions:
            # Simulate user response (80% chance correct for demonstration)
            is_correct = random.random() < 0.8  # Adjust this for different outcomes
            time_spent = round(random.uniform(10, 60), 2)  # 10-60 seconds
            attempts = random.randint(1, 3)  # Simulating up to 3 attempts

            result = {
                "question": question,
                "correct": is_correct,
                "time_spent": time_spent,
                "attempts": attempts
            }
            results.append(result)

            # Track performance per topic
            self.user_performance[question["topic"]].append(result)

        return results

    def _analyze_performance(self, results):
        """Analyze and display test results"""
        correct = sum(1 for r in results if r["correct"])
        print(f"\nTest {self.current_test} Results:")
        print(f"Total Correct: {correct}/{len(results)}")

        # Display topic-wise performance
        topic_perf = defaultdict(list)
        for res in results:
            topic_perf[res["question"]["topic"]].append(res["correct"])

        print("\nTopic Performance:")
        for topic, scores in topic_perf.items():
            correct = sum(scores)
            print(f"{topic}: {correct}/{len(scores)} correct")

    def _train_model(self):
        """Train ML model on Test 1 data"""
        X, y = [], []
        for topic in range(1, 6):
            avg_time, accuracy, avg_attempts = self._calculate_topic_features(f"Topic {topic}")
            X.append([avg_time, accuracy, avg_attempts])
            # Assign difficulty for the next test (e.g., Easy if low accuracy)
            y.append("Easy" if accuracy < 0.6 else "Medium")

        self.model.fit(X, y)

    def _calculate_topic_features(self, topic):
        """Calculate average time spent, accuracy, and attempts for a topic"""
        results = self.user_performance[topic]
        if not results:
            return 0, 0, 0
        avg_time = sum(r["time_spent"] for r in results) / len(results)
        accuracy = sum(r["correct"] for r in results) / len(results)
        avg_attempts = sum(r["attempts"] for r in results) / len(results)
        return avg_time, accuracy, avg_attempts

# Simulation Usage
if __name__ == "__main__":
    cat = CATSystem()

    # Take Test 1
    test1_results = cat.run_test()

    # Take Test 2
    test2_results = cat.run_test()



=== Starting Test 1 ===

Test Structure:
Topic 1:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 2:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 3:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 4:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 5:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions


Test 1 Results:
Total Correct: 16/20

Topic Performance:
Topic 1: 3/4 correct
Topic 2: 4/4 correct
Topic 3: 2/4 correct
Topic 4: 3/4 correct
Topic 5: 4/4 correct

=== Starting Test 2 ===

Test Structure:
Topic 1:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 2:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 3:
  Easy: 4 questions
  Medium: 0 questions
  Hard: 0 questions

Topic 4:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions

Topic 5:
  Easy: 0 questions
  Medium: 4 questions
  Hard: 0 questions


Test 2 Results:
Total Correct: 1

In [None]:
import random
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Simulated Question Bank (60 questions)
questions = [
    {
        "id": i,
        "topic": f"Topic {topic}",
        "difficulty": difficulty,
        "difficulty_score": {"Easy": 0, "Medium": 1, "Hard": 2}[difficulty],
        "text": f"{difficulty} question {i//12 + 1} for Topic {topic}"
    }
    for topic in range(1, 6)
    for i, difficulty in enumerate(
        ["Easy"]*4 + ["Medium"]*4 + ["Hard"]*4,
        start=(topic-1)*12
    )
]

class CATSystem:
    def __init__(self):
        self.user_performance = defaultdict(list)
        self.current_test = 1
        self.models = {}  # ML model per topic
        self.scalers = {} # Scaler per topic

        # Initialize synthetic training data for demonstration
        self._initialize_ml_models()

    def _initialize_ml_models(self):
        """Create pre-trained models with synthetic data"""
        for topic in range(1, 6):
            # Synthetic training data: [avg_time, accuracy] -> can_handle_higher_difficulty
            X = np.array([
                [20, 1.0],  # Fast + perfect
                [50, 0.8],  # Medium speed + good
                [60, 0.5],  # Slow + mediocre
                [45, 0.6],
                [30, 0.9]
            ])
            y = np.array([1, 1, 0, 0, 1])  # 1 = can handle harder questions

            self.scalers[topic] = StandardScaler().fit(X)
            X_scaled = self.scalers[topic].transform(X)

            self.models[topic] = LogisticRegression()
            self.models[topic].fit(X_scaled, y)

    def _calculate_topic_features(self, topic):
        """Extract features for ML model: [avg_time, accuracy]"""
        topic_results = self.user_performance.get(topic, [])
        if not topic_results:
            return None

        total_time = sum(r['time_spent'] for r in topic_results)
        avg_time = total_time / len(topic_results)
        accuracy = sum(r['correct'] for r in topic_results) / len(topic_results)

        return np.array([[avg_time, accuracy]])

    def _display_test_structure(self, test_questions):
        """Show difficulty distribution per topic"""
        difficulty_counts = defaultdict(int)
        for q in test_questions:
          difficulty_counts[q['difficulty']] += 1

        print("\nTest Structure:")
        for difficulty, count in difficulty_counts.items():
          print(f"{difficulty}: {count} questions")

    def _predict_difficulty(self, topic, features):
        """Predict if user can handle higher difficulty"""
        scaled_features = self.scalers[topic].transform(features)
        prob = self.models[topic].predict_proba(scaled_features)[0][1]
        return prob > 0.65  # Threshold for difficulty increase

    def run_test(self):
        test_questions = self._select_questions()

        print(f"\n=== Starting Test {self.current_test} ===")
        self._display_test_structure(test_questions)
        results = self._simulate_test_taking(test_questions)
        self._analyze_performance(results)

        self.current_test += 1
        return results

    def _analyze_performance(self, results):
        correct_counts = defaultdict(int)
        difficulty_counts = defaultdict(int)

        for res in results:
            difficulty_counts[res["question"]["difficulty"]] += 1
            if res["correct"]:
                correct_counts[res["question"]["difficulty"]] += 1

        print("\nPerformance by Difficulty Level:")
        for difficulty in ["Easy", "Medium", "Hard"]:
            total = difficulty_counts[difficulty]
            correct = correct_counts[difficulty]
            print(f"{difficulty}: {correct}/{total} correct")

        total_correct = sum(correct_counts.values())
        print(f"\nOverall: {total_correct}/{len(results)} correct")

    def _select_questions(self):
        """Select questions based on current test number and ML predictions"""
        if self.current_test == 1:
            return self._select_initial_questions()

        selected = []
        for topic in range(1, 6):
            # Get ML features for the topic
            features = self._calculate_topic_features(f"Topic {topic}")
            if features is None:
                difficulty = "Easy"
            else:
                # Predict if user can handle higher difficulty
                topic_num = topic
                should_upgrade = self._predict_difficulty(topic_num, features)
                difficulty = "Medium" if should_upgrade else "Easy"

            # Select questions
            topic_questions = [q for q in questions
                             if q["topic"] == f"Topic {topic}"
                             and q["difficulty"] == difficulty][:4]
            selected.extend(topic_questions)
        return selected

    def _select_initial_questions(self):
        """Select 4 easy questions from each topic (20 total)"""
        return [q for q in questions if q["difficulty"] == "Easy"][:20]

    # ... (keep other methods like _display_test_structure,
    # _simulate_test_taking, _analyze_performance from previous version)

    def _simulate_test_taking(self, test_questions):
        """Simulate user answering questions with time-based performance"""
        results = []
        for question in test_questions:
            # Base correct probability based on difficulty
            base_prob = {
                "Easy": 0.85,
                "Medium": 0.65,
                "Hard": 0.4
            }[question["difficulty"]]

            # Time effect: faster answers more likely to be correct
            time_spent = random.uniform(15, 60)
            time_factor = np.interp(time_spent, [15, 60], [1.2, 0.8])
            is_correct = random.random() < base_prob * time_factor

            result = {
                "question": question,
                "correct": is_correct,
                "time_spent": round(time_spent, 2),
                "attempts": 1
            }
            results.append(result)
            self.user_performance[question["topic"]].append(result)

        return results

# Simulation Usage
if __name__ == "__main__":
    cat = CATSystem()

    # Take Test 1
    test1_results = cat.run_test()

    # Take Test 2 with ML-based recommendations
    test2_results = cat.run_test()


=== Starting Test 1 ===

Test Structure:
Easy: 20 questions

Performance by Difficulty Level:
Easy: 17/20 correct
Medium: 0/0 correct
Hard: 0/0 correct

Overall: 17/20 correct

=== Starting Test 2 ===

Test Structure:
Easy: 8 questions
Medium: 12 questions

Performance by Difficulty Level:
Easy: 6/8 correct
Medium: 10/12 correct
Hard: 0/0 correct

Overall: 16/20 correct


**Dataset Generation**

In [1]:
import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(42)
n_samples = 1000  # Number of data points

data = {
    "question_id": np.arange(1, n_samples + 1),
    "time_spent": np.random.normal(30, 10, n_samples).clip(5, 60),  # Time in seconds
    "difficulty": np.random.choice(["Easy", "Medium", "Hard"], n_samples, p=[0.3, 0.5, 0.2]),
    "attempts": np.random.randint(1, 5, n_samples),
    "avg_time_spent": np.random.normal(30, 5, n_samples).clip(10, 50),
    "is_correct": np.random.choice([0, 1], n_samples, p=[0.4, 0.6]),  # 60% correct answers
}

df = pd.DataFrame(data)
df.to_csv("cat_dataset.csv", index=False)

In [2]:
data = pd.read_csv("cat_dataset.csv")
data.head()

Unnamed: 0,question_id,time_spent,difficulty,attempts,avg_time_spent,is_correct
0,1,34.967142,Easy,2,30.948531,1
1,2,28.617357,Easy,4,26.690089,1
2,3,36.476885,Medium,3,32.129436,0
3,4,45.230299,Medium,4,30.095739,1
4,5,27.658466,Easy,4,26.792565,1


*Preprocessing*

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("cat_dataset.csv")

# Encode difficulty labels
le = LabelEncoder()
df["difficulty"] = le.fit_transform(df["difficulty"])  # Easy=0, Medium=1, Hard=2

# Features and target
X = df[["time_spent", "attempts", "avg_time_spent", "is_correct"]]
y = df["difficulty"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (optional, based on model)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy: 0.415
              precision    recall  f1-score   support

        Easy       0.38      0.19      0.26        62
        Hard       0.24      0.20      0.22        40
      Medium       0.47      0.64      0.54        98

    accuracy                           0.41       200
   macro avg       0.36      0.35      0.34       200
weighted avg       0.39      0.41      0.39       200



**Dataset Gen**

In [5]:
import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(42)
n_users = 500  # Number of users
topics = [1, 2, 3, 4, 5]
difficulties = ["Easy", "Medium", "Hard"]

data = []
for user in range(n_users):
    for topic in topics:
        # Randomly assign current difficulty
        current_diff = np.random.choice(difficulties, p=[0.3, 0.4, 0.3])

        # Simulate performance metrics
        avg_time = np.random.normal(loc=30 if current_diff == "Easy" else 45 if current_diff == "Medium" else 60, scale=10)
        avg_attempts = np.random.randint(1, 4) if current_diff == "Easy" else np.random.randint(2, 5)
        success_rate = np.clip(np.random.normal(loc=0.7 if current_diff == "Easy" else 0.5 if current_diff == "Medium" else 0.3, scale=0.15), 0, 1)

        # Define next difficulty based on rules (simulate ground truth)
        if success_rate > 0.75:
            next_diff = "Hard" if current_diff == "Medium" else "Medium" if current_diff == "Easy" else "Hard"
        elif success_rate < 0.4:
            next_diff = "Easy" if current_diff == "Medium" else "Medium" if current_diff == "Hard" else "Easy"
        else:
            next_diff = current_diff

        data.append([
            user, topic, current_diff,
            np.abs(avg_time), avg_attempts, success_rate, next_diff
        ])

df = pd.DataFrame(data, columns=["user_id", "topic", "current_difficulty",
                                  "avg_time_spent", "avg_attempts", "success_rate",
                                  "next_difficulty"])
df.to_csv("adaptive_test_dataset.csv", index=False)

In [6]:
df

Unnamed: 0,user_id,topic,current_difficulty,avg_time_spent,avg_attempts,success_rate,next_difficulty
0,0,1,Medium,33.881199,4,0.547835,Medium
1,0,2,Easy,9.890371,3,0.626079,Easy
2,0,3,Easy,24.191219,1,0.621225,Easy
3,0,4,Medium,45.222218,4,0.435831,Medium
4,0,5,Easy,20.919759,2,0.488154,Easy
...,...,...,...,...,...,...,...
2495,499,1,Easy,24.678561,2,0.641715,Easy
2496,499,2,Medium,61.765979,3,0.509648,Medium
2497,499,3,Easy,40.270459,1,0.605377,Easy
2498,499,4,Medium,63.420001,4,0.516709,Medium


Preprocessing

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("adaptive_test_dataset.csv")

# Encode categorical features
le_diff = LabelEncoder()
le_topic = LabelEncoder()

df["current_difficulty"] = le_diff.fit_transform(df["current_difficulty"])
df["topic"] = le_topic.fit_transform(df["topic"])
df["next_difficulty"] = le_diff.transform(df["next_difficulty"])

# Features and target
X = df[["topic", "current_difficulty", "avg_time_spent", "avg_attempts", "success_rate"]]
y = df["next_difficulty"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

train

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, target_names=le_diff.classes_))

Accuracy: 1.00
              precision    recall  f1-score   support

        Easy       1.00      1.00      1.00       138
        Hard       1.00      1.00      1.00        54
      Medium       1.00      1.00      1.00       308

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



Difficulty Suggestion

In [10]:
import joblib

# Save artifacts
joblib.dump(model, "difficulty_predictor.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le_diff, "label_encoder.pkl")

# Load artifacts
model = joblib.load("difficulty_predictor.pkl")
scaler = joblib.load("scaler.pkl")
le_diff = joblib.load("label_encoder.pkl")

def suggest_difficulty(topic, current_diff, avg_time, avg_attempts, success_rate):
    # Encode inputs
    topic_encoded = le_topic.transform([topic])[0]
    current_diff_encoded = le_diff.transform([current_diff])[0]

    # Prepare features
    features = np.array([[topic_encoded, current_diff_encoded, avg_time, avg_attempts, success_rate]])
    features_scaled = scaler.transform(features)

    # Predict
    pred = model.predict(features_scaled)
    return le_diff.inverse_transform(pred)[0]

# Example usage
print(suggest_difficulty(
    topic=2,
    current_diff="Easy",
    avg_time=45,
    avg_attempts=3,
    success_rate=0.8
))  # Output: "Hard"

Medium


