#Load & inspect

In [None]:
# Extract dataset.zip into the current working directory (shell command run
# through IPython's `!` escape). Presumably this creates the MATH/ folder read
# by the following cells - confirm against the archive layout.
!unzip dataset.zip

In [None]:
import os
# Dataset root folder.
data_dir = "MATH"

# Snapshot the top-level entries once, then report their count and a preview.
files = os.listdir(data_dir)
print(f"Number of files: {len(files)}")
print(f"First 5 files: {files[:5]}")

In [None]:
# Folder holding the algebra problems of the training split.
topic_dir = "MATH/train/algebra"

# os.listdir() returns entries in arbitrary order; sorting keeps the preview
# below and any later `algebra_files[0]` lookup reproducible between runs.
algebra_files = sorted(os.listdir(topic_dir))

print("Number of algebra questions:", len(algebra_files))
print("First 3 files:", algebra_files[:3])


### Inside a JSON file

In [None]:
import json

# Pick one algebra problem file to inspect its structure.
sample_file = os.path.join(topic_dir, algebra_files[0])

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(sample_file, "r", encoding="utf-8") as f:
    sample_data = json.load(f)

# Last expression of the cell: the notebook displays the parsed record.
sample_data


### Check whether all subjects have the same configuration (e.g. problem, level, type, etc.)


In [None]:
# Compare the JSON keys across every subject folder of the training split
# (not just the first three), so a schema difference in any subject shows up.
train_root = "MATH/train"
topics = sorted(os.listdir(train_root))

for topic in topics:
    topic_path = os.path.join(train_root, topic)

    # Skip stray files that are not subject folders.
    if not os.path.isdir(topic_path):
        continue

    topic_files = sorted(os.listdir(topic_path))
    if not topic_files:
        # An empty folder has no representative file to inspect.
        print(topic, "(no files)")
        continue

    # One representative file per subject is enough to reveal its keys.
    with open(os.path.join(topic_path, topic_files[0]), encoding="utf-8") as f:
        print(topic, json.load(f).keys())


###Loading full training data

In [None]:
import os
import json

def clean_text(text):
    """Normalize a piece of text.

    Lower-cases the text and collapses every run of whitespace (newlines
    included) into a single space, trimming both ends.
    """
    # str.split() with no separator already treats "\n" as whitespace, so one
    # split/join pass covers both newline removal and space squeezing.
    tokens = text.lower().split()
    return " ".join(tokens)


texts = []
labels = []

train_dir = "MATH/train"

# os.listdir() gives no ordering guarantee. Sorting both levels keeps the
# sample order stable across runs and machines, which the seeded
# train/test split performed later needs in order to be reproducible.
for topic in sorted(os.listdir(train_dir)):
    topic_path = os.path.join(train_dir, topic)

    # Only sub-folders are topics; ignore loose files at this level.
    if not os.path.isdir(topic_path):
        continue

    for file in sorted(os.listdir(topic_path)):
        file_path = os.path.join(topic_path, file)

        # A nested directory cannot be opened as a problem record.
        if not os.path.isfile(file_path):
            continue

        # Decode explicitly as UTF-8 rather than the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The folder name is the class label; records without a problem
        # statement are skipped.
        if "problem" in data:
            texts.append(clean_text(data["problem"]))
            labels.append(topic)


####verification

In [None]:
from collections import Counter

# Texts and labels are appended in lockstep, so the two counts must match.
print(len(texts), len(labels))

# Per-topic sample counts. Only the last bare expression of a cell is
# displayed, so this has to be printed explicitly to be visible.
print(Counter(labels))

# First loaded sample and its topic.
print(texts[0])
print(labels[0])

# Distinct topic names (shown as the cell's result).
set(labels)

In [None]:
import numpy as np

# Word count of each question (whitespace-separated tokens).
lengths = list(map(len, map(str.split, texts)))

# Corpus size plus mean and maximum question length, in words.
print("Total samples:", len(texts))
print("avg q length:", np.mean(lengths))
print("Max length:", max(lengths))


#preprocessing

In [None]:
# Normalize every question with clean_text.
clean_texts = [clean_text(t) for t in texts]

# Drop samples whose text is empty after cleaning: they carry no tokens for
# the vectorizer. Text and label are filtered as a pair so they stay aligned.
kept_pairs = [(text, label) for text, label in zip(clean_texts, labels) if text]
filtered_texts = [text for text, _ in kept_pairs]
filtered_labels = [label for _, label in kept_pairs]

print("After filtering:", len(filtered_texts))

In [None]:
# Show the distinct topic labels remaining after preprocessing.
set(filtered_labels)

#vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF feature extractor for the cleaned question texts.
vectorizer = TfidfVectorizer(
    # A token is a run of two or more ASCII letters between word boundaries.
    token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b',
    # No case folding inside the vectorizer.
    lowercase=False,
    # Drop common English function words (the, is, and, ...).
    stop_words="english",
    # Use single words and adjacent word pairs as features.
    ngram_range=(1, 2),
    # Cap the vocabulary at 5000 terms.
    max_features=5000,
)

###fit and transform the text

In [None]:
# Fit the vectorizer on the corpus and vectorize that same corpus in a single call.
X = vectorizer.fit_transform(filtered_texts)
print(f"TF-IDF shape: {X.shape}")

In [None]:
# Check: extract the highest-weighted TF-IDF terms of the first document.
import numpy as np

# Dense TF-IDF weights of the first document (one entry per vocabulary term).
sample_vector = X[0].toarray()[0]

# Rank terms from highest to lowest weight and keep at most ten. Zero-weight
# entries are discarded: a short question may have fewer than ten distinct
# terms, and padding the list with absent vocabulary words would be misleading.
ranked = np.argsort(sample_vector)[::-1][:10]
top_indices = ranked[sample_vector[ranked] > 0]

top_words = vectorizer.get_feature_names_out()[top_indices]
top_words


#LabelEncoding


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit the encoder on the topic names and encode them as integer class ids.
# The fitted encoder is kept so that predicted ids can be mapped back to
# topic names with inverse_transform().
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(filtered_labels)

In [None]:
# Label mapping: encoded class id -> topic name.
# The arrow had been mangled into mojibake by a UTF-8/cp1252 mix-up.
for i, label in enumerate(label_encoder.classes_):
    print(i, "→", label)


#Train/Test/Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw texts, not the already-vectorized matrix: fitting TF-IDF on
# the full corpus lets test-set vocabulary and document frequencies leak into
# the training features and inflates the measured accuracy.
train_texts, test_texts, y_train, y_test = train_test_split(
    filtered_texts, y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the topic proportions the same in both partitions
)

# Learn vocabulary and IDF weights from the training portion only, then apply
# that fitted transform unchanged to the held-out portion.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Re-project the full corpus so `X` stays column-aligned with the refitted
# vocabulary for any cell that inspects it afterwards.
X = vectorizer.transform(filtered_texts)


##training on logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression topic classifier trained on the TF-IDF features.
# max_iter raises the solver's iteration cap to 1000; n_jobs=-1 requests all
# available CPU cores.
model = LogisticRegression(n_jobs=-1, max_iter=1000)

# Left as the cell's last expression so the notebook shows the fitted estimator.
model.fit(X_train, y_train)

#evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the encoded topic id of every held-out question.
y_pred = model.predict(X_test)

print("accuracy:", accuracy_score(y_test, y_pred))

# Overall accuracy hides weak classes; report precision/recall/F1 per topic,
# labelled with the original topic names instead of the encoded ids.
# `labels` is passed explicitly so it always lines up with `target_names`.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    )
)

####manual Evaluation

In [None]:
# Sanity check on a hand-written question: clean it with the same helper as
# the training data, vectorize it, and map the predicted id back to a name.
sample_question = "Find the derivative of x^2 + 3x + 1"
sample_vec = vectorizer.transform([clean_text(sample_question)])
pred = model.predict(sample_vec)

print(f"Predicted topic: {label_encoder.inverse_transform(pred)[0]}")

# LLM solution generation


In [None]:
# Install the packages used by the LLM cells below (shell command via
# IPython's `!` escape); -q keeps pip's output quiet.
!pip install -q transformers accelerate bitsandbytes torch

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:
from transformers import BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Passing `load_in_4bit=True` straight to from_pretrained() is deprecated in
# transformers; the supported route is an explicit quantization config.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit weights so the 7B model fits on Colab
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
)

# NOTE: this rebinds `model`, which until now referred to the fitted
# LogisticRegression topic classifier; re-run the classifier cells before
# calling model.predict() again.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",        # automatic device placement (GPU when available)
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)


In [None]:
def build_prompt(question):
    """Wrap *question* in the teacher-style instruction prompt.

    The returned text starts with a newline, states the teacher role and the
    step-by-step instruction, embeds the problem under "Problem:", and ends
    with an open "Solution:" line.
    """
    header = (
        "\n"
        "You are a high school mathematics teacher.\n"
        "Explain the following problem step by step in a clear and student-friendly way.\n"
    )
    problem_section = f"\nProblem:\n{question}\n"
    answer_cue = "\nSolution:\n"
    return header + problem_section + answer_cue


In [None]:
def generate_solution_local(question):
    """Generate a step-by-step explanation of *question* with the local LLM.

    Uses the module-level ``tokenizer`` and ``model`` and the prompt produced
    by ``build_prompt``.

    Args:
        question: Problem statement to explain.

    Returns:
        The decoded continuation produced by the model, without the prompt
        text and with surrounding whitespace stripped.
    """
    prompt = build_prompt(question)

    # Tokenize and move the tensors to the device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.3,
        do_sample=True
    )

    # A causal LM returns the prompt tokens followed by the new tokens; slice
    # the prompt off so callers receive the explanation only instead of an
    # echo of the instructions.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]

    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
# Smoke-test the generator on the first loaded training question.
question = texts[0]

print("Question:", question, sep="\n")

# The header is printed before generation starts, as a separate call.
print("\nLLM Generated Explanation:")
print(generate_solution_local(question))
