In [31]:
!pip install sentence_transformers

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
from google.colab import files

print("📌 Loading dataset...")

# Load the tab-separated dataset. engine="python" selects pandas' slower but
# more feature-complete parser instead of the default C parser.
df = pd.read_csv("ai_dataset.tsv", sep="\t", engine="python")

# Clean correctness column: values that are not numeric become NaN, those rows
# are dropped, and the remaining labels are cast to int.
df["correctness"] = pd.to_numeric(df["correctness"], errors="coerce")
# .copy() makes the filtered frame independent of the original, so the column
# assignment on the next line is not a write into a possible view/copy.
df = df.dropna(subset=["correctness"]).copy()
df["correctness"] = df["correctness"].astype(int)

print("Dataset loaded:", df.shape)

# Load the sentence-embedding model used to compare answers.
print("📌 Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute similarity
print("📌 Computing similarities...")
def safe_sim(a, b):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level ``model`` and compared with
    ``util.cos_sim``. This is a best-effort helper: bad data never raises,
    it yields ``0.0`` instead.

    Args:
        a: First text to compare.
        b: Second text to compare.

    Returns:
        The cosine similarity as a Python float, or ``0.0`` when either
        input is not a string (e.g. a NaN from an empty cell) or when
        encoding/comparison fails.
    """
    import warnings  # local import keeps the helper self-contained

    # Non-text values cannot be embedded; skip the model call for them.
    if not isinstance(a, str) or not isinstance(b, str):
        return 0.0

    try:
        emb_a = model.encode(a, convert_to_tensor=True)
        emb_b = model.encode(b, convert_to_tensor=True)
        return util.cos_sim(emb_a, emb_b).item()
    except Exception as exc:
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through, so a long-running loop can still be stopped. The warning
        # makes unexpected failures visible instead of silently scoring 0.0.
        warnings.warn(f"safe_sim fell back to 0.0: {exc!r}", RuntimeWarning)
        return 0.0

# Score every row by comparing its AI answer with the expected answer.
# Iterating the two columns directly avoids building a Series per row (which
# row-wise DataFrame.apply does) while producing the same values in row order.
df["similarity"] = [
    safe_sim(ai_answer, expected_answer)
    for ai_answer, expected_answer in zip(df["ai_answer"], df["expected_answer"])
]

# Train model
print("📌 Training classifier...")
# The similarity score is the only feature; double brackets keep X 2-D.
X = df[["similarity"]].values
y = df["correctness"].values

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = LogisticRegression()
clf.fit(X_train, y_train)

# Accuracy is measured on the held-out rows only.
acc = accuracy_score(y_test, clf.predict(X_test))
print("🎉 Model trained. Accuracy:", acc)

# Save classifier only (fast download). The embedding model is not pickled,
# so whoever loads classifier.pkl must compute the similarity feature with the
# same SentenceTransformer before calling predict.
with open("classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# Colab helper: sends the saved file to the user's browser as a download.
files.download("classifier.pkl")

print("🎉 DONE! Classifier downloaded. You can submit the project now.")


📌 Loading dataset...
Dataset loaded: (49, 5)
📌 Loading embedding model...
📌 Computing similarities...
📌 Training classifier...
🎉 Model trained. Accuracy: 0.8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎉 DONE! Classifier downloaded. You can submit the project now.
